├── 1.REINFORCE ├── README.md ├── REINFORCE.py ├── REINFORCE_baseline.py ├── data_train │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy │ ├── REINFORCE_env_CartPole-v0_number_2_seed_0.npy │ ├── REINFORCE_env_CartPole-v0_number_2_seed_10.npy │ ├── REINFORCE_env_CartPole-v0_number_2_seed_100.npy │ ├── REINFORCE_env_CartPole-v1_number_2_seed_0.npy │ ├── REINFORCE_env_CartPole-v1_number_2_seed_10.npy │ └── REINFORCE_env_CartPole-v1_number_2_seed_100.npy ├── runs │ └── REINFORCE │ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_0 │ │ └── events.out.tfevents.1648121668.李智.23156.0 │ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_10 │ │ └── events.out.tfevents.1648121786.李智.23156.1 │ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_100 │ │ └── events.out.tfevents.1648121899.李智.23156.2 │ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_0 │ │ └── events.out.tfevents.1648121670.李智.15096.0 │ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_10 │ │ └── events.out.tfevents.1648121797.李智.15096.1 │ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_100 │ │ └── events.out.tfevents.1648121918.李智.15096.2 │ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_0 │ │ └── events.out.tfevents.1648121512.李智.32424.0 │ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_10 │ │ └── events.out.tfevents.1648121583.李智.32424.1 │ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_100 │ │ └── events.out.tfevents.1648121655.李智.32424.2 │ │ ├── REINFORCE_env_CartPole-v1_number_2_seed_0 │ │ └── events.out.tfevents.1648121526.李智.11256.0 │ │ ├── REINFORCE_env_CartPole-v1_number_2_seed_10 │ │ └── events.out.tfevents.1648121607.李智.11256.1 │ │ └── REINFORCE_env_CartPole-v1_number_2_seed_100 │ │ └── events.out.tfevents.1648121688.李智.11256.2 └── training results.png ├── 2.Actor-Critic ├── A2C.py ├── A2C_results.png ├── README.md ├── data_train │ ├── A2C_env_CartPole-v0_number_9_seed_0.npy │ ├── A2C_env_CartPole-v0_number_9_seed_10.npy │ ├── A2C_env_CartPole-v0_number_9_seed_100.npy │ ├── A2C_env_CartPole-v1_number_9_seed_0.npy │ ├── A2C_env_CartPole-v1_number_9_seed_10.npy │ └── A2C_env_CartPole-v1_number_9_seed_100.npy └── runs │ └── A2C │ ├── A2C_env_CartPole-v0_number_9_seed_0 │ └── events.out.tfevents.1648553119.李智.62564.0 │ ├── A2C_env_CartPole-v0_number_9_seed_10 │ └── events.out.tfevents.1648553543.李智.62564.1 │ ├── A2C_env_CartPole-v0_number_9_seed_100 │ └── events.out.tfevents.1648554019.李智.62564.2 │ ├── A2C_env_CartPole-v1_number_9_seed_0 │ └── events.out.tfevents.1648553122.李智.63460.0 │ ├── A2C_env_CartPole-v1_number_9_seed_10 │ └── events.out.tfevents.1648553561.李智.63460.1 │ └── A2C_env_CartPole-v1_number_9_seed_100 │ └── events.out.tfevents.1648554055.李智.63460.2 ├── 3.Rainbow_DQN ├── README.md ├── Rainbow_DQN_main.py ├── __pycache__ │ ├── network.cpython-37.pyc │ ├── rainbow_dqn.cpython-37.pyc │ ├── replay_buffer.cpython-37.pyc │ └── sum_tree.cpython-37.pyc ├── data_train │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy │ ├── 
DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy │ └── Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy ├── drawing_Rainbow_DQN.py ├── network.py ├── rainbow_dqn.py ├── rainbow_dqn_result.png ├── replay_buffer.py ├── runs │ └── DQN │ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0 │ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1 │ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2 │ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0 │ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1 │ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2 │ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0 │ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1 │ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2 │ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0 │ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1 │ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2 │ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0 │ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1 │ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2 │ │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0 │ │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0 │ │ └── Rainbow_DQN_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0 └── sum_tree.py ├── 
4.PPO-discrete ├── PPO_discrete_main.py ├── README.md ├── __pycache__ │ ├── normalization.cpython-37.pyc │ ├── ppo_discrete.cpython-37.pyc │ └── replaybuffer.cpython-37.pyc ├── data_train │ └── readme.txt ├── normalization.py ├── ppo_discrete.py ├── replaybuffer.py ├── runs │ └── readme.txt └── training_result.png ├── 5.PPO-continuous ├── PPO_continuous_main.py ├── README.md ├── __pycache__ │ ├── normalization.cpython-37.pyc │ ├── ppo_continuous.cpython-37.pyc │ └── replaybuffer.cpython-37.pyc ├── data_train │ └── readme.txt ├── normalization.py ├── ppo_continuous.py ├── replaybuffer.py ├── runs │ └── readme.txt └── training_result.png ├── 6.DDPG └── DDPG.py ├── 7.TD3 ├── README.md ├── TD3.py ├── TD3_result.png ├── data_train │ ├── TD3_env_BipedalWalker-v3_number_1_seed_0.npy │ ├── TD3_env_BipedalWalker-v3_number_1_seed_10.npy │ ├── TD3_env_BipedalWalker-v3_number_1_seed_100.npy │ ├── TD3_env_HalfCheetah-v2_number_1_seed_0.npy │ ├── TD3_env_HalfCheetah-v2_number_1_seed_10.npy │ ├── TD3_env_HalfCheetah-v2_number_1_seed_100.npy │ ├── TD3_env_Hopper-v2_number_1_seed_0.npy │ ├── TD3_env_Hopper-v2_number_1_seed_10.npy │ ├── TD3_env_Hopper-v2_number_1_seed_100.npy │ ├── TD3_env_Pendulum-v1_number_1_seed_0.npy │ ├── TD3_env_Pendulum-v1_number_1_seed_10.npy │ ├── TD3_env_Pendulum-v1_number_1_seed_100.npy │ ├── TD3_env_Walker2d-v2_number_1_seed_0.npy │ ├── TD3_env_Walker2d-v2_number_1_seed_10.npy │ └── TD3_env_Walker2d-v2_number_1_seed_100.npy └── runs │ └── TD3 │ ├── TD3_env_BipedalWalker-v3_number_1_seed_0 │ └── events.out.tfevents.1648952137.李智.93956.0 │ ├── TD3_env_BipedalWalker-v3_number_1_seed_10 │ └── events.out.tfevents.1648882414.李智.81744.0 │ ├── TD3_env_BipedalWalker-v3_number_1_seed_100 │ └── events.out.tfevents.1648925401.李智.81744.1 │ ├── TD3_env_HalfCheetah-v2_number_1_seed_0 │ └── events.out.tfevents.1648909506.李智.60360.2 │ ├── TD3_env_HalfCheetah-v2_number_1_seed_10 │ └── events.out.tfevents.1648800524.李智.60360.0 │ ├── TD3_env_HalfCheetah-v2_number_1_seed_100 │ └── events.out.tfevents.1648852975.李智.60360.1 │ ├── TD3_env_Hopper-v2_number_1_seed_0 │ └── events.out.tfevents.1649010066.李智.85868.2 │ ├── TD3_env_Hopper-v2_number_1_seed_10 │ └── events.out.tfevents.1648901654.李智.85868.0 │ ├── TD3_env_Hopper-v2_number_1_seed_100 │ └── events.out.tfevents.1648956951.李智.85868.1 │ ├── TD3_env_Pendulum-v1_number_1_seed_0 │ └── events.out.tfevents.1649065960.李智.18392.2 │ ├── TD3_env_Pendulum-v1_number_1_seed_10 │ └── events.out.tfevents.1649057339.李智.18392.0 │ ├── TD3_env_Pendulum-v1_number_1_seed_100 │ └── events.out.tfevents.1649061632.李智.18392.1 │ ├── TD3_env_Walker2d-v2_number_1_seed_0 │ └── events.out.tfevents.1648846023.李智.76672.2 │ ├── TD3_env_Walker2d-v2_number_1_seed_10 │ └── events.out.tfevents.1648735005.李智.76672.0 │ └── TD3_env_Walker2d-v2_number_1_seed_100 │ └── events.out.tfevents.1648793243.李智.76672.1 ├── 8.SAC └── SAC-continuous.py ├── 9.PPO-discrete-RNN ├── PPO+RNN.png ├── PPO_discrete_rnn_main.py ├── README.md ├── __pycache__ │ ├── normalization.cpython-37.pyc │ ├── ppo_discrete_rnn.cpython-37.pyc │ └── replaybuffer.cpython-37.pyc ├── data_train │ ├── PPO_env_CartPole-v1_number_3_seed_0.npy │ ├── PPO_env_CartPole-v1_number_3_seed_10.npy │ ├── PPO_env_CartPole-v1_number_3_seed_100.npy │ ├── PPO_env_CartPole-v1_number_5_seed_0.npy │ ├── PPO_env_CartPole-v1_number_5_seed_10.npy │ ├── PPO_env_CartPole-v1_number_5_seed_100.npy │ ├── PPO_env_LunarLander-v2_number_3_seed_0.npy │ ├── PPO_env_LunarLander-v2_number_3_seed_10.npy │ ├── PPO_env_LunarLander-v2_number_3_seed_100.npy │ ├── 
PPO_env_LunarLander-v2_number_5_seed_0.npy │ ├── PPO_env_LunarLander-v2_number_5_seed_10.npy │ └── PPO_env_LunarLander-v2_number_5_seed_100.npy ├── normalization.py ├── ppo_discrete_rnn.py ├── replaybuffer.py └── runs │ └── PPO_discrete │ ├── env_CartPole-v1_number_3_seed_0 │ └── events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0 │ ├── env_CartPole-v1_number_3_seed_10 │ └── events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1 │ ├── env_CartPole-v1_number_3_seed_100 │ └── events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2 │ ├── env_CartPole-v1_number_5_seed_0 │ └── events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0 │ ├── env_CartPole-v1_number_5_seed_10 │ └── events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1 │ ├── env_CartPole-v1_number_5_seed_100 │ └── events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2 │ ├── env_LunarLander-v2_number_3_seed_0 │ └── events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0 │ ├── env_LunarLander-v2_number_3_seed_10 │ └── events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1 │ ├── env_LunarLander-v2_number_3_seed_100 │ └── events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2 │ ├── env_LunarLander-v2_number_5_seed_0 │ └── events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0 │ ├── env_LunarLander-v2_number_5_seed_10 │ └── events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0 │ └── env_LunarLander-v2_number_5_seed_100 │ └── events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0 ├── LICENSE └── README.md /1.REINFORCE/README.md: -------------------------------------------------------------------------------- 1 | # REINFORCE 2 | This is a concise Pytorch implementation of REINFORCE.
3 | REINFORCE.py is an implementation of REINFORCE without a baseline.
4 | REINFORCE_baseline.py is an implementation of REINFORCE with a baseline.
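For orientation, here is a minimal sketch of the per-step losses the two scripts implement, condensed from their learn() methods reproduced later in this file. The standalone helper function, its signature, and the example values are illustrative only, not part of the repo:

```python
import torch

def reinforce_losses(gamma, t, g, a_prob, a, v_s=None):
    """Per-step REINFORCE losses; g is the Monte-Carlo return from step t."""
    log_prob = torch.log(a_prob[a])
    if v_s is None:                                # REINFORCE.py: plain return
        return -gamma ** t * g * log_prob, None
    # REINFORCE_baseline.py: subtract the learned baseline V(s_t) from the
    # return, and fit the baseline itself by squared-error regression.
    policy_loss = -gamma ** t * (g - v_s).detach() * log_prob
    value_loss = (g - v_s) ** 2
    return policy_loss, value_loss

# Example with dummy numbers: two actions, return g=1.0, baseline estimate 0.4.
probs = torch.tensor([0.7, 0.3])
policy_loss, value_loss = reinforce_losses(
    gamma=0.99, t=0, g=1.0, a_prob=probs, a=0,
    v_s=torch.tensor(0.4, requires_grad=True))
```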
5 | 6 | ## How to use my code? 7 | You can directly run REINFORCE.py and REINFORCE_baseline.py in your own IDE.
8 | 9 | ### Training environments 10 | You can set 'env_index' in the code to change the environment (see the snippet below).
11 | env_index=0 represents 'CartPole-v0'
12 | env_index=1 represents 'CartPole-v1'
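For example, the environment selection at the top of the training script (taken from the `__main__` block of REINFORCE.py reproduced below) looks like this:

```python
import gym

env_name = ['CartPole-v0', 'CartPole-v1']
env_index = 0                                   # 0 -> CartPole-v0, 1 -> CartPole-v1
env = gym.make(env_name[env_index])             # training environment
env_evaluate = gym.make(env_name[env_index])    # separate copy used for evaluation
```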
13 | 14 | ### How to see the training results? 15 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
16 | The reward data are saved as numpy arrays in the 'data_train' folder.
17 | The training curves shown below are smoothed by averaging over a window of 10 steps.
18 | The solid line and the shaded area represent the mean and the standard deviation, respectively, over three random seeds (seed=0, 10, 100).
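For reference, one way to inspect the saved results outside of an IDE (the .npy file name follows the pattern used in data_train/, and TensorBoard is assumed to be installed):

```python
import numpy as np

# Evaluation rewards saved by REINFORCE.py, one entry per evaluation.
rewards = np.load('./data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy')
print(rewards.shape, rewards[-5:])

# The TensorBoard event files can be viewed with:
#   tensorboard --logdir runs/REINFORCE
```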
19 | 20 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/1.REINFORCE/training%20results.png) 21 | -------------------------------------------------------------------------------- /1.REINFORCE/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | class Policy(nn.Module): 10 | def __init__(self, state_dim, action_dim, hidden_width): 11 | super(Policy, self).__init__() 12 | self.l1 = nn.Linear(state_dim, hidden_width) 13 | self.l2 = nn.Linear(hidden_width, action_dim) 14 | 15 | def forward(self, s): 16 | s = F.relu(self.l1(s)) 17 | a_prob = F.softmax(self.l2(s), dim=1) 18 | return a_prob 19 | 20 | 21 | class REINFORCE(object): 22 | def __init__(self, state_dim, action_dim): 23 | self.state_dim = state_dim 24 | self.action_dim = action_dim 25 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network 26 | self.lr = 4e-4 # learning rate 27 | self.GAMMA = 0.99 # discount factor 28 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 29 | 30 | self.policy = Policy(state_dim, action_dim, self.hidden_width) 31 | self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr) 32 | 33 | def choose_action(self, s, deterministic): 34 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 35 | prob_weights = self.policy(s).detach().numpy().flatten() # probability distribution(numpy) 36 | if deterministic: # We use the deterministic policy during the evaluating 37 | a = np.argmax(prob_weights) # Select the action with the highest probability 38 | return a 39 | else: # We use the stochastic policy during the training 40 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution 41 | return a 42 | 43 | def store(self, s, a, r): 44 | self.episode_s.append(s) 45 | self.episode_a.append(a) 46 | self.episode_r.append(r) 47 | 48 | def learn(self, ): 49 | G = [] 50 | g = 0 51 | for r in reversed(self.episode_r): # calculate the return G reversely 52 | g = self.GAMMA * g + r 53 | G.insert(0, g) 54 | 55 | for t in range(len(self.episode_r)): 56 | s = torch.unsqueeze(torch.tensor(self.episode_s[t], dtype=torch.float), 0) 57 | a = self.episode_a[t] 58 | g = G[t] 59 | 60 | a_prob = self.policy(s).flatten() 61 | policy_loss = -pow(self.GAMMA, t) * g * torch.log(a_prob[a]) 62 | self.policy_optimizer.zero_grad() 63 | policy_loss.backward() 64 | self.policy_optimizer.step() 65 | 66 | # Clean the buffer 67 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 68 | 69 | 70 | def evaluate_policy(env, agent): 71 | times = 3 # Perform three evaluations and calculate the average 72 | evaluate_reward = 0 73 | for _ in range(times): 74 | s = env.reset() 75 | done = False 76 | episode_reward = 0 77 | while not done: 78 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 79 | s_, r, done, _ = env.step(a) 80 | episode_reward += r 81 | s = s_ 82 | evaluate_reward += episode_reward 83 | 84 | return int(evaluate_reward / times) 85 | 86 | 87 | if __name__ == '__main__': 88 | env_name = ['CartPole-v0', 'CartPole-v1'] 89 | env_index = 0 # The index of the environments above 90 | env = gym.make(env_name[env_index]) 91 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an 
environment 92 | number = 1 93 | seed = 0 94 | env.seed(seed) 95 | env_evaluate.seed(seed) 96 | np.random.seed(seed) 97 | torch.manual_seed(seed) 98 | 99 | state_dim = env.observation_space.shape[0] 100 | action_dim = env.action_space.n 101 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 102 | print("state_dim={}".format(state_dim)) 103 | print("action_dim={}".format(action_dim)) 104 | print("max_episode_steps={}".format(max_episode_steps)) 105 | 106 | agent = REINFORCE(state_dim, action_dim) 107 | writer = SummaryWriter(log_dir='runs/REINFORCE/REINFORCE_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # build a tensorboard 108 | 109 | max_train_steps = 1e5 # Maximum number of training steps 110 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 111 | evaluate_num = 0 # Record the number of evaluations 112 | evaluate_rewards = [] # Record the rewards during the evaluating 113 | total_steps = 0 # Record the total steps during the training 114 | 115 | while total_steps < max_train_steps: 116 | episode_steps = 0 117 | s = env.reset() 118 | done = False 119 | while not done: 120 | episode_steps += 1 121 | a = agent.choose_action(s, deterministic=False) 122 | s_, r, done, _ = env.step(a) 123 | agent.store(s, a, r) 124 | s = s_ 125 | 126 | # Evaluate the policy every 'evaluate_freq' steps 127 | if (total_steps + 1) % evaluate_freq == 0: 128 | evaluate_num += 1 129 | evaluate_reward = evaluate_policy(env_evaluate, agent) 130 | evaluate_rewards.append(evaluate_reward) 131 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 132 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 133 | if evaluate_num % 10 == 0: 134 | np.save('./data_train/REINFORCE_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 135 | 136 | total_steps += 1 137 | 138 | # An episode is over,then update 139 | agent.learn() 140 | -------------------------------------------------------------------------------- /1.REINFORCE/REINFORCE_baseline.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | class Policy(nn.Module): 10 | def __init__(self, state_dim, action_dim, hidden_width): 11 | super(Policy, self).__init__() 12 | self.l1 = nn.Linear(state_dim, hidden_width) 13 | self.l2 = nn.Linear(hidden_width, action_dim) 14 | 15 | def forward(self, s): 16 | s = F.relu(self.l1(s)) 17 | a_prob = F.softmax(self.l2(s), dim=1) 18 | return a_prob 19 | 20 | 21 | class Value(nn.Module): 22 | def __init__(self, state_dim, hidden_width): 23 | super(Value, self).__init__() 24 | self.l1 = nn.Linear(state_dim, hidden_width) 25 | self.l2 = nn.Linear(hidden_width, 1) 26 | 27 | def forward(self, s): 28 | s = F.relu(self.l1(s)) 29 | v_s = self.l2(s) 30 | return v_s 31 | 32 | 33 | class REINFORCE(object): 34 | def __init__(self, state_dim, action_dim): 35 | self.state_dim = state_dim 36 | self.action_dim = action_dim 37 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network 38 | self.lr = 4e-4 # learning rate 39 | self.GAMMA = 0.99 # discount factor 40 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 41 | 42 | self.policy = Policy(state_dim, action_dim, self.hidden_width) 43 | 
self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr) 44 | 45 | self.value = Value(state_dim, self.hidden_width) 46 | self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=self.lr) 47 | 48 | def choose_action(self, s, deterministic): 49 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 50 | prob_weights = self.policy(s).detach().numpy().flatten() # probability distribution(numpy) 51 | if deterministic: # We use the deterministic policy during the evaluating 52 | a = np.argmax(prob_weights) # Select the action with the highest probability 53 | return a 54 | else: # We use the stochastic policy during the training 55 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution 56 | return a 57 | 58 | def store(self, s, a, r): 59 | self.episode_s.append(s) 60 | self.episode_a.append(a) 61 | self.episode_r.append(r) 62 | 63 | def learn(self, ): 64 | G = [] 65 | g = 0 66 | for r in reversed(self.episode_r): # calculate the return G reversely 67 | g = self.GAMMA * g + r 68 | G.insert(0, g) 69 | 70 | for t in range(len(self.episode_r)): 71 | s = torch.unsqueeze(torch.tensor(self.episode_s[t], dtype=torch.float), 0) 72 | a = self.episode_a[t] 73 | g = G[t] 74 | v_s = self.value(s).flatten() 75 | 76 | # Update policy 77 | a_prob = self.policy(s).flatten() 78 | policy_loss = -pow(self.GAMMA, t) * ((g - v_s).detach()) * torch.log(a_prob[a]) 79 | self.policy_optimizer.zero_grad() 80 | policy_loss.backward() 81 | self.policy_optimizer.step() 82 | 83 | # Update value function 84 | value_loss = (g - v_s) ** 2 85 | self.value_optimizer.zero_grad() 86 | value_loss.backward() 87 | self.value_optimizer.step() 88 | 89 | # Clean the buffer 90 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 91 | 92 | 93 | def evaluate_policy(env, agent): 94 | times = 3 # Perform three evaluations and calculate the average 95 | evaluate_reward = 0 96 | for _ in range(times): 97 | s = env.reset() 98 | done = False 99 | episode_reward = 0 100 | while not done: 101 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 102 | s_, r, done, _ = env.step(a) 103 | episode_reward += r 104 | s = s_ 105 | evaluate_reward += episode_reward 106 | 107 | return int(evaluate_reward / times) 108 | 109 | 110 | if __name__ == '__main__': 111 | env_name = ['CartPole-v0', 'CartPole-v1'] 112 | env_index = 0 # The index of the environments above 113 | env = gym.make(env_name[env_index]) 114 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 115 | number = 1 116 | seed = 0 117 | env.seed(seed) 118 | env_evaluate.seed(seed) 119 | np.random.seed(seed) 120 | torch.manual_seed(seed) 121 | 122 | state_dim = env.observation_space.shape[0] 123 | action_dim = env.action_space.n 124 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 125 | print("state_dim={}".format(state_dim)) 126 | print("action_dim={}".format(action_dim)) 127 | print("max_episode_steps={}".format(max_episode_steps)) 128 | 129 | agent = REINFORCE(state_dim, action_dim) 130 | writer = SummaryWriter(log_dir='runs/REINFORCE/REINFORCE_baseline_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # build a tensorboard 131 | 132 | max_train_steps = 1e5 # Maximum number of training steps 133 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 134 | evaluate_num = 0 # Record the number of evaluations 
135 | evaluate_rewards = [] # Record the rewards during the evaluating 136 | total_steps = 0 # Record the total steps during the training 137 | 138 | while total_steps < max_train_steps: 139 | episode_steps = 0 140 | s = env.reset() 141 | done = False 142 | while not done: 143 | episode_steps += 1 144 | a = agent.choose_action(s, deterministic=False) 145 | s_, r, done, _ = env.step(a) 146 | agent.store(s, a, r) 147 | s = s_ 148 | 149 | # Evaluate the policy every 'evaluate_freq' steps 150 | if (total_steps + 1) % evaluate_freq == 0: 151 | evaluate_num += 1 152 | evaluate_reward = evaluate_policy(env_evaluate, agent) 153 | evaluate_rewards.append(evaluate_reward) 154 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 155 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 156 | if evaluate_num % 10 == 0: 157 | np.save('./data_train/REINFORCE_baseline_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 158 | 159 | total_steps += 1 160 | 161 | # An episode is over,then update 162 | agent.learn() 163 | -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121668.李智.23156.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121668.李智.23156.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121786.李智.23156.1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121786.李智.23156.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121899.李智.23156.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121899.李智.23156.2 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121670.李智.15096.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121670.李智.15096.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121797.李智.15096.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121797.李智.15096.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121918.李智.15096.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121918.李智.15096.2 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121512.李智.32424.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121512.李智.32424.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121583.李智.32424.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121583.李智.32424.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121655.李智.32424.2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121655.李智.32424.2 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121526.李智.11256.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121526.李智.11256.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121607.李智.11256.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121607.李智.11256.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121688.李智.11256.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121688.李智.11256.2 -------------------------------------------------------------------------------- /1.REINFORCE/training results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/training results.png -------------------------------------------------------------------------------- /2.Actor-Critic/A2C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | # The network of the actor 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim, action_dim, hidden_width): 12 | super(Actor, self).__init__() 13 | self.l1 = nn.Linear(state_dim, hidden_width) 14 | self.l2 = nn.Linear(hidden_width, action_dim) 15 | 16 | def forward(self, s): 17 | s = F.relu(self.l1(s)) 18 | a_prob = F.softmax(self.l2(s), dim=1) 19 | return a_prob 20 | 21 | 22 | # The network of the critic 23 | class Critic(nn.Module): 24 | def __init__(self, state_dim, hidden_width): 25 | super(Critic, self).__init__() 26 | self.l1 = nn.Linear(state_dim, hidden_width) 27 | self.l2 = nn.Linear(hidden_width, 1) 28 | 29 | def forward(self, s): 30 | s = F.relu(self.l1(s)) 31 | v_s = self.l2(s) 32 | return v_s 33 | 34 | 35 | class A2C(object): 36 | def __init__(self, state_dim, action_dim): 37 | self.state_dim = state_dim 38 | self.action_dim = action_dim 39 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network 40 | self.lr = 5e-4 # learning rate 41 | self.GAMMA = 0.99 # discount factor 42 | self.I = 1 43 | 44 | self.actor = Actor(state_dim, action_dim, self.hidden_width) 45 | self.actor_optimizer = 
torch.optim.Adam(self.actor.parameters(), lr=self.lr) 46 | 47 | self.critic = Critic(state_dim, self.hidden_width) 48 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 49 | 50 | def choose_action(self, s, deterministic): 51 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 52 | prob_weights = self.actor(s).detach().numpy().flatten() # probability distribution(numpy) 53 | if deterministic: # We use the deterministic policy during the evaluating 54 | a = np.argmax(prob_weights) # Select the action with the highest probability 55 | return a 56 | else: # We use the stochastic policy during the training 57 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution 58 | return a 59 | 60 | def learn(self, s, a, r, s_, dw): 61 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 62 | s_ = torch.unsqueeze(torch.tensor(s_, dtype=torch.float), 0) 63 | v_s = self.critic(s).flatten() # v(s) 64 | v_s_ = self.critic(s_).flatten() # v(s') 65 | 66 | with torch.no_grad(): # td_target has no gradient 67 | td_target = r + self.GAMMA * (1 - dw) * v_s_ 68 | 69 | # Update actor 70 | log_pi = torch.log(self.actor(s).flatten()[a]) # log pi(a|s) 71 | actor_loss = -self.I * ((td_target - v_s).detach()) * log_pi # Only calculate the derivative of log_pi 72 | self.actor_optimizer.zero_grad() 73 | actor_loss.backward() 74 | self.actor_optimizer.step() 75 | 76 | # Update critic 77 | critic_loss = (td_target - v_s) ** 2 # Only calculate the derivative of v(s) 78 | self.critic_optimizer.zero_grad() 79 | critic_loss.backward() 80 | self.critic_optimizer.step() 81 | 82 | self.I *= self.GAMMA # Represent the gamma^t in th policy gradient theorem 83 | 84 | 85 | def evaluate_policy(env, agent): 86 | times = 3 # Perform three evaluations and calculate the average 87 | evaluate_reward = 0 88 | for _ in range(times): 89 | s = env.reset() 90 | done = False 91 | episode_reward = 0 92 | while not done: 93 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 94 | s_, r, done, _ = env.step(a) 95 | episode_reward += r 96 | s = s_ 97 | evaluate_reward += episode_reward 98 | 99 | return int(evaluate_reward / times) 100 | 101 | 102 | if __name__ == '__main__': 103 | env_name = ['CartPole-v0', 'CartPole-v1'] 104 | env_index = 0 105 | env = gym.make(env_name[env_index]) 106 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 107 | number = 9 108 | # Set random seed 109 | seed = 0 110 | env.seed(seed) 111 | env.action_space.seed(seed) 112 | env_evaluate.seed(seed) 113 | env_evaluate.action_space.seed(seed) 114 | np.random.seed(seed) 115 | torch.manual_seed(seed) 116 | 117 | state_dim = env.observation_space.shape[0] 118 | action_dim = env.action_space.n 119 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 120 | print("state_dim={}".format(state_dim)) 121 | print("action_dim={}".format(action_dim)) 122 | print("max_episode_steps={}".format(max_episode_steps)) 123 | 124 | agent = A2C(state_dim, action_dim) 125 | writer = SummaryWriter(log_dir='runs/A2C/A2C_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # Build a tensorboard 126 | 127 | max_train_steps = 3e5 # Maximum number of training steps 128 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 129 | evaluate_rewards = [] # Record the rewards during the evaluating 130 | evaluate_num = 0 # 
Record the number of evaluations 131 | total_steps = 0 # Record the total steps during the training 132 | 133 | while total_steps < max_train_steps: 134 | episode_steps = 0 135 | s = env.reset() 136 | done = False 137 | agent.I = 1 138 | while not done: 139 | episode_steps += 1 140 | a = agent.choose_action(s, deterministic=False) 141 | s_, r, done, _ = env.step(a) 142 | 143 | # When dead or win or reaching the max_epsiode_steps, done will be Ture, we need to distinguish them; 144 | # dw means dead or win,there is no next state s'; 145 | # but when reaching the max_episode_steps,there is a next state s' actually. 146 | if done and episode_steps != max_episode_steps: 147 | dw = True 148 | else: 149 | dw = False 150 | 151 | agent.learn(s, a, r, s_, dw) 152 | s = s_ 153 | 154 | # Evaluate the policy every 'evaluate_freq' steps 155 | if (total_steps + 1) % evaluate_freq == 0: 156 | evaluate_num += 1 157 | evaluate_reward = evaluate_policy(env_evaluate, agent) 158 | evaluate_rewards.append(evaluate_reward) 159 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 160 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 161 | # Save the rewards 162 | if evaluate_num % 10 == 0: 163 | np.save('./data_train/A2C_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 164 | 165 | total_steps += 1 166 | -------------------------------------------------------------------------------- /2.Actor-Critic/A2C_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/A2C_results.png -------------------------------------------------------------------------------- /2.Actor-Critic/README.md: -------------------------------------------------------------------------------- 1 | # Actor-Critic(A2C) 2 | This is a concise Pytorch implementation of Advantage Actor-Critic(A2C).
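For orientation, here is the one-step actor-critic update that A2C.py performs after every environment step, condensed from the learn() method reproduced above. The standalone function and its example values are illustrative only, not part of the repo:

```python
import torch

def a2c_losses(gamma, I, r, dw, v_s, v_s_next, log_pi):
    """One-step A2C losses; I is gamma**t, dw marks a true terminal state."""
    with torch.no_grad():                        # the TD target carries no gradient
        td_target = r + gamma * (1 - dw) * v_s_next
    actor_loss = -I * (td_target - v_s).detach() * log_pi   # policy-gradient term
    critic_loss = (td_target - v_s) ** 2                     # squared TD error
    return actor_loss, critic_loss

# Example with dummy values; in A2C.py, I starts at 1 and is multiplied by gamma
# after each step to represent gamma**t from the policy gradient theorem.
actor_loss, critic_loss = a2c_losses(
    gamma=0.99, I=1.0, r=1.0, dw=False,
    v_s=torch.tensor(0.5, requires_grad=True),
    v_s_next=torch.tensor(0.6, requires_grad=True),
    log_pi=torch.log(torch.tensor(0.7, requires_grad=True)))
```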
3 | 4 | ## How to use my code? 5 | You can directly run A2C.py in your own IDE.
6 | 7 | ### Training environments 8 | You can set 'env_index' in the code to change the environment.
9 | env_index=0 represents 'CartPole-v0'
10 | env_index=1 represents 'CartPole-v1'
11 | 12 | ### How to see the training results? 13 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
14 | The reward data are saved as numpy arrays in the 'data_train' folder.
15 | The training curves shown below are smoothed by averaging over a window of 10 steps.
16 | The solid line and the shaded area represent the mean and the standard deviation, respectively, over three random seeds (seed=0, 10, 100).
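The plotting script itself is not included in this folder; a minimal sketch of the 10-step moving-average smoothing described above (the file name is taken from the data_train listing) could look like this:

```python
import numpy as np

rewards = np.load('./data_train/A2C_env_CartPole-v0_number_9_seed_0.npy')

window = 10                                        # averaging window from the text
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
print(len(rewards), len(smoothed))
```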
17 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/2.Actor-Critic/A2C_results.png) 18 | -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_0.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_10.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_100.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_0.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_10.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_100.npy -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_0/events.out.tfevents.1648553119.李智.62564.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_0/events.out.tfevents.1648553119.李智.62564.0 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_10/events.out.tfevents.1648553543.李智.62564.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_10/events.out.tfevents.1648553543.李智.62564.1 -------------------------------------------------------------------------------- 
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_100/events.out.tfevents.1648554019.李智.62564.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_100/events.out.tfevents.1648554019.李智.62564.2 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_0/events.out.tfevents.1648553122.李智.63460.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_0/events.out.tfevents.1648553122.李智.63460.0 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_10/events.out.tfevents.1648553561.李智.63460.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_10/events.out.tfevents.1648553561.李智.63460.1 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_100/events.out.tfevents.1648554055.李智.63460.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_100/events.out.tfevents.1648554055.李智.63460.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/README.md: -------------------------------------------------------------------------------- 1 | # Rainbow DQN 2 | This is a concise Pytorch implementation of Rainbow DQN, including Double Q-learning, Dueling network, Noisy network, PER and n-steps Q-learning.
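Each of the five extensions can be toggled independently in Rainbow_DQN_main.py (reproduced further below), and the ablation files in data_train/ are named after the components that remain enabled. A small standalone mirror of that run-naming logic:

```python
def algorithm_name(use_double, use_dueling, use_noisy, use_per, use_n_steps):
    """Mirror of the run-naming logic in Rainbow_DQN_main.py (see further below)."""
    algorithm = 'DQN'
    if use_double and use_dueling and use_noisy and use_per and use_n_steps:
        return 'Rainbow_' + algorithm              # all five extensions enabled
    if use_double:
        algorithm += '_Double'
    if use_dueling:
        algorithm += '_Dueling'
    if use_noisy:
        algorithm += '_Noisy'
    if use_per:
        algorithm += '_PER'
    if use_n_steps:
        algorithm += '_N_steps'
    return algorithm

print(algorithm_name(True, True, True, True, True))   # Rainbow_DQN
print(algorithm_name(True, True, True, False, True))  # DQN_Double_Dueling_Noisy_N_steps
```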
3 | 4 | ## How to use my code? 5 | You can directly run Rainbow_DQN_main.py in your own IDE.
6 | 7 | ### Training environments 8 | You can set 'env_index' in the code to change the environment.
9 | env_index=0 represents 'CartPole-v1'
10 | env_index=1 represents 'LunarLander-v2'
11 | 12 | ### How to see the training results? 13 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
14 | The reward data are saved as numpy arrays in the 'data_train' folder.
15 | The training curves are shown below.
16 | The right plot is smoothed by averaging over a window of 10 steps. The solid line and the shaded area represent the mean and the standard deviation, respectively, over three random seeds (seed=0, 10, 100).
17 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/3.Rainbow_DQN/rainbow_dqn_result.png) 18 | 19 | ## Reference 20 | [1] Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep reinforcement learning[J]. Nature, 2015, 518(7540): 529-533.
21 | [2] Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double q-learning[C]//Proceedings of the AAAI conference on artificial intelligence. 2016, 30(1).
22 | [3] Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep reinforcement learning[C]//International conference on machine learning. PMLR, 2016: 1995-2003.
23 | [4] Fortunato M, Azar M G, Piot B, et al. Noisy networks for exploration[J]. arXiv preprint arXiv:1706.10295, 2017.
24 | [5] Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv preprint arXiv:1511.05952, 2015.
25 | [6] Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining improvements in deep reinforcement learning[C]//Thirty-second AAAI conference on artificial intelligence. 2018.
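For readers new to two of the components listed above, the standard n-step target and prioritized-sampling formulas behind the `--n_steps`, `--alpha`, and `--beta_init` hyperparameters in Rainbow_DQN_main.py are stated generically below; the exact implementation lives in replay_buffer.py and rainbow_dqn.py, which are not reproduced in this section, and with Double Q-learning the maximizing action is chosen by the online network:

```latex
y_t = \sum_{k=0}^{n-1} \gamma^{k} r_{t+k} + \gamma^{n} (1 - d_{t+n}) \max_{a'} Q_{\text{target}}(s_{t+n}, a'),
\qquad
P(i) = \frac{p_i^{\alpha}}{\sum_{k} p_k^{\alpha}},
\qquad
w_i = \left(N \cdot P(i)\right)^{-\beta}.
```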
26 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/Rainbow_DQN_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import gym 4 | from torch.utils.tensorboard import SummaryWriter 5 | from replay_buffer import * 6 | from rainbow_dqn import DQN 7 | import argparse 8 | 9 | 10 | class Runner: 11 | def __init__(self, args, env_name, number, seed): 12 | self.args = args 13 | self.env_name = env_name 14 | self.number = number 15 | self.seed = seed 16 | 17 | self.env = gym.make(env_name) 18 | self.env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment 19 | self.env.seed(seed) 20 | self.env.action_space.seed(seed) 21 | self.env_evaluate.seed(seed) 22 | self.env_evaluate.action_space.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | 26 | self.args.state_dim = self.env.observation_space.shape[0] 27 | self.args.action_dim = self.env.action_space.n 28 | self.args.episode_limit = self.env._max_episode_steps # Maximum number of steps per episode 29 | print("env={}".format(self.env_name)) 30 | print("state_dim={}".format(self.args.state_dim)) 31 | print("action_dim={}".format(self.args.action_dim)) 32 | print("episode_limit={}".format(self.args.episode_limit)) 33 | 34 | if args.use_per and args.use_n_steps: 35 | self.replay_buffer = N_Steps_Prioritized_ReplayBuffer(args) 36 | elif args.use_per: 37 | self.replay_buffer = Prioritized_ReplayBuffer(args) 38 | elif args.use_n_steps: 39 | self.replay_buffer = N_Steps_ReplayBuffer(args) 40 | else: 41 | self.replay_buffer = ReplayBuffer(args) 42 | self.agent = DQN(args) 43 | 44 | self.algorithm = 'DQN' 45 | if args.use_double and args.use_dueling and args.use_noisy and args.use_per and args.use_n_steps: 46 | self.algorithm = 'Rainbow_' + self.algorithm 47 | else: 48 | if args.use_double: 49 | self.algorithm += '_Double' 50 | if args.use_dueling: 51 | self.algorithm += '_Dueling' 52 | if args.use_noisy: 53 | self.algorithm += '_Noisy' 54 | if args.use_per: 55 | self.algorithm += '_PER' 56 | if args.use_n_steps: 57 | self.algorithm += "_N_steps" 58 | 59 | self.writer = SummaryWriter(log_dir='runs/DQN/{}_env_{}_number_{}_seed_{}'.format(self.algorithm, env_name, number, seed)) 60 | 61 | self.evaluate_num = 0 # Record the number of evaluations 62 | self.evaluate_rewards = [] # Record the rewards during the evaluating 63 | self.total_steps = 0 # Record the total steps during the training 64 | if args.use_noisy: # 如果使用Noisy net,就不需要epsilon贪心策略了 65 | self.epsilon = 0 66 | else: 67 | self.epsilon = self.args.epsilon_init 68 | self.epsilon_min = self.args.epsilon_min 69 | self.epsilon_decay = (self.args.epsilon_init - self.args.epsilon_min) / self.args.epsilon_decay_steps 70 | 71 | def run(self, ): 72 | self.evaluate_policy() 73 | while self.total_steps < self.args.max_train_steps: 74 | state = self.env.reset() 75 | done = False 76 | episode_steps = 0 77 | while not done: 78 | action = self.agent.choose_action(state, epsilon=self.epsilon) 79 | next_state, reward, done, _ = self.env.step(action) 80 | episode_steps += 1 81 | self.total_steps += 1 82 | 83 | if not self.args.use_noisy: # Decay epsilon 84 | self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon - self.epsilon_decay > self.epsilon_min else self.epsilon_min 85 | 86 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 87 | # terminal means dead or win,there is 
no next state s'; 88 | # but when reaching the max_episode_steps,there is a next state s' actually. 89 | if done and episode_steps != self.args.episode_limit: 90 | if self.env_name == 'LunarLander-v2': 91 | if reward <= -100: reward = -1 # good for LunarLander 92 | terminal = True 93 | else: 94 | terminal = False 95 | 96 | self.replay_buffer.store_transition(state, action, reward, next_state, terminal, done) # Store the transition 97 | state = next_state 98 | 99 | if self.replay_buffer.current_size >= self.args.batch_size: 100 | self.agent.learn(self.replay_buffer, self.total_steps) 101 | 102 | if self.total_steps % self.args.evaluate_freq == 0: 103 | self.evaluate_policy() 104 | # Save reward 105 | np.save('./data_train/{}_env_{}_number_{}_seed_{}.npy'.format(self.algorithm, self.env_name, self.number, self.seed), np.array(self.evaluate_rewards)) 106 | 107 | def evaluate_policy(self, ): 108 | evaluate_reward = 0 109 | self.agent.net.eval() 110 | for _ in range(self.args.evaluate_times): 111 | state = self.env_evaluate.reset() 112 | done = False 113 | episode_reward = 0 114 | while not done: 115 | action = self.agent.choose_action(state, epsilon=0) 116 | next_state, reward, done, _ = self.env_evaluate.step(action) 117 | episode_reward += reward 118 | state = next_state 119 | evaluate_reward += episode_reward 120 | self.agent.net.train() 121 | evaluate_reward /= self.args.evaluate_times 122 | self.evaluate_rewards.append(evaluate_reward) 123 | print("total_steps:{} \t evaluate_reward:{} \t epsilon:{}".format(self.total_steps, evaluate_reward, self.epsilon)) 124 | self.writer.add_scalar('step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps) 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = argparse.ArgumentParser("Hyperparameter Setting for DQN") 129 | parser.add_argument("--max_train_steps", type=int, default=int(4e5), help=" Maximum number of training steps") 130 | parser.add_argument("--evaluate_freq", type=float, default=1e3, help="Evaluate the policy every 'evaluate_freq' steps") 131 | parser.add_argument("--evaluate_times", type=float, default=3, help="Evaluate times") 132 | 133 | parser.add_argument("--buffer_capacity", type=int, default=int(1e5), help="The maximum replay-buffer capacity ") 134 | parser.add_argument("--batch_size", type=int, default=256, help="batch size") 135 | parser.add_argument("--hidden_dim", type=int, default=256, help="The number of neurons in hidden layers of the neural network") 136 | parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate of actor") 137 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 138 | parser.add_argument("--epsilon_init", type=float, default=0.5, help="Initial epsilon") 139 | parser.add_argument("--epsilon_min", type=float, default=0.1, help="Minimum epsilon") 140 | parser.add_argument("--epsilon_decay_steps", type=int, default=int(1e5), help="How many steps before the epsilon decays to the minimum") 141 | parser.add_argument("--tau", type=float, default=0.005, help="soft update the target network") 142 | parser.add_argument("--use_soft_update", type=bool, default=True, help="Whether to use soft update") 143 | parser.add_argument("--target_update_freq", type=int, default=200, help="Update frequency of the target network(hard update)") 144 | parser.add_argument("--n_steps", type=int, default=5, help="n_steps") 145 | parser.add_argument("--alpha", type=float, default=0.6, help="PER parameter") 146 | parser.add_argument("--beta_init", 
type=float, default=0.4, help="Important sampling parameter in PER") 147 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Learning rate Decay") 148 | parser.add_argument("--grad_clip", type=float, default=10.0, help="Gradient clip") 149 | 150 | parser.add_argument("--use_double", type=bool, default=True, help="Whether to use double Q-learning") 151 | parser.add_argument("--use_dueling", type=bool, default=True, help="Whether to use dueling network") 152 | parser.add_argument("--use_noisy", type=bool, default=True, help="Whether to use noisy network") 153 | parser.add_argument("--use_per", type=bool, default=True, help="Whether to use PER") 154 | parser.add_argument("--use_n_steps", type=bool, default=True, help="Whether to use n_steps Q-learning") 155 | 156 | args = parser.parse_args() 157 | 158 | env_names = ['CartPole-v1', 'LunarLander-v2'] 159 | env_index = 1 160 | for seed in [0, 10, 100]: 161 | runner = Runner(args=args, env_name=env_names[env_index], number=1, seed=seed) 162 | runner.run() 163 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/network.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/network.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/rainbow_dqn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/rainbow_dqn.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/sum_tree.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/sum_tree.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- 
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/drawing_Rainbow_DQN.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | 5 | 6 | def smooth(reward): 7 | smooth_reward = [] 8 | for i in range(reward.shape[0]): 9 | if i == 0: 10 | smooth_reward.append(reward[i]) 11 | else: 12 | smooth_reward.append(smooth_reward[-1] * 0.9 + reward[i] * 0.1) 13 | return np.array(smooth_reward) 14 | 15 | 16 | env_name = ['CartPole-v1', 'LunarLander-v2'] 17 | colors = ['r', 'darkorange', 'dodgerblue', 'limegreen', 'yellow', 'magenta', 'chocolate', 'indigo', 'gray', 'aqua', 'g', 'black'] 18 | 19 | 20 | def get_data(algorithm, env_index, number): 21 | reward1 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_0.npy'.format(algorithm, env_name[env_index], number))) 22 | reward2 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_10.npy'.format(algorithm, env_name[env_index], number))) 23 | reward3 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_100.npy'.format(algorithm, env_name[env_index], number))) 24 | reward = np.stack((reward1, reward2, reward3), axis=0) 25 | len = reward1.shape[0] 26 | 27 | return reward, len 28 | 29 | 30 | def drawing_CP(plt, algorithm, number, color, label): 31 | reward, len = get_data(algorithm=algorithm, env_index=0, number=number) 32 | sns.tsplot(time=np.arange(len), data=reward, color=color, linestyle='-') # color=darkorange dodgerblue 33 | plt.plot(reward.mean(0), color=color, label=label) 34 | plt.title("CartPole-v1", size=14) 35 | plt.xlabel("Steps", size=14) 36 | plt.ylabel("Reward", size=14) 37 | plt.xticks([0, 50, 100, 150], ['0', '50k', '100k', '150k'], size=14) 38 | plt.yticks(size=14) 39 | plt.ylim([0, 510]) 40 | plt.legend(loc='lower right', fontsize=14) 41 | 42 | 43 | def drawing_LL(plt, algorithm, number, color, label): 44 | reward, len = get_data(algorithm=algorithm, env_index=1, number=number) 45 | sns.tsplot(time=np.arange(len), data=reward, color=color, linestyle='-') # color=darkorange dodgerblue 46 | plt.plot(reward.mean(0), color=color, label=label) 47 | plt.title("LunarLander-v2", size=14) 48 | plt.xlabel("Steps", size=14) 49 | plt.ylabel("Reward", size=14) 50 | plt.xticks([0, 100, 200, 300, 400], ['0', '100k', '200k', '300k', '400k'], size=14) 51 | plt.yticks(size=14) 52 | plt.ylim([-300, 300]) 53 | plt.legend(loc='lower right', fontsize=14) 54 | 55 | 56 | sns.set_style('darkgrid') 57 | plt.figure() 58 | drawing_LL(plt, algorithm='Rainbow_DQN', number=1, color=colors[0], label='Rainbow_DQN') 59 | 60 | drawing_LL(plt, algorithm='DQN_dueling_Noisy_PER_N_steps', number=1, color=colors[1], label='Rainbow_DQN without Double') 61 | 62 | drawing_LL(plt, algorithm='DQN_double_Noisy_PER_N_steps', number=1, color=colors[2], label='Rainbow_DQN without Dueling') 63 | 64 | drawing_LL(plt, algorithm='DQN_double_dueling_Noisy_N_steps', number=1, color=colors[3], label='Rainbow DQN without PER') 65 | 66 | drawing_LL(plt, algorithm='DQN_double_dueling_Noisy_PER', number=1, color=colors[4], label='Rainbow_DQN without N-steps') 67 | 68 | drawing_LL(plt, algorithm='DQN_double_dueling_PER_N_steps', number=1, color=colors[9], label='Rainbow_DQN 
without Noisy') 69 | 70 | 71 | plt.show() 72 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | 7 | class Dueling_Net(nn.Module): 8 | def __init__(self, args): 9 | super(Dueling_Net, self).__init__() 10 | self.fc1 = nn.Linear(args.state_dim, args.hidden_dim) 11 | self.fc2 = nn.Linear(args.hidden_dim, args.hidden_dim) 12 | if args.use_noisy: 13 | self.V = NoisyLinear(args.hidden_dim, 1) 14 | self.A = NoisyLinear(args.hidden_dim, args.action_dim) 15 | else: 16 | self.V = nn.Linear(args.hidden_dim, 1) 17 | self.A = nn.Linear(args.hidden_dim, args.action_dim) 18 | 19 | def forward(self, s): 20 | s = torch.relu(self.fc1(s)) 21 | s = torch.relu(self.fc2(s)) 22 | V = self.V(s) # batch_size X 1 23 | A = self.A(s) # batch_size X action_dim 24 | Q = V + (A - torch.mean(A, dim=-1, keepdim=True)) # Q(s,a)=V(s)+A(s,a)-mean(A(s,a)) 25 | return Q 26 | 27 | 28 | class Net(nn.Module): 29 | def __init__(self, args): 30 | super(Net, self).__init__() 31 | self.fc1 = nn.Linear(args.state_dim, args.hidden_dim) 32 | self.fc2 = nn.Linear(args.hidden_dim, args.hidden_dim) 33 | if args.use_noisy: 34 | self.fc3 = NoisyLinear(args.hidden_dim, args.action_dim) 35 | else: 36 | self.fc3 = nn.Linear(args.hidden_dim, args.action_dim) 37 | 38 | def forward(self, s): 39 | s = torch.relu(self.fc1(s)) 40 | s = torch.relu(self.fc2(s)) 41 | Q = self.fc3(s) 42 | return Q 43 | 44 | 45 | class NoisyLinear(nn.Module): 46 | def __init__(self, in_features, out_features, sigma_init=0.5): 47 | super(NoisyLinear, self).__init__() 48 | self.in_features = in_features 49 | self.out_features = out_features 50 | self.sigma_init = sigma_init 51 | 52 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 53 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 54 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 55 | 56 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 57 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features)) 58 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 59 | 60 | self.reset_parameters() 61 | self.reset_noise() 62 | 63 | def forward(self, x): 64 | if self.training: 65 | self.reset_noise() 66 | weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon) # mul是对应元素相乘 67 | bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon) 68 | 69 | else: 70 | weight = self.weight_mu 71 | bias = self.bias_mu 72 | 73 | return F.linear(x, weight, bias) 74 | 75 | def reset_parameters(self): 76 | mu_range = 1 / math.sqrt(self.in_features) 77 | self.weight_mu.data.uniform_(-mu_range, mu_range) 78 | self.bias_mu.data.uniform_(-mu_range, mu_range) 79 | 80 | self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features)) 81 | self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.out_features)) # 这里要除以out_features 82 | 83 | def reset_noise(self): 84 | epsilon_i = self.scale_noise(self.in_features) 85 | epsilon_j = self.scale_noise(self.out_features) 86 | self.weight_epsilon.copy_(torch.ger(epsilon_j, epsilon_i)) 87 | self.bias_epsilon.copy_(epsilon_j) 88 | 89 | def scale_noise(self, size): 90 | x = torch.randn(size) # torch.randn产生标准高斯分布 91 | x = x.sign().mul(x.abs().sqrt()) 92 | return x 93 | 
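# A minimal, illustrative sanity check of NoisyLinear (not part of training; it only
# demonstrates the behaviour implemented above): in training mode the layer resamples its
# factorized Gaussian noise on every forward pass, while in eval mode only the deterministic
# mu parameters are used, so repeated calls return identical outputs.
if __name__ == '__main__':
    torch.manual_seed(0)
    layer = NoisyLinear(in_features=4, out_features=2)
    x = torch.randn(1, 4)
    layer.train()
    out_a, out_b = layer(x), layer(x)   # expected: different (noise resampled each call)
    layer.eval()
    out_c, out_d = layer(x), layer(x)   # expected: identical (noise disabled in eval mode)
    print(torch.allclose(out_a, out_b), torch.allclose(out_c, out_d))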
-------------------------------------------------------------------------------- /3.Rainbow_DQN/rainbow_dqn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import copy 4 | from network import Dueling_Net, Net 5 | 6 | 7 | class DQN(object): 8 | def __init__(self, args): 9 | self.action_dim = args.action_dim 10 | self.batch_size = args.batch_size # batch size 11 | self.max_train_steps = args.max_train_steps 12 | self.lr = args.lr # learning rate 13 | self.gamma = args.gamma # discount factor 14 | self.tau = args.tau # Soft update 15 | self.use_soft_update = args.use_soft_update 16 | self.target_update_freq = args.target_update_freq # hard update 17 | self.update_count = 0 18 | 19 | self.grad_clip = args.grad_clip 20 | self.use_lr_decay = args.use_lr_decay 21 | self.use_double = args.use_double 22 | self.use_dueling = args.use_dueling 23 | self.use_per = args.use_per 24 | self.use_n_steps = args.use_n_steps 25 | if self.use_n_steps: 26 | self.gamma = self.gamma ** args.n_steps 27 | 28 | if self.use_dueling: # Whether to use the 'dueling network' 29 | self.net = Dueling_Net(args) 30 | else: 31 | self.net = Net(args) 32 | 33 | self.target_net = copy.deepcopy(self.net) # Copy the online_net to the target_net 34 | 35 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr) 36 | 37 | def choose_action(self, state, epsilon): 38 | with torch.no_grad(): 39 | state = torch.unsqueeze(torch.tensor(state, dtype=torch.float), 0) 40 | q = self.net(state) 41 | if np.random.uniform() > epsilon: 42 | action = q.argmax(dim=-1).item() 43 | else: 44 | action = np.random.randint(0, self.action_dim) 45 | return action 46 | 47 | def learn(self, replay_buffer, total_steps): 48 | batch, batch_index, IS_weight = replay_buffer.sample(total_steps) 49 | 50 | with torch.no_grad(): # q_target has no gradient 51 | if self.use_double: # Whether to use the 'double q-learning' 52 | # Use online_net to select the action 53 | a_argmax = self.net(batch['next_state']).argmax(dim=-1, keepdim=True) # shape:(batch_size,1) 54 | # Use target_net to estimate the q_target 55 | q_target = batch['reward'] + self.gamma * (1 - batch['terminal']) * self.target_net(batch['next_state']).gather(-1, a_argmax).squeeze(-1) # shape:(batch_size,) 56 | else: 57 | q_target = batch['reward'] + self.gamma * (1 - batch['terminal']) * self.target_net(batch['next_state']).max(dim=-1)[0] # shape:(batch_size,) 58 | 59 | q_current = self.net(batch['state']).gather(-1, batch['action']).squeeze(-1) # shape:(batch_size,) 60 | td_errors = q_current - q_target # shape:(batch_size,) 61 | 62 | if self.use_per: 63 | loss = (IS_weight * (td_errors ** 2)).mean() 64 | replay_buffer.update_batch_priorities(batch_index, td_errors.detach().numpy()) 65 | else: 66 | loss = (td_errors ** 2).mean() 67 | 68 | self.optimizer.zero_grad() 69 | loss.backward() 70 | if self.grad_clip: 71 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_clip) 72 | self.optimizer.step() 73 | 74 | if self.use_soft_update: # soft update 75 | for param, target_param in zip(self.net.parameters(), self.target_net.parameters()): 76 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 77 | else: # hard update 78 | self.update_count += 1 79 | if self.update_count % self.target_update_freq == 0: 80 | self.target_net.load_state_dict(self.net.state_dict()) 81 | 82 | if self.use_lr_decay: # learning rate Decay 83 | self.lr_decay(total_steps) 84 | 85 | def lr_decay(self, 
total_steps): 86 | lr_now = 0.9 * self.lr * (1 - total_steps / self.max_train_steps) + 0.1 * self.lr 87 | for p in self.optimizer.param_groups: 88 | p['lr'] = lr_now 89 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/rainbow_dqn_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/rainbow_dqn_result.png -------------------------------------------------------------------------------- /3.Rainbow_DQN/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from collections import deque 4 | from sum_tree import SumTree 5 | 6 | 7 | class ReplayBuffer(object): 8 | def __init__(self, args): 9 | self.batch_size = args.batch_size 10 | self.buffer_capacity = args.buffer_capacity 11 | self.current_size = 0 12 | self.count = 0 13 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 14 | 'action': np.zeros((self.buffer_capacity, 1)), 15 | 'reward': np.zeros(self.buffer_capacity), 16 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 17 | 'terminal': np.zeros(self.buffer_capacity), 18 | } 19 | 20 | def store_transition(self, state, action, reward, next_state, terminal, done): 21 | self.buffer['state'][self.count] = state 22 | self.buffer['action'][self.count] = action 23 | self.buffer['reward'][self.count] = reward 24 | self.buffer['next_state'][self.count] = next_state 25 | self.buffer['terminal'][self.count] = terminal 26 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0. 
27 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 28 | 29 | def sample(self, total_steps): 30 | index = np.random.randint(0, self.current_size, size=self.batch_size) 31 | batch = {} 32 | for key in self.buffer.keys(): # numpy->tensor 33 | if key == 'action': 34 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.long) 35 | else: 36 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.float32) 37 | 38 | return batch, None, None 39 | 40 | 41 | class N_Steps_ReplayBuffer(object): 42 | def __init__(self, args): 43 | self.gamma = args.gamma 44 | self.batch_size = args.batch_size 45 | self.buffer_capacity = args.buffer_capacity 46 | self.current_size = 0 47 | self.count = 0 48 | self.n_steps = args.n_steps 49 | self.n_steps_deque = deque(maxlen=self.n_steps) 50 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 51 | 'action': np.zeros((self.buffer_capacity, 1)), 52 | 'reward': np.zeros(self.buffer_capacity), 53 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 54 | 'terminal': np.zeros(self.buffer_capacity), 55 | } 56 | 57 | def store_transition(self, state, action, reward, next_state, terminal, done): 58 | transition = (state, action, reward, next_state, terminal, done) 59 | self.n_steps_deque.append(transition) 60 | if len(self.n_steps_deque) == self.n_steps: 61 | state, action, n_steps_reward, next_state, terminal = self.get_n_steps_transition() 62 | self.buffer['state'][self.count] = state 63 | self.buffer['action'][self.count] = action 64 | self.buffer['reward'][self.count] = n_steps_reward 65 | self.buffer['next_state'][self.count] = next_state 66 | self.buffer['terminal'][self.count] = terminal 67 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0. 
68 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 69 | 70 | def get_n_steps_transition(self): 71 | state, action = self.n_steps_deque[0][:2] 72 | next_state, terminal = self.n_steps_deque[-1][3:5] 73 | n_steps_reward = 0 74 | for i in reversed(range(self.n_steps)): 75 | r, s_, ter, d = self.n_steps_deque[i][2:] 76 | n_steps_reward = r + self.gamma * (1 - d) * n_steps_reward 77 | if d: 78 | next_state, terminal = s_, ter 79 | 80 | return state, action, n_steps_reward, next_state, terminal 81 | 82 | def sample(self, total_steps): 83 | index = np.random.randint(0, self.current_size, size=self.batch_size) 84 | batch = {} 85 | for key in self.buffer.keys(): # numpy->tensor 86 | if key == 'action': 87 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.long) 88 | else: 89 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.float32) 90 | 91 | return batch, None, None 92 | 93 | 94 | class Prioritized_ReplayBuffer(object): 95 | def __init__(self, args): 96 | self.max_train_steps = args.max_train_steps 97 | self.alpha = args.alpha 98 | self.beta_init = args.beta_init 99 | self.beta = args.beta_init 100 | self.batch_size = args.batch_size 101 | self.buffer_capacity = args.buffer_capacity 102 | self.sum_tree = SumTree(self.buffer_capacity) 103 | self.current_size = 0 104 | self.count = 0 105 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 106 | 'action': np.zeros((self.buffer_capacity, 1)), 107 | 'reward': np.zeros(self.buffer_capacity), 108 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 109 | 'terminal': np.zeros(self.buffer_capacity), 110 | } 111 | 112 | def store_transition(self, state, action, reward, next_state, terminal, done): 113 | self.buffer['state'][self.count] = state 114 | self.buffer['action'][self.count] = action 115 | self.buffer['reward'][self.count] = reward 116 | self.buffer['next_state'][self.count] = next_state 117 | self.buffer['terminal'][self.count] = terminal 118 | # 如果是第一条经验,初始化优先级为1.0;否则,对于新存入的经验,指定为当前最大的优先级 119 | priority = 1.0 if self.current_size == 0 else self.sum_tree.priority_max 120 | self.sum_tree.update(data_index=self.count, priority=priority) # 更新当前经验在sum_tree中的优先级 121 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0. 
122 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 123 | 124 | def sample(self, total_steps): 125 | batch_index, IS_weight = self.sum_tree.get_batch_index(current_size=self.current_size, batch_size=self.batch_size, beta=self.beta) 126 | self.beta = self.beta_init + (1 - self.beta_init) * (total_steps / self.max_train_steps) # beta:beta_init->1.0 127 | batch = {} 128 | for key in self.buffer.keys(): # numpy->tensor 129 | if key == 'action': 130 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.long) 131 | else: 132 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.float32) 133 | 134 | return batch, batch_index, IS_weight 135 | 136 | def update_batch_priorities(self, batch_index, td_errors): # 根据传入的td_error,更新batch_index所对应数据的priorities 137 | priorities = (np.abs(td_errors) + 0.01) ** self.alpha 138 | for index, priority in zip(batch_index, priorities): 139 | self.sum_tree.update(data_index=index, priority=priority) 140 | 141 | 142 | class N_Steps_Prioritized_ReplayBuffer(object): 143 | def __init__(self, args): 144 | self.max_train_steps = args.max_train_steps 145 | self.alpha = args.alpha 146 | self.beta_init = args.beta_init 147 | self.beta = args.beta_init 148 | self.gamma = args.gamma 149 | self.batch_size = args.batch_size 150 | self.buffer_capacity = args.buffer_capacity 151 | self.sum_tree = SumTree(self.buffer_capacity) 152 | self.n_steps = args.n_steps 153 | self.n_steps_deque = deque(maxlen=self.n_steps) 154 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 155 | 'action': np.zeros((self.buffer_capacity, 1)), 156 | 'reward': np.zeros(self.buffer_capacity), 157 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 158 | 'terminal': np.zeros(self.buffer_capacity), 159 | } 160 | self.current_size = 0 161 | self.count = 0 162 | 163 | def store_transition(self, state, action, reward, next_state, terminal, done): 164 | transition = (state, action, reward, next_state, terminal, done) 165 | self.n_steps_deque.append(transition) 166 | if len(self.n_steps_deque) == self.n_steps: 167 | state, action, n_steps_reward, next_state, terminal = self.get_n_steps_transition() 168 | self.buffer['state'][self.count] = state 169 | self.buffer['action'][self.count] = action 170 | self.buffer['reward'][self.count] = n_steps_reward 171 | self.buffer['next_state'][self.count] = next_state 172 | self.buffer['terminal'][self.count] = terminal 173 | # 如果是buffer中的第一条经验,那么指定priority为1.0;否则对于新存入的经验,指定为当前最大的priority 174 | priority = 1.0 if self.current_size == 0 else self.sum_tree.priority_max 175 | self.sum_tree.update(data_index=self.count, priority=priority) # 更新当前经验在sum_tree中的优先级 176 | self.count = (self.count + 1) % self.buffer_capacity # When 'count' reaches buffer_capacity, it will be reset to 0. 
177 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 178 | 179 | def sample(self, total_steps): 180 | batch_index, IS_weight = self.sum_tree.get_batch_index(current_size=self.current_size, batch_size=self.batch_size, beta=self.beta) 181 | self.beta = self.beta_init + (1 - self.beta_init) * (total_steps / self.max_train_steps) # beta:beta_init->1.0 182 | batch = {} 183 | for key in self.buffer.keys(): # numpy->tensor 184 | if key == 'action': 185 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.long) 186 | else: 187 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.float32) 188 | 189 | return batch, batch_index, IS_weight 190 | 191 | def get_n_steps_transition(self): 192 | state, action = self.n_steps_deque[0][:2] # 获取deque中第一个transition的s和a 193 | next_state, terminal = self.n_steps_deque[-1][3:5] # 获取deque中最后一个transition的s'和terminal 194 | n_steps_reward = 0 195 | for i in reversed(range(self.n_steps)): # 逆序计算n_steps_reward 196 | r, s_, ter, d = self.n_steps_deque[i][2:] 197 | n_steps_reward = r + self.gamma * (1 - d) * n_steps_reward 198 | if d: # 如果done=True,说明一个回合结束,保存deque中当前这个transition的s'和terminal作为这个n_steps_transition的next_state和terminal 199 | next_state, terminal = s_, ter 200 | 201 | return state, action, n_steps_reward, next_state, terminal 202 | 203 | def update_batch_priorities(self, batch_index, td_errors): # 根据传入的td_error,更新batch_index所对应数据的priorities 204 | priorities = (np.abs(td_errors) + 0.01) ** self.alpha 205 | for index, priority in zip(batch_index, priorities): 206 | self.sum_tree.update(data_index=index, priority=priority) 207 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0 
-------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/sum_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class SumTree(object): 5 | """ 6 | Story data with its priority in the tree. 7 | Tree structure and array storage: 8 | 9 | Tree index: 10 | 0 -> storing priority sum 11 | / \ 12 | 1 2 13 | / \ / \ 14 | 3 4 5 6 -> storing priority for transitions 15 | 16 | Array type for storing: 17 | [0,1,2,3,4,5,6] 18 | """ 19 | 20 | def __init__(self, buffer_capacity): 21 | self.buffer_capacity = buffer_capacity # buffer的容量 22 | self.tree_capacity = 2 * buffer_capacity - 1 # sum_tree的容量 23 | self.tree = np.zeros(self.tree_capacity) 24 | 25 | def update(self, data_index, priority): 26 | # data_index表示当前数据在buffer中的index 27 | # tree_index表示当前数据在sum_tree中的index 28 | tree_index = data_index + self.buffer_capacity - 1 # 把当前数据在buffer中的index转换为在sum_tree中的index 29 | change = priority - self.tree[tree_index] # 当前数据的priority的改变量 30 | self.tree[tree_index] = priority # 更新树的最后一层叶子节点的优先级 31 | # then propagate the change through the tree 32 | while tree_index != 0: # 更新上层节点的优先级,一直传播到最顶端 33 | tree_index = (tree_index - 1) // 2 34 | self.tree[tree_index] += change 35 | 36 | def get_index(self, v): 37 | parent_idx = 0 # 从树的顶端开始 38 | while True: 39 | child_left_idx = 2 * parent_idx + 1 # 父节点下方的左右两个子节点的index 40 | child_right_idx = child_left_idx + 1 41 | if child_left_idx >= self.tree_capacity: # reach bottom, end search 42 | tree_index = parent_idx # tree_index表示采样到的数据在sum_tree中的index 43 | break 44 | else: # downward search, always search for a higher priority node 45 | if v <= self.tree[child_left_idx]: 46 | parent_idx = child_left_idx 47 | else: 48 | v -= self.tree[child_left_idx] 49 | parent_idx = child_right_idx 50 | 51 | data_index = tree_index - self.buffer_capacity + 1 # tree_index->data_index 52 | return data_index, self.tree[tree_index] # 返回采样到的data在buffer中的index,以及相对应的priority 53 | 54 | def get_batch_index(self, current_size, batch_size, beta): 55 | batch_index = np.zeros(batch_size, dtype=np.long) 56 | IS_weight = torch.zeros(batch_size, dtype=torch.float32) 57 | segment = self.priority_sum / batch_size # 把[0,priority_sum]等分成batch_size个区间,在每个区间均匀采样一个数 58 | for i in range(batch_size): 59 | a = segment * i 60 | b = segment * (i + 1) 61 | v = np.random.uniform(a, b) 62 | index, priority = self.get_index(v) 63 | batch_index[i] = index 64 | prob = priority / 
self.priority_sum # 当前数据被采样的概率 65 | IS_weight[i] = (current_size * prob) ** (-beta) 66 | IS_weight /= IS_weight.max() # normalization 67 | 68 | return batch_index, IS_weight 69 | 70 | @property 71 | def priority_sum(self): 72 | return self.tree[0] # 树的顶端保存了所有priority之和 73 | 74 | @property 75 | def priority_max(self): 76 | return self.tree[self.buffer_capacity - 1:].max() # 树的最后一层叶节点,保存的才是每个数据对应的priority 77 | -------------------------------------------------------------------------------- /4.PPO-discrete/PPO_discrete_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.tensorboard import SummaryWriter 4 | import gym 5 | import argparse 6 | from normalization import Normalization, RewardScaling 7 | from replaybuffer import ReplayBuffer 8 | from ppo_discrete import PPO_discrete 9 | 10 | 11 | def evaluate_policy(args, env, agent, state_norm): 12 | times = 3 13 | evaluate_reward = 0 14 | for _ in range(times): 15 | s = env.reset() 16 | if args.use_state_norm: # During the evaluating,update=False 17 | s = state_norm(s, update=False) 18 | done = False 19 | episode_reward = 0 20 | while not done: 21 | a = agent.evaluate(s) # We use the deterministic policy during the evaluating 22 | s_, r, done, _ = env.step(a) 23 | if args.use_state_norm: 24 | s_ = state_norm(s_, update=False) 25 | episode_reward += r 26 | s = s_ 27 | evaluate_reward += episode_reward 28 | 29 | return evaluate_reward / times 30 | 31 | 32 | def main(args, env_name, number, seed): 33 | env = gym.make(env_name) 34 | env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment 35 | # Set random seed 36 | env.seed(seed) 37 | env.action_space.seed(seed) 38 | env_evaluate.seed(seed) 39 | env_evaluate.action_space.seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | args.state_dim = env.observation_space.shape[0] 44 | args.action_dim = env.action_space.n 45 | args.max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 46 | print("env={}".format(env_name)) 47 | print("state_dim={}".format(args.state_dim)) 48 | print("action_dim={}".format(args.action_dim)) 49 | print("max_episode_steps={}".format(args.max_episode_steps)) 50 | 51 | evaluate_num = 0 # Record the number of evaluations 52 | evaluate_rewards = [] # Record the rewards during the evaluating 53 | total_steps = 0 # Record the total steps during the training 54 | 55 | replay_buffer = ReplayBuffer(args) 56 | agent = PPO_discrete(args) 57 | 58 | # Build a tensorboard 59 | writer = SummaryWriter(log_dir='runs/PPO_discrete/env_{}_number_{}_seed_{}'.format(env_name, number, seed)) 60 | 61 | state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization 62 | if args.use_reward_norm: # Trick 3:reward normalization 63 | reward_norm = Normalization(shape=1) 64 | elif args.use_reward_scaling: # Trick 4:reward scaling 65 | reward_scaling = RewardScaling(shape=1, gamma=args.gamma) 66 | 67 | while total_steps < args.max_train_steps: 68 | s = env.reset() 69 | if args.use_state_norm: 70 | s = state_norm(s) 71 | if args.use_reward_scaling: 72 | reward_scaling.reset() 73 | episode_steps = 0 74 | done = False 75 | while not done: 76 | episode_steps += 1 77 | a, a_logprob = agent.choose_action(s) # Action and the corresponding log probability 78 | s_, r, done, _ = env.step(a) 79 | 80 | if args.use_state_norm: 81 | s_ = state_norm(s_) 82 | if args.use_reward_norm: 83 | r = reward_norm(r) 84 | elif 
args.use_reward_scaling: 85 | r = reward_scaling(r) 86 | 87 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 88 | # dw means dead or win,there is no next state s'; 89 | # but when reaching the max_episode_steps,there is a next state s' actually. 90 | if done and episode_steps != args.max_episode_steps: 91 | dw = True 92 | else: 93 | dw = False 94 | 95 | replay_buffer.store(s, a, a_logprob, r, s_, dw, done) 96 | s = s_ 97 | total_steps += 1 98 | 99 | # When the number of transitions in buffer reaches batch_size,then update 100 | if replay_buffer.count == args.batch_size: 101 | agent.update(replay_buffer, total_steps) 102 | replay_buffer.count = 0 103 | 104 | # Evaluate the policy every 'evaluate_freq' steps 105 | if total_steps % args.evaluate_freq == 0: 106 | evaluate_num += 1 107 | evaluate_reward = evaluate_policy(args, env_evaluate, agent, state_norm) 108 | evaluate_rewards.append(evaluate_reward) 109 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 110 | writer.add_scalar('step_rewards_{}'.format(env_name), evaluate_rewards[-1], global_step=total_steps) 111 | # Save the rewards 112 | if evaluate_num % args.save_freq == 0: 113 | np.save('./data_train/PPO_discrete_env_{}_number_{}_seed_{}.npy'.format(env_name, number, seed), np.array(evaluate_rewards)) 114 | 115 | 116 | if __name__ == '__main__': 117 | parser = argparse.ArgumentParser("Hyperparameter Setting for PPO-discrete") 118 | parser.add_argument("--max_train_steps", type=int, default=int(2e5), help=" Maximum number of training steps") 119 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps") 120 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency") 121 | parser.add_argument("--batch_size", type=int, default=2048, help="Batch size") 122 | parser.add_argument("--mini_batch_size", type=int, default=64, help="Minibatch size") 123 | parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network") 124 | parser.add_argument("--lr_a", type=float, default=3e-4, help="Learning rate of actor") 125 | parser.add_argument("--lr_c", type=float, default=3e-4, help="Learning rate of critic") 126 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 127 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter") 128 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter") 129 | parser.add_argument("--K_epochs", type=int, default=10, help="PPO parameter") 130 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization") 131 | parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2:state normalization") 132 | parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3:reward normalization") 133 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling") 134 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy") 135 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay") 136 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip") 137 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization") 138 | 
parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5") 139 | parser.add_argument("--use_tanh", type=float, default=True, help="Trick 10: tanh activation function") 140 | 141 | args = parser.parse_args() 142 | 143 | env_name = ['CartPole-v1', 'LunarLander-v2'] 144 | env_index = 1 145 | main(args, env_name=env_name[env_index], number=1, seed=0) 146 | -------------------------------------------------------------------------------- /4.PPO-discrete/README.md: -------------------------------------------------------------------------------- 1 | # PPO-discrete 2 | This is a concise Pytorch implementation of PPO on discrete action space with 10 tricks.
3 | 4 | ## 10 tricks 5 | Trick 1: Advantage Normalization (see the sketch after this list).
6 | Trick 2: State Normalization.
7 | Trick 3 & Trick 4: Reward Normalization & Reward Scaling.
8 | Trick 5: Policy Entropy.
9 | Trick 6: Learning Rate Decay.
10 | Trick 7: Gradient Clip.
11 | Trick 8: Orthogonal Initialization.
12 | Trick 9: Adam Optimizer Epsilon Parameter.
13 | Trick 10: Tanh Activation Function.
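A minimal sketch of Trick 1, assuming a tensor `adv` of GAE advantages; `normalize_advantage` is an illustrative helper name, and the actual one-liner lives in `ppo_discrete.py`:

```python
import torch

def normalize_advantage(adv: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Standardize the advantages within each update before building the PPO surrogate loss.
    return (adv - adv.mean()) / (adv.std() + eps)
```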
14 | 15 | ## How to use my code? 16 | You can directly run 'PPO_discrete_main.py' in your own IDE.
17 | 18 | ## Training environments 19 | You can set 'env_index' in the code to change the environment. Here, we train in 2 environments, as shown below.
20 | env_index=0 selects 'CartPole-v1'
21 | env_index=1 selects 'LunarLander-v2'
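For example, to train on 'CartPole-v1' instead, change the index at the bottom of 'PPO_discrete_main.py' (this mirrors the existing `__main__` block of that script):

```python
env_name = ['CartPole-v1', 'LunarLander-v2']
env_index = 0  # 0 -> 'CartPole-v1', 1 -> 'LunarLander-v2'
main(args, env_name=env_name[env_index], number=1, seed=0)
```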
22 | 23 | ## Training result 24 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/4.PPO-discrete/training_result.png) 25 | 26 | ## Tutorial 27 | If you can read Chinese, you can get more information from this blog.https://zhuanlan.zhihu.com/p/512327050 28 | -------------------------------------------------------------------------------- /4.PPO-discrete/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /4.PPO-discrete/__pycache__/ppo_discrete.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/ppo_discrete.cpython-37.pyc -------------------------------------------------------------------------------- /4.PPO-discrete/__pycache__/replaybuffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/replaybuffer.cpython-37.pyc -------------------------------------------------------------------------------- /4.PPO-discrete/data_train/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to store the training reward data. -------------------------------------------------------------------------------- /4.PPO-discrete/normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd: 5 | # Dynamically calculate mean and std 6 | def __init__(self, shape): # shape:the dimension of input data 7 | self.n = 0 8 | self.mean = np.zeros(shape) 9 | self.S = np.zeros(shape) 10 | self.std = np.sqrt(self.S) 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.n += 1 15 | if self.n == 1: 16 | self.mean = x 17 | self.std = x 18 | else: 19 | old_mean = self.mean.copy() 20 | self.mean = old_mean + (x - old_mean) / self.n 21 | self.S = self.S + (x - old_mean) * (x - self.mean) 22 | self.std = np.sqrt(self.S / self.n) 23 | 24 | 25 | class Normalization: 26 | def __init__(self, shape): 27 | self.running_ms = RunningMeanStd(shape=shape) 28 | 29 | def __call__(self, x, update=True): 30 | # Whether to update the mean and std,during the evaluating,update=False 31 | if update: 32 | self.running_ms.update(x) 33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8) 34 | 35 | return x 36 | 37 | 38 | class RewardScaling: 39 | def __init__(self, shape, gamma): 40 | self.shape = shape # reward shape=1 41 | self.gamma = gamma # discount factor 42 | self.running_ms = RunningMeanStd(shape=self.shape) 43 | self.R = np.zeros(self.shape) 44 | 45 | def __call__(self, x): 46 | self.R = self.gamma * self.R + x 47 | self.running_ms.update(self.R) 48 | x = x / (self.running_ms.std + 1e-8) # Only divided std 49 | return x 50 | 51 | def reset(self): # When an episode is done,we should reset 'self.R' 52 | self.R = np.zeros(self.shape) 53 | -------------------------------------------------------------------------------- /4.PPO-discrete/ppo_discrete.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 6 | from torch.distributions import Categorical 7 | 8 | 9 | # Trick 8: orthogonal initialization 10 | def orthogonal_init(layer, gain=1.0): 11 | nn.init.orthogonal_(layer.weight, gain=gain) 12 | nn.init.constant_(layer.bias, 0) 13 | 14 | 15 | class Actor(nn.Module): 16 | def __init__(self, args): 17 | super(Actor, self).__init__() 18 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 19 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 20 | self.fc3 = nn.Linear(args.hidden_width, args.action_dim) 21 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 22 | 23 | if args.use_orthogonal_init: 24 | print("------use_orthogonal_init------") 25 | orthogonal_init(self.fc1) 26 | orthogonal_init(self.fc2) 27 | orthogonal_init(self.fc3, gain=0.01) 28 | 29 | def forward(self, s): 30 | s = self.activate_func(self.fc1(s)) 31 | s = self.activate_func(self.fc2(s)) 32 | a_prob = torch.softmax(self.fc3(s), dim=1) 33 | return a_prob 34 | 35 | 36 | class Critic(nn.Module): 37 | def __init__(self, args): 38 | super(Critic, self).__init__() 39 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 40 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 41 | self.fc3 = nn.Linear(args.hidden_width, 1) 42 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 43 | 44 | if args.use_orthogonal_init: 45 | print("------use_orthogonal_init------") 46 | orthogonal_init(self.fc1) 47 | orthogonal_init(self.fc2) 48 | orthogonal_init(self.fc3) 49 | 50 | def forward(self, s): 51 | s = self.activate_func(self.fc1(s)) 52 | s = self.activate_func(self.fc2(s)) 53 | v_s = self.fc3(s) 54 | return v_s 55 | 56 | 57 | class PPO_discrete: 58 | def __init__(self, args): 59 | self.batch_size = args.batch_size 60 | self.mini_batch_size = args.mini_batch_size 61 | self.max_train_steps = args.max_train_steps 62 | self.lr_a = args.lr_a # Learning rate of actor 63 | self.lr_c = args.lr_c # Learning rate of critic 64 | self.gamma = args.gamma # Discount factor 65 | self.lamda = args.lamda # GAE parameter 66 | self.epsilon = args.epsilon # PPO clip parameter 67 | self.K_epochs = args.K_epochs # PPO parameter 68 | self.entropy_coef = args.entropy_coef # Entropy coefficient 69 | self.set_adam_eps = args.set_adam_eps 70 | self.use_grad_clip = args.use_grad_clip 71 | self.use_lr_decay = args.use_lr_decay 72 | self.use_adv_norm = args.use_adv_norm 73 | 74 | self.actor = Actor(args) 75 | self.critic = Critic(args) 76 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5 77 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a, eps=1e-5) 78 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c, eps=1e-5) 79 | else: 80 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a) 81 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c) 82 | 83 | def evaluate(self, s): # When evaluating the policy, we select the action with the highest probability 84 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 85 | a_prob = self.actor(s).detach().numpy().flatten() 86 | a = np.argmax(a_prob) 87 | return a 88 | 89 | def choose_action(self, s): 90 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 91 | with 
torch.no_grad(): 92 | dist = Categorical(probs=self.actor(s)) 93 | a = dist.sample() 94 | a_logprob = dist.log_prob(a) 95 | return a.numpy()[0], a_logprob.numpy()[0] 96 | 97 | def update(self, replay_buffer, total_steps): 98 | s, a, a_logprob, r, s_, dw, done = replay_buffer.numpy_to_tensor() # Get training data 99 | """ 100 | Calculate the advantage using GAE 101 | 'dw=True' means dead or win, there is no next state s' 102 | 'done=True' represents the terminal of an episode(dead or win or reaching the max_episode_steps). When calculating the adv, if done=True, gae=0 103 | """ 104 | adv = [] 105 | gae = 0 106 | with torch.no_grad(): # adv and v_target have no gradient 107 | vs = self.critic(s) 108 | vs_ = self.critic(s_) 109 | deltas = r + self.gamma * (1.0 - dw) * vs_ - vs 110 | for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())): 111 | gae = delta + self.gamma * self.lamda * gae * (1.0 - d) 112 | adv.insert(0, gae) 113 | adv = torch.tensor(adv, dtype=torch.float).view(-1, 1) 114 | v_target = adv + vs 115 | if self.use_adv_norm: # Trick 1:advantage normalization 116 | adv = ((adv - adv.mean()) / (adv.std() + 1e-5)) 117 | 118 | # Optimize policy for K epochs: 119 | for _ in range(self.K_epochs): 120 | # Random sampling and no repetition. 'False' indicates that training will continue even if the number of samples in the last time is less than mini_batch_size 121 | for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.mini_batch_size, False): 122 | dist_now = Categorical(probs=self.actor(s[index])) 123 | dist_entropy = dist_now.entropy().view(-1, 1) # shape(mini_batch_size X 1) 124 | a_logprob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1) # shape(mini_batch_size X 1) 125 | # a/b=exp(log(a)-log(b)) 126 | ratios = torch.exp(a_logprob_now - a_logprob[index]) # shape(mini_batch_size X 1) 127 | 128 | surr1 = ratios * adv[index] # Only calculate the gradient of 'a_logprob_now' in ratios 129 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index] 130 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # shape(mini_batch_size X 1) 131 | # Update actor 132 | self.optimizer_actor.zero_grad() 133 | actor_loss.mean().backward() 134 | if self.use_grad_clip: # Trick 7: Gradient clip 135 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) 136 | self.optimizer_actor.step() 137 | 138 | v_s = self.critic(s[index]) 139 | critic_loss = F.mse_loss(v_target[index], v_s) 140 | # Update critic 141 | self.optimizer_critic.zero_grad() 142 | critic_loss.backward() 143 | if self.use_grad_clip: # Trick 7: Gradient clip 144 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 145 | self.optimizer_critic.step() 146 | 147 | if self.use_lr_decay: # Trick 6:learning rate Decay 148 | self.lr_decay(total_steps) 149 | 150 | def lr_decay(self, total_steps): 151 | lr_a_now = self.lr_a * (1 - total_steps / self.max_train_steps) 152 | lr_c_now = self.lr_c * (1 - total_steps / self.max_train_steps) 153 | for p in self.optimizer_actor.param_groups: 154 | p['lr'] = lr_a_now 155 | for p in self.optimizer_critic.param_groups: 156 | p['lr'] = lr_c_now 157 | -------------------------------------------------------------------------------- /4.PPO-discrete/replaybuffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ReplayBuffer: 6 | def __init__(self, args): 7 | self.s = np.zeros((args.batch_size, 
args.state_dim)) 8 | self.a = np.zeros((args.batch_size, 1)) 9 | self.a_logprob = np.zeros((args.batch_size, 1)) 10 | self.r = np.zeros((args.batch_size, 1)) 11 | self.s_ = np.zeros((args.batch_size, args.state_dim)) 12 | self.dw = np.zeros((args.batch_size, 1)) 13 | self.done = np.zeros((args.batch_size, 1)) 14 | self.count = 0 15 | 16 | def store(self, s, a, a_logprob, r, s_, dw, done): 17 | self.s[self.count] = s 18 | self.a[self.count] = a 19 | self.a_logprob[self.count] = a_logprob 20 | self.r[self.count] = r 21 | self.s_[self.count] = s_ 22 | self.dw[self.count] = dw 23 | self.done[self.count] = done 24 | self.count += 1 25 | 26 | def numpy_to_tensor(self): 27 | s = torch.tensor(self.s, dtype=torch.float) 28 | a = torch.tensor(self.a, dtype=torch.long) # In discrete action space, 'a' needs to be torch.long 29 | a_logprob = torch.tensor(self.a_logprob, dtype=torch.float) 30 | r = torch.tensor(self.r, dtype=torch.float) 31 | s_ = torch.tensor(self.s_, dtype=torch.float) 32 | dw = torch.tensor(self.dw, dtype=torch.float) 33 | done = torch.tensor(self.done, dtype=torch.float) 34 | 35 | return s, a, a_logprob, r, s_, dw, done 36 | -------------------------------------------------------------------------------- /4.PPO-discrete/runs/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to save the tensorboard data. -------------------------------------------------------------------------------- /4.PPO-discrete/training_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/training_result.png -------------------------------------------------------------------------------- /5.PPO-continuous/PPO_continuous_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.tensorboard import SummaryWriter 4 | import gym 5 | import argparse 6 | from normalization import Normalization, RewardScaling 7 | from replaybuffer import ReplayBuffer 8 | from ppo_continuous import PPO_continuous 9 | 10 | 11 | def evaluate_policy(args, env, agent, state_norm): 12 | times = 3 13 | evaluate_reward = 0 14 | for _ in range(times): 15 | s = env.reset() 16 | if args.use_state_norm: 17 | s = state_norm(s, update=False) # During the evaluating,update=False 18 | done = False 19 | episode_reward = 0 20 | while not done: 21 | a = agent.evaluate(s) # We use the deterministic policy during the evaluating 22 | if args.policy_dist == "Beta": 23 | action = 2 * (a - 0.5) * args.max_action # [0,1]->[-max,max] 24 | else: 25 | action = a 26 | s_, r, done, _ = env.step(action) 27 | if args.use_state_norm: 28 | s_ = state_norm(s_, update=False) 29 | episode_reward += r 30 | s = s_ 31 | evaluate_reward += episode_reward 32 | 33 | return evaluate_reward / times 34 | 35 | 36 | def main(args, env_name, number, seed): 37 | env = gym.make(env_name) 38 | env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment 39 | # Set random seed 40 | env.seed(seed) 41 | env.action_space.seed(seed) 42 | env_evaluate.seed(seed) 43 | env_evaluate.action_space.seed(seed) 44 | np.random.seed(seed) 45 | torch.manual_seed(seed) 46 | 47 | args.state_dim = env.observation_space.shape[0] 48 | args.action_dim = env.action_space.shape[0] 49 | args.max_action = float(env.action_space.high[0]) 50 | args.max_episode_steps = 
env._max_episode_steps # Maximum number of steps per episode 51 | print("env={}".format(env_name)) 52 | print("state_dim={}".format(args.state_dim)) 53 | print("action_dim={}".format(args.action_dim)) 54 | print("max_action={}".format(args.max_action)) 55 | print("max_episode_steps={}".format(args.max_episode_steps)) 56 | 57 | evaluate_num = 0 # Record the number of evaluations 58 | evaluate_rewards = [] # Record the rewards during the evaluating 59 | total_steps = 0 # Record the total steps during the training 60 | 61 | replay_buffer = ReplayBuffer(args) 62 | agent = PPO_continuous(args) 63 | 64 | # Build a tensorboard 65 | writer = SummaryWriter(log_dir='runs/PPO_continuous/env_{}_{}_number_{}_seed_{}'.format(env_name, args.policy_dist, number, seed)) 66 | 67 | state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization 68 | if args.use_reward_norm: # Trick 3:reward normalization 69 | reward_norm = Normalization(shape=1) 70 | elif args.use_reward_scaling: # Trick 4:reward scaling 71 | reward_scaling = RewardScaling(shape=1, gamma=args.gamma) 72 | 73 | while total_steps < args.max_train_steps: 74 | s = env.reset() 75 | if args.use_state_norm: 76 | s = state_norm(s) 77 | if args.use_reward_scaling: 78 | reward_scaling.reset() 79 | episode_steps = 0 80 | done = False 81 | while not done: 82 | episode_steps += 1 83 | a, a_logprob = agent.choose_action(s) # Action and the corresponding log probability 84 | if args.policy_dist == "Beta": 85 | action = 2 * (a - 0.5) * args.max_action # [0,1]->[-max,max] 86 | else: 87 | action = a 88 | s_, r, done, _ = env.step(action) 89 | 90 | if args.use_state_norm: 91 | s_ = state_norm(s_) 92 | if args.use_reward_norm: 93 | r = reward_norm(r) 94 | elif args.use_reward_scaling: 95 | r = reward_scaling(r) 96 | 97 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 98 | # dw means dead or win,there is no next state s'; 99 | # but when reaching the max_episode_steps,there is a next state s' actually. 
100 | if done and episode_steps != args.max_episode_steps: 101 | dw = True 102 | else: 103 | dw = False 104 | 105 | # Take the 'action',but store the original 'a'(especially for Beta) 106 | replay_buffer.store(s, a, a_logprob, r, s_, dw, done) 107 | s = s_ 108 | total_steps += 1 109 | 110 | # When the number of transitions in buffer reaches batch_size,then update 111 | if replay_buffer.count == args.batch_size: 112 | agent.update(replay_buffer, total_steps) 113 | replay_buffer.count = 0 114 | 115 | # Evaluate the policy every 'evaluate_freq' steps 116 | if total_steps % args.evaluate_freq == 0: 117 | evaluate_num += 1 118 | evaluate_reward = evaluate_policy(args, env_evaluate, agent, state_norm) 119 | evaluate_rewards.append(evaluate_reward) 120 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 121 | writer.add_scalar('step_rewards_{}'.format(env_name), evaluate_rewards[-1], global_step=total_steps) 122 | # Save the rewards 123 | if evaluate_num % args.save_freq == 0: 124 | np.save('./data_train/PPO_continuous_{}_env_{}_number_{}_seed_{}.npy'.format(args.policy_dist, env_name, number, seed), np.array(evaluate_rewards)) 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = argparse.ArgumentParser("Hyperparameters Setting for PPO-continuous") 129 | parser.add_argument("--max_train_steps", type=int, default=int(3e6), help=" Maximum number of training steps") 130 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps") 131 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency") 132 | parser.add_argument("--policy_dist", type=str, default="Gaussian", help="Beta or Gaussian") 133 | parser.add_argument("--batch_size", type=int, default=2048, help="Batch size") 134 | parser.add_argument("--mini_batch_size", type=int, default=64, help="Minibatch size") 135 | parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network") 136 | parser.add_argument("--lr_a", type=float, default=3e-4, help="Learning rate of actor") 137 | parser.add_argument("--lr_c", type=float, default=3e-4, help="Learning rate of critic") 138 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 139 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter") 140 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter") 141 | parser.add_argument("--K_epochs", type=int, default=10, help="PPO parameter") 142 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization") 143 | parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2:state normalization") 144 | parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3:reward normalization") 145 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling") 146 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy") 147 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay") 148 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip") 149 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization") 150 | parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5") 151 | 
parser.add_argument("--use_tanh", type=float, default=True, help="Trick 10: tanh activation function") 152 | 153 | args = parser.parse_args() 154 | 155 | env_name = ['BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 156 | env_index = 1 157 | main(args, env_name=env_name[env_index], number=1, seed=10) 158 | -------------------------------------------------------------------------------- /5.PPO-continuous/README.md: -------------------------------------------------------------------------------- 1 | # PPO-continuous 2 | This is a concise Pytorch implementation of PPO on continuous action space with 10 tricks.
3 | 4 | ## 10 tricks 5 | Trick 1—Advantage Normalization.
6 | Trick 2—State Normalization.
7 | Trick 3 & Trick 4—Reward Normalization & Reward Scaling.<br />
8 | Trick 5—Policy Entropy.
9 | Trick 6—Learning Rate Decay.
10 | Trick 7—Gradient clip.
11 | Trick 8—Orthogonal Initialization.
12 | Trick 9—Adam Optimizer Epsilon Parameter.
13 | Trick 10—Tanh Activation Function.<br />
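Most of these tricks are only a few lines of code. For example, a minimal sketch of Trick 1 (advantage normalization) and Trick 8 (orthogonal initialization), mirroring the helpers used in 'ppo_continuous.py' (function names here are illustrative), is:

```python
import torch
import torch.nn as nn

def orthogonal_init(layer, gain=1.0):
    # Trick 8: orthogonal weight initialization with zero bias
    nn.init.orthogonal_(layer.weight, gain=gain)
    nn.init.constant_(layer.bias, 0)

def normalize_advantage(adv, eps=1e-5):
    # Trick 1: zero-mean / unit-std advantage normalization within a batch
    return (adv - adv.mean()) / (adv.std() + eps)
```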
14 | 15 | ## How to use my code? 16 | You can directly run 'PPO_continuous_main.py' in your own IDE.<br />
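Note that the script supports two policy distributions, selected with '--policy_dist' ("Beta" or "Gaussian"). With the Beta policy, actions are sampled in [0, 1] and rescaled to the environment's action range before 'env.step()'; a small illustrative helper (the function name is ours, the formula is the one used in 'PPO_continuous_main.py') is:

```python
def beta_action_to_env(a, max_action):
    # Map a Beta-distributed action from [0, 1] to [-max_action, max_action]
    return 2 * (a - 0.5) * max_action
```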
17 | 18 | ## Training environments 19 | You can set the 'env_index' in the code to change the environments. Here, we train our code in 4 environments.<br />
20 | env_index=0 represents 'BipedalWalker-v3'<br />
21 | env_index=1 represents 'HalfCheetah-v2'<br />
22 | env_index=2 represents 'Hopper-v2'<br />
23 | env_index=3 represents 'Walker2d-v2'<br />
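For example, to train on 'Hopper-v2' you would change 'env_index' at the bottom of 'PPO_continuous_main.py' (an illustrative snippet mirroring the script):

```python
import gym

env_name = ['BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
env_index = 2                         # 2 -> 'Hopper-v2'
env = gym.make(env_name[env_index])   # the script also builds a separate environment for evaluation
```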
24 | 25 | ## Trainning result 26 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/5.PPO-continuous/training_result.png) 27 | 28 | ## Tutorial 29 | If you can read Chinese, you can get more information from this blog.https://zhuanlan.zhihu.com/p/512327050 30 | -------------------------------------------------------------------------------- /5.PPO-continuous/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /5.PPO-continuous/__pycache__/ppo_continuous.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/ppo_continuous.cpython-37.pyc -------------------------------------------------------------------------------- /5.PPO-continuous/__pycache__/replaybuffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/replaybuffer.cpython-37.pyc -------------------------------------------------------------------------------- /5.PPO-continuous/data_train/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to store the training reward data. -------------------------------------------------------------------------------- /5.PPO-continuous/normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd: 5 | # Dynamically calculate mean and std 6 | def __init__(self, shape): # shape:the dimension of input data 7 | self.n = 0 8 | self.mean = np.zeros(shape) 9 | self.S = np.zeros(shape) 10 | self.std = np.sqrt(self.S) 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.n += 1 15 | if self.n == 1: 16 | self.mean = x 17 | self.std = x 18 | else: 19 | old_mean = self.mean.copy() 20 | self.mean = old_mean + (x - old_mean) / self.n 21 | self.S = self.S + (x - old_mean) * (x - self.mean) 22 | self.std = np.sqrt(self.S / self.n) 23 | 24 | 25 | class Normalization: 26 | def __init__(self, shape): 27 | self.running_ms = RunningMeanStd(shape=shape) 28 | 29 | def __call__(self, x, update=True): 30 | # Whether to update the mean and std,during the evaluating,update=False 31 | if update: 32 | self.running_ms.update(x) 33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8) 34 | 35 | return x 36 | 37 | 38 | class RewardScaling: 39 | def __init__(self, shape, gamma): 40 | self.shape = shape # reward shape=1 41 | self.gamma = gamma # discount factor 42 | self.running_ms = RunningMeanStd(shape=self.shape) 43 | self.R = np.zeros(self.shape) 44 | 45 | def __call__(self, x): 46 | self.R = self.gamma * self.R + x 47 | self.running_ms.update(self.R) 48 | x = x / (self.running_ms.std + 1e-8) # Only divided std 49 | return x 50 | 51 | def reset(self): # When an episode is done,we should reset 'self.R' 52 | self.R = np.zeros(self.shape) 53 | -------------------------------------------------------------------------------- /5.PPO-continuous/ppo_continuous.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 4 | import torch.nn as nn 5 | from torch.distributions import Beta, Normal 6 | 7 | 8 | # Trick 8: orthogonal initialization 9 | def orthogonal_init(layer, gain=1.0): 10 | nn.init.orthogonal_(layer.weight, gain=gain) 11 | nn.init.constant_(layer.bias, 0) 12 | 13 | 14 | class Actor_Beta(nn.Module): 15 | def __init__(self, args): 16 | super(Actor_Beta, self).__init__() 17 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 18 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 19 | self.alpha_layer = nn.Linear(args.hidden_width, args.action_dim) 20 | self.beta_layer = nn.Linear(args.hidden_width, args.action_dim) 21 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 22 | 23 | if args.use_orthogonal_init: 24 | print("------use_orthogonal_init------") 25 | orthogonal_init(self.fc1) 26 | orthogonal_init(self.fc2) 27 | orthogonal_init(self.alpha_layer, gain=0.01) 28 | orthogonal_init(self.beta_layer, gain=0.01) 29 | 30 | def forward(self, s): 31 | s = self.activate_func(self.fc1(s)) 32 | s = self.activate_func(self.fc2(s)) 33 | # alpha and beta need to be larger than 1,so we use 'softplus' as the activation function and then plus 1 34 | alpha = F.softplus(self.alpha_layer(s)) + 1.0 35 | beta = F.softplus(self.beta_layer(s)) + 1.0 36 | return alpha, beta 37 | 38 | def get_dist(self, s): 39 | alpha, beta = self.forward(s) 40 | dist = Beta(alpha, beta) 41 | return dist 42 | 43 | def mean(self, s): 44 | alpha, beta = self.forward(s) 45 | mean = alpha / (alpha + beta) # The mean of the beta distribution 46 | return mean 47 | 48 | 49 | class Actor_Gaussian(nn.Module): 50 | def __init__(self, args): 51 | super(Actor_Gaussian, self).__init__() 52 | self.max_action = args.max_action 53 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 54 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 55 | self.mean_layer = nn.Linear(args.hidden_width, args.action_dim) 56 | self.log_std = nn.Parameter(torch.zeros(1, args.action_dim)) # We use 'nn.Parameter' to train log_std automatically 57 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 58 | 59 | if args.use_orthogonal_init: 60 | print("------use_orthogonal_init------") 61 | orthogonal_init(self.fc1) 62 | orthogonal_init(self.fc2) 63 | orthogonal_init(self.mean_layer, gain=0.01) 64 | 65 | def forward(self, s): 66 | s = self.activate_func(self.fc1(s)) 67 | s = self.activate_func(self.fc2(s)) 68 | mean = self.max_action * torch.tanh(self.mean_layer(s)) # [-1,1]->[-max_action,max_action] 69 | return mean 70 | 71 | def get_dist(self, s): 72 | mean = self.forward(s) 73 | log_std = self.log_std.expand_as(mean) # To make 'log_std' have the same dimension as 'mean' 74 | std = torch.exp(log_std) # The reason we train the 'log_std' is to ensure std=exp(log_std)>0 75 | dist = Normal(mean, std) # Get the Gaussian distribution 76 | return dist 77 | 78 | 79 | class Critic(nn.Module): 80 | def __init__(self, args): 81 | super(Critic, self).__init__() 82 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 83 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 84 | self.fc3 = nn.Linear(args.hidden_width, 1) 85 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 86 | 87 | if args.use_orthogonal_init: 88 | 
print("------use_orthogonal_init------") 89 | orthogonal_init(self.fc1) 90 | orthogonal_init(self.fc2) 91 | orthogonal_init(self.fc3) 92 | 93 | def forward(self, s): 94 | s = self.activate_func(self.fc1(s)) 95 | s = self.activate_func(self.fc2(s)) 96 | v_s = self.fc3(s) 97 | return v_s 98 | 99 | 100 | class PPO_continuous(): 101 | def __init__(self, args): 102 | self.policy_dist = args.policy_dist 103 | self.max_action = args.max_action 104 | self.batch_size = args.batch_size 105 | self.mini_batch_size = args.mini_batch_size 106 | self.max_train_steps = args.max_train_steps 107 | self.lr_a = args.lr_a # Learning rate of actor 108 | self.lr_c = args.lr_c # Learning rate of critic 109 | self.gamma = args.gamma # Discount factor 110 | self.lamda = args.lamda # GAE parameter 111 | self.epsilon = args.epsilon # PPO clip parameter 112 | self.K_epochs = args.K_epochs # PPO parameter 113 | self.entropy_coef = args.entropy_coef # Entropy coefficient 114 | self.set_adam_eps = args.set_adam_eps 115 | self.use_grad_clip = args.use_grad_clip 116 | self.use_lr_decay = args.use_lr_decay 117 | self.use_adv_norm = args.use_adv_norm 118 | 119 | if self.policy_dist == "Beta": 120 | self.actor = Actor_Beta(args) 121 | else: 122 | self.actor = Actor_Gaussian(args) 123 | self.critic = Critic(args) 124 | 125 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5 126 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a, eps=1e-5) 127 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c, eps=1e-5) 128 | else: 129 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a) 130 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c) 131 | 132 | def evaluate(self, s): # When evaluating the policy, we only use the mean 133 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 134 | if self.policy_dist == "Beta": 135 | a = self.actor.mean(s).detach().numpy().flatten() 136 | else: 137 | a = self.actor(s).detach().numpy().flatten() 138 | return a 139 | 140 | def choose_action(self, s): 141 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 142 | if self.policy_dist == "Beta": 143 | with torch.no_grad(): 144 | dist = self.actor.get_dist(s) 145 | a = dist.sample() # Sample the action according to the probability distribution 146 | a_logprob = dist.log_prob(a) # The log probability density of the action 147 | else: 148 | with torch.no_grad(): 149 | dist = self.actor.get_dist(s) 150 | a = dist.sample() # Sample the action according to the probability distribution 151 | a = torch.clamp(a, -self.max_action, self.max_action) # [-max,max] 152 | a_logprob = dist.log_prob(a) # The log probability density of the action 153 | return a.numpy().flatten(), a_logprob.numpy().flatten() 154 | 155 | def update(self, replay_buffer, total_steps): 156 | s, a, a_logprob, r, s_, dw, done = replay_buffer.numpy_to_tensor() # Get training data 157 | """ 158 | Calculate the advantage using GAE 159 | 'dw=True' means dead or win, there is no next state s' 160 | 'done=True' represents the terminal of an episode(dead or win or reaching the max_episode_steps). 
When calculating the adv, if done=True, gae=0 161 | """ 162 | adv = [] 163 | gae = 0 164 | with torch.no_grad(): # adv and v_target have no gradient 165 | vs = self.critic(s) 166 | vs_ = self.critic(s_) 167 | deltas = r + self.gamma * (1.0 - dw) * vs_ - vs 168 | for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())): 169 | gae = delta + self.gamma * self.lamda * gae * (1.0 - d) 170 | adv.insert(0, gae) 171 | adv = torch.tensor(adv, dtype=torch.float).view(-1, 1) 172 | v_target = adv + vs 173 | if self.use_adv_norm: # Trick 1:advantage normalization 174 | adv = ((adv - adv.mean()) / (adv.std() + 1e-5)) 175 | 176 | # Optimize policy for K epochs: 177 | for _ in range(self.K_epochs): 178 | # Random sampling and no repetition. 'False' indicates that training will continue even if the number of samples in the last time is less than mini_batch_size 179 | for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.mini_batch_size, False): 180 | dist_now = self.actor.get_dist(s[index]) 181 | dist_entropy = dist_now.entropy().sum(1, keepdim=True) # shape(mini_batch_size X 1) 182 | a_logprob_now = dist_now.log_prob(a[index]) 183 | # a/b=exp(log(a)-log(b)) In multi-dimensional continuous action space,we need to sum up the log_prob 184 | ratios = torch.exp(a_logprob_now.sum(1, keepdim=True) - a_logprob[index].sum(1, keepdim=True)) # shape(mini_batch_size X 1) 185 | 186 | surr1 = ratios * adv[index] # Only calculate the gradient of 'a_logprob_now' in ratios 187 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index] 188 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # Trick 5: policy entropy 189 | # Update actor 190 | self.optimizer_actor.zero_grad() 191 | actor_loss.mean().backward() 192 | if self.use_grad_clip: # Trick 7: Gradient clip 193 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) 194 | self.optimizer_actor.step() 195 | 196 | v_s = self.critic(s[index]) 197 | critic_loss = F.mse_loss(v_target[index], v_s) 198 | # Update critic 199 | self.optimizer_critic.zero_grad() 200 | critic_loss.backward() 201 | if self.use_grad_clip: # Trick 7: Gradient clip 202 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 203 | self.optimizer_critic.step() 204 | 205 | if self.use_lr_decay: # Trick 6:learning rate Decay 206 | self.lr_decay(total_steps) 207 | 208 | def lr_decay(self, total_steps): 209 | lr_a_now = self.lr_a * (1 - total_steps / self.max_train_steps) 210 | lr_c_now = self.lr_c * (1 - total_steps / self.max_train_steps) 211 | for p in self.optimizer_actor.param_groups: 212 | p['lr'] = lr_a_now 213 | for p in self.optimizer_critic.param_groups: 214 | p['lr'] = lr_c_now 215 | -------------------------------------------------------------------------------- /5.PPO-continuous/replaybuffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ReplayBuffer: 6 | def __init__(self, args): 7 | self.s = np.zeros((args.batch_size, args.state_dim)) 8 | self.a = np.zeros((args.batch_size, args.action_dim)) 9 | self.a_logprob = np.zeros((args.batch_size, args.action_dim)) 10 | self.r = np.zeros((args.batch_size, 1)) 11 | self.s_ = np.zeros((args.batch_size, args.state_dim)) 12 | self.dw = np.zeros((args.batch_size, 1)) 13 | self.done = np.zeros((args.batch_size, 1)) 14 | self.count = 0 15 | 16 | def store(self, s, a, a_logprob, r, s_, dw, done): 17 | self.s[self.count] = s 18 | self.a[self.count] = a 
19 | self.a_logprob[self.count] = a_logprob 20 | self.r[self.count] = r 21 | self.s_[self.count] = s_ 22 | self.dw[self.count] = dw 23 | self.done[self.count] = done 24 | self.count += 1 25 | 26 | def numpy_to_tensor(self): 27 | s = torch.tensor(self.s, dtype=torch.float) 28 | a = torch.tensor(self.a, dtype=torch.float) 29 | a_logprob = torch.tensor(self.a_logprob, dtype=torch.float) 30 | r = torch.tensor(self.r, dtype=torch.float) 31 | s_ = torch.tensor(self.s_, dtype=torch.float) 32 | dw = torch.tensor(self.dw, dtype=torch.float) 33 | done = torch.tensor(self.done, dtype=torch.float) 34 | 35 | return s, a, a_logprob, r, s_, dw, done 36 | -------------------------------------------------------------------------------- /5.PPO-continuous/runs/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to save the tensorboard data. -------------------------------------------------------------------------------- /5.PPO-continuous/training_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/training_result.png -------------------------------------------------------------------------------- /6.DDPG/DDPG.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import copy 7 | from torch.utils.tensorboard import SummaryWriter 8 | 9 | 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim, action_dim, hidden_width, max_action): 12 | super(Actor, self).__init__() 13 | self.max_action = max_action 14 | self.l1 = nn.Linear(state_dim, hidden_width) 15 | self.l2 = nn.Linear(hidden_width, hidden_width) 16 | self.l3 = nn.Linear(hidden_width, action_dim) 17 | 18 | def forward(self, s): 19 | s = F.relu(self.l1(s)) 20 | s = F.relu(self.l2(s)) 21 | a = self.max_action * torch.tanh(self.l3(s)) # [-max,max] 22 | return a 23 | 24 | 25 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a) 26 | def __init__(self, state_dim, action_dim, hidden_width): 27 | super(Critic, self).__init__() 28 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width) 29 | self.l2 = nn.Linear(hidden_width, hidden_width) 30 | self.l3 = nn.Linear(hidden_width, 1) 31 | 32 | def forward(self, s, a): 33 | q = F.relu(self.l1(torch.cat([s, a], 1))) 34 | q = F.relu(self.l2(q)) 35 | q = self.l3(q) 36 | return q 37 | 38 | 39 | class ReplayBuffer(object): 40 | def __init__(self, state_dim, action_dim): 41 | self.max_size = int(1e6) 42 | self.count = 0 43 | self.size = 0 44 | self.s = np.zeros((self.max_size, state_dim)) 45 | self.a = np.zeros((self.max_size, action_dim)) 46 | self.r = np.zeros((self.max_size, 1)) 47 | self.s_ = np.zeros((self.max_size, state_dim)) 48 | self.dw = np.zeros((self.max_size, 1)) 49 | 50 | def store(self, s, a, r, s_, dw): 51 | self.s[self.count] = s 52 | self.a[self.count] = a 53 | self.r[self.count] = r 54 | self.s_[self.count] = s_ 55 | self.dw[self.count] = dw 56 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0. 
57 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions 58 | 59 | def sample(self, batch_size): 60 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling 61 | batch_s = torch.tensor(self.s[index], dtype=torch.float) 62 | batch_a = torch.tensor(self.a[index], dtype=torch.float) 63 | batch_r = torch.tensor(self.r[index], dtype=torch.float) 64 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float) 65 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float) 66 | 67 | return batch_s, batch_a, batch_r, batch_s_, batch_dw 68 | 69 | 70 | class DDPG(object): 71 | def __init__(self, state_dim, action_dim, max_action): 72 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network 73 | self.batch_size = 256 # batch size 74 | self.GAMMA = 0.99 # discount factor 75 | self.TAU = 0.005 # Softly update the target network 76 | self.lr = 3e-4 # learning rate 77 | 78 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action) 79 | self.actor_target = copy.deepcopy(self.actor) 80 | self.critic = Critic(state_dim, action_dim, self.hidden_width) 81 | self.critic_target = copy.deepcopy(self.critic) 82 | 83 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr) 84 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 85 | 86 | self.MseLoss = nn.MSELoss() 87 | 88 | def choose_action(self, s): 89 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 90 | a = self.actor(s).data.numpy().flatten() 91 | return a 92 | 93 | def learn(self, relay_buffer): 94 | batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size) # Sample a batch 95 | 96 | # Compute the target Q 97 | with torch.no_grad(): # target_Q has no gradient 98 | Q_ = self.critic_target(batch_s_, self.actor_target(batch_s_)) 99 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * Q_ 100 | 101 | # Compute the current Q and the critic loss 102 | current_Q = self.critic(batch_s, batch_a) 103 | critic_loss = self.MseLoss(target_Q, current_Q) 104 | # Optimize the critic 105 | self.critic_optimizer.zero_grad() 106 | critic_loss.backward() 107 | self.critic_optimizer.step() 108 | 109 | # Freeze critic networks so you don't waste computational effort 110 | for params in self.critic.parameters(): 111 | params.requires_grad = False 112 | 113 | # Compute the actor loss 114 | actor_loss = -self.critic(batch_s, self.actor(batch_s)).mean() 115 | # Optimize the actor 116 | self.actor_optimizer.zero_grad() 117 | actor_loss.backward() 118 | self.actor_optimizer.step() 119 | 120 | # Unfreeze critic networks 121 | for params in self.critic.parameters(): 122 | params.requires_grad = True 123 | 124 | # Softly update the target networks 125 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 126 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 127 | 128 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 129 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 130 | 131 | 132 | def evaluate_policy(env, agent): 133 | times = 3 # Perform three evaluations and calculate the average 134 | evaluate_reward = 0 135 | for _ in range(times): 136 | s = env.reset() 137 | done = False 138 | episode_reward = 0 139 | while not done: 140 | a = agent.choose_action(s) # We do not add noise when evaluating 141 | s_, r, done, _ = env.step(a) 142 | episode_reward += r 
143 | s = s_ 144 | evaluate_reward += episode_reward 145 | 146 | return int(evaluate_reward / times) 147 | 148 | 149 | def reward_adapter(r, env_index): 150 | if env_index == 0: # Pendulum-v1 151 | r = (r + 8) / 8 152 | elif env_index == 1: # BipedalWalker-v3 153 | if r <= -100: 154 | r = -1 155 | return r 156 | 157 | 158 | if __name__ == '__main__': 159 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 160 | env_index = 0 161 | env = gym.make(env_name[env_index]) 162 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 163 | number = 1 164 | # Set random seed 165 | seed = 0 166 | env.seed(seed) 167 | env.action_space.seed(seed) 168 | env_evaluate.seed(seed) 169 | env_evaluate.action_space.seed(seed) 170 | np.random.seed(seed) 171 | torch.manual_seed(seed) 172 | 173 | state_dim = env.observation_space.shape[0] 174 | action_dim = env.action_space.shape[0] 175 | max_action = float(env.action_space.high[0]) 176 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 177 | print("env={}".format(env_name[env_index])) 178 | print("state_dim={}".format(state_dim)) 179 | print("action_dim={}".format(action_dim)) 180 | print("max_action={}".format(max_action)) 181 | print("max_episode_steps={}".format(max_episode_steps)) 182 | 183 | agent = DDPG(state_dim, action_dim, max_action) 184 | replay_buffer = ReplayBuffer(state_dim, action_dim) 185 | # Build a tensorboard 186 | writer = SummaryWriter(log_dir='runs/DDPG/DDPG_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) 187 | 188 | noise_std = 0.1 * max_action # the std of Gaussian noise for exploration 189 | max_train_steps = 3e6 # Maximum number of training steps 190 | random_steps = 25e3 # Take the random actions in the beginning for the better exploration 191 | update_freq = 50 # Take 50 steps,then update the networks 50 times 192 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 193 | evaluate_num = 0 # Record the number of evaluations 194 | evaluate_rewards = [] # Record the rewards during the evaluating 195 | total_steps = 0 # Record the total steps during the training 196 | 197 | while total_steps < max_train_steps: 198 | s = env.reset() 199 | episode_steps = 0 200 | done = False 201 | while not done: 202 | episode_steps += 1 203 | if total_steps < random_steps: # Take the random actions in the beginning for the better exploration 204 | a = env.action_space.sample() 205 | else: 206 | # Add Gaussian noise to actions for exploration 207 | a = agent.choose_action(s) 208 | a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action) 209 | s_, r, done, _ = env.step(a) 210 | r = reward_adapter(r, env_index) # Adjust rewards for better performance 211 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 212 | # dw means dead or win,there is no next state s'; 213 | # but when reaching the max_episode_steps,there is a next state s' actually. 
214 | if done and episode_steps != max_episode_steps: 215 | dw = True 216 | else: 217 | dw = False 218 | replay_buffer.store(s, a, r, s_, dw) # Store the transition 219 | s = s_ 220 | 221 | # Take 50 steps,then update the networks 50 times 222 | if total_steps >= random_steps and total_steps % update_freq == 0: 223 | for _ in range(update_freq): 224 | agent.learn(replay_buffer) 225 | 226 | # Evaluate the policy every 'evaluate_freq' steps 227 | if (total_steps + 1) % evaluate_freq == 0: 228 | evaluate_num += 1 229 | evaluate_reward = evaluate_policy(env_evaluate, agent) 230 | evaluate_rewards.append(evaluate_reward) 231 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward)) 232 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 233 | # Save the rewards 234 | if evaluate_num % 10 == 0: 235 | np.save('./data_train/DDPG_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 236 | 237 | total_steps += 1 238 | -------------------------------------------------------------------------------- /7.TD3/README.md: -------------------------------------------------------------------------------- 1 | # TD3 2 | This is a concise Pytorch implementation of TD3(Twin Delayed DDPG) on continuous action space.
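Like DDPG, TD3 keeps slowly moving target copies of the actor and critic that are updated by Polyak averaging after each learning step (TAU = 0.005 in 'TD3.py'). A minimal sketch of that update, assuming two 'nn.Module' instances with matching parameters, is:

```python
def soft_update(net, target_net, tau=0.005):
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
```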
3 | 4 | 5 | ## How to use my code? 6 | You can directly run 'TD3.py' in your own IDE.<br />
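During training, after the first 'random_steps' of uniformly random actions, 'TD3.py' adds Gaussian noise to the actor's output for exploration (the noise std is 0.1 * max_action). A minimal sketch, with a function name chosen here for illustration, is:

```python
import numpy as np

def add_exploration_noise(a, max_action, noise_std):
    # Add Gaussian exploration noise and clip back to the valid action range
    return np.clip(a + np.random.normal(0, noise_std, size=a.shape), -max_action, max_action)
```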
7 | 8 | ### Training environments 9 | You can set the 'env_index' in the code to change the environments. Here, we train our code in 5 environments.<br />
10 | env_index=0 represents 'Pendulum-v1'<br />
11 | env_index=1 represents 'BipedalWalker-v3'<br />
12 | env_index=2 represents 'HalfCheetah-v2'<br />
13 | env_index=3 represents 'Hopper-v2'<br />
14 | env_index=4 represents 'Walker2d-v2'<br />
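For Pendulum-v1 and BipedalWalker-v3 the script also reshapes the raw reward before storing each transition (the 'reward_adapter' function in 'TD3.py' and 'DDPG.py'); the logic, with explanatory comments added here, is:

```python
def reward_adapter(r, env_index):
    if env_index == 0:        # Pendulum-v1: shift/scale rewards from roughly [-16, 0] to roughly [-1, 1]
        r = (r + 8) / 8
    elif env_index == 1:      # BipedalWalker-v3: soften the -100 penalty for falling
        if r <= -100:
            r = -1
    return r
```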
15 | 16 | ### How to see the training results? 17 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.<br />
18 | The reward data are saved as numpy arrays in the 'data_train' folder.<br />
19 | The training curves shown below are smoothed by averaging over a window of 10 steps.<br />
20 | The solid line and the shaded region represent the mean and standard deviation, respectively, over three different random seeds (seed=0, 10, 100).<br />
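If you want to post-process the results yourself, the saved '.npy' files can be loaded and smoothed with the same 10-step moving average; a short example (the file name follows the repository's naming pattern) is:

```python
import numpy as np

rewards = np.load('./data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy')
window = 10
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')  # 10-step moving average
```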
21 | 22 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/7.TD3/TD3_result.png) 23 | 24 | ## Reference 25 | [1] Fujimoto S, van Hoof H, Meger D. Addressing Function Approximation Error in Actor-Critic Methods. International Conference on Machine Learning, PMLR, 2018: 1587-1596.<br />
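The three tricks from [1] appear directly in 'TD3.learn()' below: target policy smoothing, clipped double Q-learning, and delayed policy updates. A minimal sketch of how the first two combine into the critic target (not the file itself; the target networks and tensor shapes are assumed to match those in 'TD3.py') is:

```python
import torch

def td3_critic_target(critic_target, actor_target, s_, r, dw, gamma=0.99,
                      policy_noise=0.2, noise_clip=0.5, max_action=1.0):
    with torch.no_grad():
        # Trick 1: target policy smoothing, perturb the target action with clipped Gaussian noise
        a_ = actor_target(s_)
        noise = (torch.randn_like(a_) * policy_noise).clamp(-noise_clip, noise_clip)
        a_ = (a_ + noise).clamp(-max_action, max_action)
        # Trick 2: clipped double Q-learning, bootstrap from the smaller of the two target critics
        q1, q2 = critic_target(s_, a_)
        return r + gamma * (1 - dw) * torch.min(q1, q2)
```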
26 | -------------------------------------------------------------------------------- /7.TD3/TD3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import copy 7 | from torch.utils.tensorboard import SummaryWriter 8 | 9 | 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim, action_dim, hidden_width, max_action): 12 | super(Actor, self).__init__() 13 | self.max_action = max_action 14 | self.l1 = nn.Linear(state_dim, hidden_width) 15 | self.l2 = nn.Linear(hidden_width, hidden_width) 16 | self.l3 = nn.Linear(hidden_width, action_dim) 17 | 18 | def forward(self, s): 19 | s = F.relu(self.l1(s)) 20 | s = F.relu(self.l2(s)) 21 | a = self.max_action * torch.tanh(self.l3(s)) # [-max,max] 22 | return a 23 | 24 | 25 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a) 26 | def __init__(self, state_dim, action_dim, hidden_width): 27 | super(Critic, self).__init__() 28 | # Q1 29 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width) 30 | self.l2 = nn.Linear(hidden_width, hidden_width) 31 | self.l3 = nn.Linear(hidden_width, 1) 32 | # Q2 33 | self.l4 = nn.Linear(state_dim + action_dim, hidden_width) 34 | self.l5 = nn.Linear(hidden_width, hidden_width) 35 | self.l6 = nn.Linear(hidden_width, 1) 36 | 37 | def forward(self, s, a): 38 | s_a = torch.cat([s, a], 1) 39 | q1 = F.relu(self.l1(s_a)) 40 | q1 = F.relu(self.l2(q1)) 41 | q1 = self.l3(q1) 42 | 43 | q2 = F.relu(self.l4(s_a)) 44 | q2 = F.relu(self.l5(q2)) 45 | q2 = self.l6(q2) 46 | 47 | return q1, q2 48 | 49 | def Q1(self, s, a): 50 | s_a = torch.cat([s, a], 1) 51 | q1 = F.relu(self.l1(s_a)) 52 | q1 = F.relu(self.l2(q1)) 53 | q1 = self.l3(q1) 54 | 55 | return q1 56 | 57 | 58 | class ReplayBuffer(object): 59 | def __init__(self, state_dim, action_dim): 60 | self.max_size = int(1e6) 61 | self.count = 0 62 | self.size = 0 63 | self.s = np.zeros((self.max_size, state_dim)) 64 | self.a = np.zeros((self.max_size, action_dim)) 65 | self.r = np.zeros((self.max_size, 1)) 66 | self.s_ = np.zeros((self.max_size, state_dim)) 67 | self.dw = np.zeros((self.max_size, 1)) 68 | 69 | def store(self, s, a, r, s_, dw): 70 | self.s[self.count] = s 71 | self.a[self.count] = a 72 | self.r[self.count] = r 73 | self.s_[self.count] = s_ 74 | self.dw[self.count] = dw 75 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0. 
76 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions 77 | 78 | def sample(self, batch_size): 79 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling 80 | batch_s = torch.tensor(self.s[index], dtype=torch.float) 81 | batch_a = torch.tensor(self.a[index], dtype=torch.float) 82 | batch_r = torch.tensor(self.r[index], dtype=torch.float) 83 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float) 84 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float) 85 | 86 | return batch_s, batch_a, batch_r, batch_s_, batch_dw 87 | 88 | 89 | class TD3(object): 90 | def __init__(self, state_dim, action_dim, max_action): 91 | self.max_action = max_action 92 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network 93 | self.batch_size = 256 # batch size 94 | self.GAMMA = 0.99 # discount factor 95 | self.TAU = 0.005 # Softly update the target network 96 | self.lr = 3e-4 # learning rate 97 | self.policy_noise = 0.2 * max_action # The noise for the trick 'target policy smoothing' 98 | self.noise_clip = 0.5 * max_action # Clip the noise 99 | self.policy_freq = 2 # The frequency of policy updates 100 | self.actor_pointer = 0 101 | 102 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action) 103 | self.actor_target = copy.deepcopy(self.actor) 104 | self.critic = Critic(state_dim, action_dim, self.hidden_width) 105 | self.critic_target = copy.deepcopy(self.critic) 106 | 107 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr) 108 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 109 | 110 | def choose_action(self, s): 111 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 112 | a = self.actor(s).data.numpy().flatten() 113 | return a 114 | 115 | def learn(self, relay_buffer): 116 | self.actor_pointer += 1 117 | batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size) # Sample a batch 118 | 119 | # Compute the target Q 120 | with torch.no_grad(): # target_Q has no gradient 121 | # Trick 1:target policy smoothing 122 | # torch.randn_like can generate random numbers sampled from N(0,1),which have the same size as 'batch_a' 123 | noise = (torch.randn_like(batch_a) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) 124 | next_action = (self.actor_target(batch_s_) + noise).clamp(-self.max_action, self.max_action) 125 | 126 | # Trick 2:clipped double Q-learning 127 | target_Q1, target_Q2 = self.critic_target(batch_s_, next_action) 128 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * torch.min(target_Q1, target_Q2) 129 | 130 | # Get the current Q 131 | current_Q1, current_Q2 = self.critic(batch_s, batch_a) 132 | # Compute the critic loss 133 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 134 | # Optimize the critic 135 | self.critic_optimizer.zero_grad() 136 | critic_loss.backward() 137 | self.critic_optimizer.step() 138 | 139 | # Trick 3:delayed policy updates 140 | if self.actor_pointer % self.policy_freq == 0: 141 | # Freeze critic networks so you don't waste computational effort 142 | for params in self.critic.parameters(): 143 | params.requires_grad = False 144 | 145 | # Compute actor loss 146 | actor_loss = -self.critic.Q1(batch_s, self.actor(batch_s)).mean() # Only use Q1 147 | # Optimize the actor 148 | self.actor_optimizer.zero_grad() 149 | actor_loss.backward() 150 | self.actor_optimizer.step() 151 | 152 | # Unfreeze critic networks 153 | for params in 
self.critic.parameters(): 154 | params.requires_grad = True 155 | 156 | # Softly update the target networks 157 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 158 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 159 | 160 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 161 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 162 | 163 | 164 | def evaluate_policy(env, agent): 165 | times = 3 # Perform three evaluations and calculate the average 166 | evaluate_reward = 0 167 | for _ in range(times): 168 | s = env.reset() 169 | done = False 170 | episode_reward = 0 171 | while not done: 172 | a = agent.choose_action(s) # We do not add noise when evaluating 173 | s_, r, done, _ = env.step(a) 174 | episode_reward += r 175 | s = s_ 176 | evaluate_reward += episode_reward 177 | 178 | return int(evaluate_reward / times) 179 | 180 | 181 | def reward_adapter(r, env_index): 182 | if env_index == 0: # Pendulum-v1 183 | r = (r + 8) / 8 184 | elif env_index == 1: # BipedalWalker-v3 185 | if r <= -100: 186 | r = -1 187 | return r 188 | 189 | 190 | if __name__ == '__main__': 191 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 192 | env_index = 0 193 | env = gym.make(env_name[env_index]) 194 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 195 | number = 1 196 | # Set random seed 197 | seed = 0 198 | env.seed(seed) 199 | env.action_space.seed(seed) 200 | env_evaluate.seed(seed) 201 | env_evaluate.action_space.seed(seed) 202 | np.random.seed(seed) 203 | torch.manual_seed(seed) 204 | 205 | state_dim = env.observation_space.shape[0] 206 | action_dim = env.action_space.shape[0] 207 | max_action = float(env.action_space.high[0]) 208 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 209 | print("env={}".format(env_name[env_index])) 210 | print("state_dim={}".format(state_dim)) 211 | print("action_dim={}".format(action_dim)) 212 | print("max_action={}".format(max_action)) 213 | print("max_episode_steps={}".format(max_episode_steps)) 214 | 215 | agent = TD3(state_dim, action_dim, max_action) 216 | replay_buffer = ReplayBuffer(state_dim, action_dim) 217 | # Build a tensorboard 218 | writer = SummaryWriter(log_dir='runs/TD3/TD3_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) 219 | 220 | noise_std = 0.1 * max_action # the std of Gaussian noise for exploration 221 | max_train_steps = 3e6 # Maximum number of training steps 222 | random_steps = 25e3 # Take the random actions in the beginning for the better exploration 223 | evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps 224 | evaluate_num = 0 # Record the number of evaluations 225 | evaluate_rewards = [] # Record the rewards during the evaluating 226 | total_steps = 0 # Record the total steps during the training 227 | 228 | while total_steps < max_train_steps: 229 | s = env.reset() 230 | episode_steps = 0 231 | done = False 232 | while not done: 233 | episode_steps += 1 234 | if total_steps < random_steps: # Take random actions in the beginning for the better exploration 235 | a = env.action_space.sample() 236 | else: 237 | # Add Gaussian noise to action for exploration 238 | a = agent.choose_action(s) 239 | a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action) 240 | s_, r, done, _ = env.step(a) 
241 | r = reward_adapter(r, env_index) # Adjust rewards for better performance 242 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 243 | # dw means dead or win,there is no next state s'; 244 | # but when reaching the max_episode_steps,there is a next state s' actually. 245 | if done and episode_steps != max_episode_steps: 246 | dw = True 247 | else: 248 | dw = False 249 | replay_buffer.store(s, a, r, s_, dw) # Store the transition 250 | s = s_ 251 | 252 | # Update one step 253 | if total_steps >= random_steps: 254 | agent.learn(replay_buffer) 255 | 256 | # Evaluate the policy every 'evaluate_freq' steps 257 | if (total_steps + 1) % evaluate_freq == 0: 258 | evaluate_num += 1 259 | evaluate_reward = evaluate_policy(env_evaluate, agent) 260 | evaluate_rewards.append(evaluate_reward) 261 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward)) 262 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 263 | # Save the rewards 264 | if evaluate_num % 10 == 0: 265 | np.save('./data_train/TD3_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 266 | 267 | total_steps += 1 268 | -------------------------------------------------------------------------------- /7.TD3/TD3_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/TD3_result.png -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_10.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_10.npy 
-------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_10.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- 
/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_0/events.out.tfevents.1648952137.李智.93956.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_0/events.out.tfevents.1648952137.李智.93956.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_10/events.out.tfevents.1648882414.李智.81744.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_10/events.out.tfevents.1648882414.李智.81744.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_100/events.out.tfevents.1648925401.李智.81744.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_100/events.out.tfevents.1648925401.李智.81744.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_0/events.out.tfevents.1648909506.李智.60360.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_0/events.out.tfevents.1648909506.李智.60360.2 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_10/events.out.tfevents.1648800524.李智.60360.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_10/events.out.tfevents.1648800524.李智.60360.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_100/events.out.tfevents.1648852975.李智.60360.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_100/events.out.tfevents.1648852975.李智.60360.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_0/events.out.tfevents.1649010066.李智.85868.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_0/events.out.tfevents.1649010066.李智.85868.2 
-------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_10/events.out.tfevents.1648901654.李智.85868.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_10/events.out.tfevents.1648901654.李智.85868.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_100/events.out.tfevents.1648956951.李智.85868.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_100/events.out.tfevents.1648956951.李智.85868.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_0/events.out.tfevents.1649065960.李智.18392.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_0/events.out.tfevents.1649065960.李智.18392.2 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_10/events.out.tfevents.1649057339.李智.18392.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_10/events.out.tfevents.1649057339.李智.18392.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_100/events.out.tfevents.1649061632.李智.18392.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_100/events.out.tfevents.1649061632.李智.18392.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_0/events.out.tfevents.1648846023.李智.76672.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_0/events.out.tfevents.1648846023.李智.76672.2 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_10/events.out.tfevents.1648735005.李智.76672.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_10/events.out.tfevents.1648735005.李智.76672.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_100/events.out.tfevents.1648793243.李智.76672.1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_100/events.out.tfevents.1648793243.李智.76672.1 -------------------------------------------------------------------------------- /8.SAC/SAC-continuous.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import copy 7 | from torch.utils.tensorboard import SummaryWriter 8 | from torch.distributions import Normal 9 | 10 | 11 | class Actor(nn.Module): 12 | def __init__(self, state_dim, action_dim, hidden_width, max_action): 13 | super(Actor, self).__init__() 14 | self.max_action = max_action 15 | self.l1 = nn.Linear(state_dim, hidden_width) 16 | self.l2 = nn.Linear(hidden_width, hidden_width) 17 | self.mean_layer = nn.Linear(hidden_width, action_dim) 18 | self.log_std_layer = nn.Linear(hidden_width, action_dim) 19 | 20 | def forward(self, x, deterministic=False, with_logprob=True): 21 | x = F.relu(self.l1(x)) 22 | x = F.relu(self.l2(x)) 23 | mean = self.mean_layer(x) 24 | log_std = self.log_std_layer(x) # We output the log_std to ensure that std=exp(log_std)>0 25 | log_std = torch.clamp(log_std, -20, 2) 26 | std = torch.exp(log_std) 27 | 28 | dist = Normal(mean, std) # Generate a Gaussian distribution 29 | if deterministic: # When evaluating,we use the deterministic policy 30 | a = mean 31 | else: 32 | a = dist.rsample() # reparameterization trick: mean+std*N(0,1) 33 | 34 | if with_logprob: # The method refers to Open AI Spinning up, which is more stable. 35 | log_pi = dist.log_prob(a).sum(dim=1, keepdim=True) 36 | log_pi -= (2 * (np.log(2) - a - F.softplus(-2 * a))).sum(dim=1, keepdim=True) 37 | else: 38 | log_pi = None 39 | 40 | a = self.max_action * torch.tanh(a) # Use tanh to compress the unbounded Gaussian distribution into a bounded action interval. 41 | 42 | return a, log_pi 43 | 44 | 45 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a) 46 | def __init__(self, state_dim, action_dim, hidden_width): 47 | super(Critic, self).__init__() 48 | # Q1 49 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width) 50 | self.l2 = nn.Linear(hidden_width, hidden_width) 51 | self.l3 = nn.Linear(hidden_width, 1) 52 | # Q2 53 | self.l4 = nn.Linear(state_dim + action_dim, hidden_width) 54 | self.l5 = nn.Linear(hidden_width, hidden_width) 55 | self.l6 = nn.Linear(hidden_width, 1) 56 | 57 | def forward(self, s, a): 58 | s_a = torch.cat([s, a], 1) 59 | q1 = F.relu(self.l1(s_a)) 60 | q1 = F.relu(self.l2(q1)) 61 | q1 = self.l3(q1) 62 | 63 | q2 = F.relu(self.l4(s_a)) 64 | q2 = F.relu(self.l5(q2)) 65 | q2 = self.l6(q2) 66 | 67 | return q1, q2 68 | 69 | 70 | class ReplayBuffer(object): 71 | def __init__(self, state_dim, action_dim): 72 | self.max_size = int(1e6) 73 | self.count = 0 74 | self.size = 0 75 | self.s = np.zeros((self.max_size, state_dim)) 76 | self.a = np.zeros((self.max_size, action_dim)) 77 | self.r = np.zeros((self.max_size, 1)) 78 | self.s_ = np.zeros((self.max_size, state_dim)) 79 | self.dw = np.zeros((self.max_size, 1)) 80 | 81 | def store(self, s, a, r, s_, dw): 82 | self.s[self.count] = s 83 | self.a[self.count] = a 84 | self.r[self.count] = r 85 | self.s_[self.count] = s_ 86 | self.dw[self.count] = dw 87 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0. 
88 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions 89 | 90 | def sample(self, batch_size): 91 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling 92 | batch_s = torch.tensor(self.s[index], dtype=torch.float) 93 | batch_a = torch.tensor(self.a[index], dtype=torch.float) 94 | batch_r = torch.tensor(self.r[index], dtype=torch.float) 95 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float) 96 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float) 97 | 98 | return batch_s, batch_a, batch_r, batch_s_, batch_dw 99 | 100 | 101 | class SAC(object): 102 | def __init__(self, state_dim, action_dim, max_action): 103 | self.max_action = max_action 104 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network 105 | self.batch_size = 256 # batch size 106 | self.GAMMA = 0.99 # discount factor 107 | self.TAU = 0.005 # Softly update the target network 108 | self.lr = 3e-4 # learning rate 109 | self.adaptive_alpha = True # Whether to automatically learn the temperature alpha 110 | if self.adaptive_alpha: 111 | # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper 112 | self.target_entropy = -action_dim 113 | # We learn log_alpha instead of alpha to ensure that alpha=exp(log_alpha)>0 114 | self.log_alpha = torch.zeros(1, requires_grad=True) 115 | self.alpha = self.log_alpha.exp() 116 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.lr) 117 | else: 118 | self.alpha = 0.2 119 | 120 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action) 121 | self.critic = Critic(state_dim, action_dim, self.hidden_width) 122 | self.critic_target = copy.deepcopy(self.critic) 123 | 124 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr) 125 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 126 | 127 | def choose_action(self, s, deterministic=False): 128 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 129 | a, _ = self.actor(s, deterministic, False) # When choosing actions, we do not need to compute log_pi 130 | return a.data.numpy().flatten() 131 | 132 | def learn(self, relay_buffer): 133 | batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size) # Sample a batch 134 | 135 | with torch.no_grad(): 136 | batch_a_, log_pi_ = self.actor(batch_s_) # a' from the current policy 137 | # Compute target Q 138 | target_Q1, target_Q2 = self.critic_target(batch_s_, batch_a_) 139 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * (torch.min(target_Q1, target_Q2) - self.alpha * log_pi_) 140 | 141 | # Compute current Q 142 | current_Q1, current_Q2 = self.critic(batch_s, batch_a) 143 | # Compute critic loss 144 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 145 | # Optimize the critic 146 | self.critic_optimizer.zero_grad() 147 | critic_loss.backward() 148 | self.critic_optimizer.step() 149 | 150 | # Freeze critic networks so you don't waste computational effort 151 | for params in self.critic.parameters(): 152 | params.requires_grad = False 153 | 154 | # Compute actor loss 155 | a, log_pi = self.actor(batch_s) 156 | Q1, Q2 = self.critic(batch_s, a) 157 | Q = torch.min(Q1, Q2) 158 | actor_loss = (self.alpha * log_pi - Q).mean() 159 | 160 | # Optimize the actor 161 | self.actor_optimizer.zero_grad() 162 | actor_loss.backward() 163 | self.actor_optimizer.step() 164 | 165 | # Unfreeze critic networks 166 | for params in self.critic.parameters(): 167 | 
params.requires_grad = True 168 | 169 | # Update alpha 170 | if self.adaptive_alpha: 171 | # We learn log_alpha instead of alpha to ensure that alpha=exp(log_alpha)>0 172 | alpha_loss = -(self.log_alpha.exp() * (log_pi + self.target_entropy).detach()).mean() 173 | self.alpha_optimizer.zero_grad() 174 | alpha_loss.backward() 175 | self.alpha_optimizer.step() 176 | self.alpha = self.log_alpha.exp() 177 | 178 | # Softly update target networks 179 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 180 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 181 | 182 | 183 | def evaluate_policy(env, agent): 184 | times = 3 # Perform three evaluations and calculate the average 185 | evaluate_reward = 0 186 | for _ in range(times): 187 | s = env.reset() 188 | done = False 189 | episode_reward = 0 190 | while not done: 191 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 192 | s_, r, done, _ = env.step(a) 193 | episode_reward += r 194 | s = s_ 195 | evaluate_reward += episode_reward 196 | 197 | return int(evaluate_reward / times) 198 | 199 | 200 | def reward_adapter(r, env_index): 201 | if env_index == 0: # Pendulum-v1 202 | r = (r + 8) / 8 203 | elif env_index == 1: # BipedalWalker-v3 204 | if r <= -100: 205 | r = -1 206 | return r 207 | 208 | 209 | if __name__ == '__main__': 210 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 211 | env_index = 0 212 | env = gym.make(env_name[env_index]) 213 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 214 | number = 1 215 | seed = 0 216 | # Set random seed 217 | env.seed(seed) 218 | env.action_space.seed(seed) 219 | env_evaluate.seed(seed) 220 | env_evaluate.action_space.seed(seed) 221 | np.random.seed(seed) 222 | torch.manual_seed(seed) 223 | 224 | state_dim = env.observation_space.shape[0] 225 | action_dim = env.action_space.shape[0] 226 | max_action = float(env.action_space.high[0]) 227 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 228 | print("env={}".format(env_name[env_index])) 229 | print("state_dim={}".format(state_dim)) 230 | print("action_dim={}".format(action_dim)) 231 | print("max_action={}".format(max_action)) 232 | print("max_episode_steps={}".format(max_episode_steps)) 233 | 234 | agent = SAC(state_dim, action_dim, max_action) 235 | replay_buffer = ReplayBuffer(state_dim, action_dim) 236 | # Build a tensorboard 237 | writer = SummaryWriter(log_dir='runs/SAC/SAC_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) 238 | 239 | max_train_steps = 3e6 # Maximum number of training steps 240 | random_steps = 25e3 # Take the random actions in the beginning for the better exploration 241 | evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps 242 | evaluate_num = 0 # Record the number of evaluations 243 | evaluate_rewards = [] # Record the rewards during the evaluating 244 | total_steps = 0 # Record the total steps during the training 245 | 246 | while total_steps < max_train_steps: 247 | s = env.reset() 248 | episode_steps = 0 249 | done = False 250 | while not done: 251 | episode_steps += 1 252 | if total_steps < random_steps: # Take the random actions in the beginning for the better exploration 253 | a = env.action_space.sample() 254 | else: 255 | a = agent.choose_action(s) 256 | s_, r, done, _ = env.step(a) 257 | r = reward_adapter(r, env_index) # 
Adjust rewards for better performance 258 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 259 | # dw means dead or win,there is no next state s'; 260 | # but when reaching the max_episode_steps,there is a next state s' actually. 261 | if done and episode_steps != max_episode_steps: 262 | dw = True 263 | else: 264 | dw = False 265 | replay_buffer.store(s, a, r, s_, dw) # Store the transition 266 | s = s_ 267 | 268 | if total_steps >= random_steps: 269 | agent.learn(replay_buffer) 270 | 271 | # Evaluate the policy every 'evaluate_freq' steps 272 | if (total_steps + 1) % evaluate_freq == 0: 273 | evaluate_num += 1 274 | evaluate_reward = evaluate_policy(env_evaluate, agent) 275 | evaluate_rewards.append(evaluate_reward) 276 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward)) 277 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 278 | # Save the rewards 279 | if evaluate_num % 10 == 0: 280 | np.save('./data_train/SAC_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 281 | 282 | total_steps += 1 283 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/PPO+RNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/PPO+RNN.png -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/PPO_discrete_rnn_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.tensorboard import SummaryWriter 4 | import gym 5 | import argparse 6 | from normalization import Normalization, RewardScaling 7 | from replaybuffer import ReplayBuffer 8 | from ppo_discrete_rnn import PPO_discrete_RNN 9 | 10 | 11 | class Runner: 12 | def __init__(self, args, env_name, number, seed): 13 | self.args = args 14 | self.env_name = env_name 15 | self.number = number 16 | self.seed = seed 17 | 18 | # Create env 19 | self.env = gym.make(env_name) 20 | # Set random seed 21 | np.random.seed(self.seed) 22 | torch.manual_seed(self.seed) 23 | self.env.seed(seed) 24 | self.env.action_space.seed(seed) 25 | 26 | self.args.state_dim = self.env.observation_space.shape[0] 27 | self.args.action_dim = self.env.action_space.n 28 | self.args.episode_limit = self.env._max_episode_steps # Maximum number of steps per episode 29 | print("env={}".format(env_name)) 30 | print("state_dim={}".format(args.state_dim)) 31 | print("action_dim={}".format(args.action_dim)) 32 | print("episode_limit={}".format(args.episode_limit)) 33 | 34 | self.replay_buffer = ReplayBuffer(args) 35 | self.agent = PPO_discrete_RNN(args) 36 | 37 | # Create a tensorboard 38 | self.writer = SummaryWriter(log_dir='runs/PPO_discrete/env_{}_number_{}_seed_{}'.format(env_name, number, seed)) 39 | 40 | self.evaluate_rewards = [] # Record the rewards during the evaluating 41 | self.total_steps = 0 42 | 43 | if self.args.use_state_norm: 44 | print("------use state normalization------") 45 | self.state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization 46 | if self.args.use_reward_scaling: 47 | print("------use reward scaling------") 48 | self.reward_scaling = RewardScaling(shape=1, gamma=self.args.gamma) 49 | 50 | def run(self, ): 51 | 
evaluate_num = -1 # Record the number of evaluations 52 | while self.total_steps < self.args.max_train_steps: 53 | if self.total_steps // self.args.evaluate_freq > evaluate_num: 54 | self.evaluate_policy() # Evaluate the policy every 'evaluate_freq' steps 55 | evaluate_num += 1 56 | 57 | _, episode_steps = self.run_episode() # Run an episode 58 | self.total_steps += episode_steps 59 | 60 | if self.replay_buffer.episode_num == self.args.batch_size: 61 | self.agent.train(self.replay_buffer, self.total_steps) # Training 62 | self.replay_buffer.reset_buffer() 63 | 64 | self.evaluate_policy() 65 | self.env.close() 66 | 67 | def run_episode(self, ): 68 | episode_reward = 0 69 | s = self.env.reset() 70 | if self.args.use_reward_scaling: 71 | self.reward_scaling.reset() 72 | self.agent.reset_rnn_hidden() 73 | for episode_step in range(self.args.episode_limit): 74 | if self.args.use_state_norm: 75 | s = self.state_norm(s) 76 | a, a_logprob = self.agent.choose_action(s, evaluate=False) 77 | v = self.agent.get_value(s) 78 | s_, r, done, _ = self.env.step(a) 79 | episode_reward += r 80 | 81 | if done and episode_step + 1 != self.args.episode_limit: 82 | dw = True 83 | else: 84 | dw = False 85 | if self.args.use_reward_scaling: 86 | r = self.reward_scaling(r) 87 | # Store the transition 88 | self.replay_buffer.store_transition(episode_step, s, v, a, a_logprob, r, dw) 89 | s = s_ 90 | if done: 91 | break 92 | 93 | # An episode is over, store v in the last step 94 | if self.args.use_state_norm: 95 | s = self.state_norm(s) 96 | v = self.agent.get_value(s) 97 | self.replay_buffer.store_last_value(episode_step + 1, v) 98 | 99 | return episode_reward, episode_step + 1 100 | 101 | def evaluate_policy(self, ): 102 | evaluate_reward = 0 103 | for _ in range(self.args.evaluate_times): 104 | episode_reward, done = 0, False 105 | s = self.env.reset() 106 | self.agent.reset_rnn_hidden() 107 | while not done: 108 | if self.args.use_state_norm: 109 | s = self.state_norm(s, update=False) 110 | a, a_logprob = self.agent.choose_action(s, evaluate=True) 111 | s_, r, done, _ = self.env.step(a) 112 | episode_reward += r 113 | s = s_ 114 | evaluate_reward += episode_reward 115 | 116 | evaluate_reward = evaluate_reward / self.args.evaluate_times 117 | self.evaluate_rewards.append(evaluate_reward) 118 | print("total_steps:{} \t evaluate_reward:{}".format(self.total_steps, evaluate_reward)) 119 | self.writer.add_scalar('evaluate_step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps) 120 | # Save the rewards and models 121 | np.save('./data_train/PPO_env_{}_number_{}_seed_{}.npy'.format(self.env_name, self.number, self.seed), np.array(self.evaluate_rewards)) 122 | 123 | 124 | if __name__ == '__main__': 125 | parser = argparse.ArgumentParser("Hyperparameter Setting for PPO-discrete") 126 | parser.add_argument("--max_train_steps", type=int, default=int(2e5), help=" Maximum number of training steps") 127 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps") 128 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency") 129 | parser.add_argument("--evaluate_times", type=float, default=3, help="Evaluate times") 130 | 131 | parser.add_argument("--batch_size", type=int, default=16, help="Batch size") 132 | parser.add_argument("--mini_batch_size", type=int, default=2, help="Minibatch size") 133 | parser.add_argument("--hidden_dim", type=int, default=64, help="The number of neurons in hidden layers of the neural 
network") 134 | parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate of actor") 135 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 136 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter") 137 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter") 138 | parser.add_argument("--K_epochs", type=int, default=15, help="PPO parameter") 139 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization") 140 | parser.add_argument("--use_state_norm", type=bool, default=False, help="Trick 2:state normalization") 141 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling") 142 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy") 143 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay") 144 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip") 145 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization") 146 | parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5") 147 | parser.add_argument("--use_tanh", type=float, default=False, help="Trick 10: tanh activation function") 148 | parser.add_argument("--use_gru", type=bool, default=True, help="Whether to use GRU") 149 | 150 | args = parser.parse_args() 151 | 152 | env_names = ['CartPole-v1', 'LunarLander-v2'] 153 | env_index = 0 154 | for seed in [0, 10, 100]: 155 | runner = Runner(args, env_name=env_names[env_index], number=3, seed=seed) 156 | runner.run() 157 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/README.md: -------------------------------------------------------------------------------- 1 | # PPO-discrete + RNN 2 | This is a concise Pytorch implementation of PPO+RNN(GRU/LSTM) on discrete action space.
3 | 4 | 5 | ## How to use my code? 6 | You can directly run 'PPO_discrete_rnn_main.py' in your own IDE.<br>
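If you prefer launching it from another script or a notebook rather than an IDE, a minimal sketch using only the Python standard library (assuming `PPO_discrete_rnn_main.py` is in the current working directory and no extra command-line arguments are passed, since the script reads `sys.argv` with argparse):

```python
# Runs PPO_discrete_rnn_main.py as if it had been executed directly (a sketch, not part of the repo).
import runpy

runpy.run_path('PPO_discrete_rnn_main.py', run_name='__main__')
```

Running `python PPO_discrete_rnn_main.py` from a terminal in the same directory is equivalent.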
7 | 8 | ## Training environments 9 | You can set the 'env_index' in the code to change the environment. Here, we train in 2 environments.<br>
10 | env_index=0 represents 'CartPole-v1'<br>
11 | env_index=1 represents 'LunarLander-v2' (switching between them is sketched below)<br>
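Concretely, the selection happens at the bottom of `PPO_discrete_rnn_main.py`; the sketch below paraphrases those lines (edit `env_index` before running):

```python
# Excerpt from the __main__ block of PPO_discrete_rnn_main.py: pick the environment
# by index and sweep three seeds. 'args' comes from the argparse parser defined above it.
env_names = ['CartPole-v1', 'LunarLander-v2']
env_index = 1  # 0 -> CartPole-v1, 1 -> LunarLander-v2
for seed in [0, 10, 100]:
    runner = Runner(args, env_name=env_names[env_index], number=3, seed=seed)
    runner.run()
```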
12 | 13 | ## Training result 14 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/9.PPO-discrete-RNN/PPO%2BRNN.png) 15 | 16 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/__pycache__/ppo_discrete_rnn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/ppo_discrete_rnn.cpython-37.pyc -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/__pycache__/replaybuffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/replaybuffer.cpython-37.pyc -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_100.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd: 5 | # Dynamically calculate mean and std 6 | def __init__(self, shape): # shape:the dimension of input data 7 | self.n = 0 8 | self.mean = np.zeros(shape) 9 | self.S = np.zeros(shape) 10 | self.std = np.sqrt(self.S) 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.n += 1 15 | if self.n == 1: 16 | self.mean = x 17 | self.std = x 18 | else: 19 | old_mean = self.mean.copy() 20 | self.mean = old_mean + (x - old_mean) / self.n 21 | self.S = self.S + (x - old_mean) * (x - self.mean) 22 | self.std = np.sqrt(self.S / self.n) 23 | 24 | 25 | 
class Normalization: 26 | def __init__(self, shape): 27 | self.running_ms = RunningMeanStd(shape=shape) 28 | 29 | def __call__(self, x, update=True): 30 | # Whether to update the mean and std,during the evaluating,update=False 31 | if update: 32 | self.running_ms.update(x) 33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8) 34 | 35 | return x 36 | 37 | 38 | class RewardScaling: 39 | def __init__(self, shape, gamma): 40 | self.shape = shape # reward shape=1 41 | self.gamma = gamma # discount factor 42 | self.running_ms = RunningMeanStd(shape=self.shape) 43 | self.R = np.zeros(self.shape) 44 | 45 | def __call__(self, x): 46 | self.R = self.gamma * self.R + x 47 | self.running_ms.update(self.R) 48 | x = x / (self.running_ms.std + 1e-8) # Only divided std 49 | return x 50 | 51 | def reset(self): # When an episode is done,we should reset 'self.R' 52 | self.R = np.zeros(self.shape) 53 | 54 | 55 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/ppo_discrete_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler, SequentialSampler 6 | from torch.distributions import Categorical 7 | import copy 8 | 9 | 10 | # Trick 8: orthogonal initialization 11 | def orthogonal_init(layer, gain=np.sqrt(2)): 12 | for name, param in layer.named_parameters(): 13 | if 'bias' in name: 14 | nn.init.constant_(param, 0) 15 | elif 'weight' in name: 16 | nn.init.orthogonal_(param, gain=gain) 17 | 18 | return layer 19 | 20 | 21 | class Actor_Critic_RNN(nn.Module): 22 | def __init__(self, args): 23 | super(Actor_Critic_RNN, self).__init__() 24 | self.use_gru = args.use_gru 25 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 26 | 27 | self.actor_rnn_hidden = None 28 | self.actor_fc1 = nn.Linear(args.state_dim, args.hidden_dim) 29 | if args.use_gru: 30 | print("------use GRU------") 31 | self.actor_rnn = nn.GRU(args.hidden_dim, args.hidden_dim, batch_first=True) 32 | else: 33 | print("------use LSTM------") 34 | self.actor_rnn = nn.LSTM(args.hidden_dim, args.hidden_dim, batch_first=True) 35 | self.actor_fc2 = nn.Linear(args.hidden_dim, args.action_dim) 36 | 37 | self.critic_rnn_hidden = None 38 | self.critic_fc1 = nn.Linear(args.state_dim, args.hidden_dim) 39 | if args.use_gru: 40 | self.critic_rnn = nn.GRU(args.hidden_dim, args.hidden_dim, batch_first=True) 41 | else: 42 | self.critic_rnn = nn.LSTM(args.hidden_dim, args.hidden_dim, batch_first=True) 43 | self.critic_fc2 = nn.Linear(args.hidden_dim, 1) 44 | 45 | if args.use_orthogonal_init: 46 | print("------use orthogonal init------") 47 | orthogonal_init(self.actor_fc1) 48 | orthogonal_init(self.actor_rnn) 49 | orthogonal_init(self.actor_fc2, gain=0.01) 50 | orthogonal_init(self.critic_fc1) 51 | orthogonal_init(self.critic_rnn) 52 | orthogonal_init(self.critic_fc2) 53 | 54 | def actor(self, s): 55 | s = self.activate_func(self.actor_fc1(s)) 56 | output, self.actor_rnn_hidden = self.actor_rnn(s, self.actor_rnn_hidden) 57 | logit = self.actor_fc2(output) 58 | return logit 59 | 60 | def critic(self, s): 61 | s = self.activate_func(self.critic_fc1(s)) 62 | output, self.critic_rnn_hidden = self.critic_rnn(s, self.critic_rnn_hidden) 63 | value = self.critic_fc2(output) 64 | return value 65 | 66 | 67 | class PPO_discrete_RNN: 68 | def __init__(self, args): 69 | self.batch_size 
= args.batch_size 70 | self.mini_batch_size = args.mini_batch_size 71 | self.max_train_steps = args.max_train_steps 72 | self.lr = args.lr # Learning rate of actor 73 | self.gamma = args.gamma # Discount factor 74 | self.lamda = args.lamda # GAE parameter 75 | self.epsilon = args.epsilon # PPO clip parameter 76 | self.K_epochs = args.K_epochs # PPO parameter 77 | self.entropy_coef = args.entropy_coef # Entropy coefficient 78 | self.set_adam_eps = args.set_adam_eps 79 | self.use_grad_clip = args.use_grad_clip 80 | self.use_lr_decay = args.use_lr_decay 81 | self.use_adv_norm = args.use_adv_norm 82 | 83 | self.ac = Actor_Critic_RNN(args) 84 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5 85 | self.optimizer = torch.optim.Adam(self.ac.parameters(), lr=self.lr, eps=1e-5) 86 | else: 87 | self.optimizer = torch.optim.Adam(self.ac.parameters(), lr=self.lr) 88 | 89 | def reset_rnn_hidden(self): 90 | self.ac.actor_rnn_hidden = None 91 | self.ac.critic_rnn_hidden = None 92 | 93 | def choose_action(self, s, evaluate=False): 94 | with torch.no_grad(): 95 | s = torch.tensor(s, dtype=torch.float).unsqueeze(0) 96 | logit = self.ac.actor(s) 97 | if evaluate: 98 | a = torch.argmax(logit) 99 | return a.item(), None 100 | else: 101 | dist = Categorical(logits=logit) 102 | a = dist.sample() 103 | a_logprob = dist.log_prob(a) 104 | return a.item(), a_logprob.item() 105 | 106 | def get_value(self, s): 107 | with torch.no_grad(): 108 | s = torch.tensor(s, dtype=torch.float).unsqueeze(0) 109 | value = self.ac.critic(s) 110 | return value.item() 111 | 112 | def train(self, replay_buffer, total_steps): 113 | batch = replay_buffer.get_training_data() # Get training data 114 | 115 | # Optimize policy for K epochs: 116 | for _ in range(self.K_epochs): 117 | for index in BatchSampler(SequentialSampler(range(self.batch_size)), self.mini_batch_size, False): 118 | # If use RNN, we need to reset the rnn_hidden of the actor and critic. 
119 | self.reset_rnn_hidden() 120 | logits_now = self.ac.actor(batch['s'][index]) # logits_now.shape=(mini_batch_size, max_episode_len, action_dim) 121 | values_now = self.ac.critic(batch['s'][index]).squeeze(-1) # values_now.shape=(mini_batch_size, max_episode_len) 122 | 123 | dist_now = Categorical(logits=logits_now) 124 | dist_entropy = dist_now.entropy() # shape(mini_batch_size, max_episode_len) 125 | a_logprob_now = dist_now.log_prob(batch['a'][index]) # shape(mini_batch_size, max_episode_len) 126 | # a/b=exp(log(a)-log(b)) 127 | ratios = torch.exp(a_logprob_now - batch['a_logprob'][index]) # shape(mini_batch_size, max_episode_len) 128 | 129 | # actor loss 130 | surr1 = ratios * batch['adv'][index] 131 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * batch['adv'][index] 132 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # shape(mini_batch_size, max_episode_len) 133 | actor_loss = (actor_loss * batch['active'][index]).sum() / batch['active'][index].sum() 134 | 135 | # critic_loss 136 | critic_loss = (values_now - batch['v_target'][index]) ** 2 137 | critic_loss = (critic_loss * batch['active'][index]).sum() / batch['active'][index].sum() 138 | 139 | # Update 140 | self.optimizer.zero_grad() 141 | loss = actor_loss + critic_loss * 0.5 142 | loss.backward() 143 | if self.use_grad_clip: # Trick 7: Gradient clip 144 | torch.nn.utils.clip_grad_norm_(self.ac.parameters(), 0.5) 145 | self.optimizer.step() 146 | 147 | if self.use_lr_decay: # Trick 6:learning rate Decay 148 | self.lr_decay(total_steps) 149 | 150 | def lr_decay(self, total_steps): 151 | lr_now = 0.9 * self.lr * (1 - total_steps / self.max_train_steps) + 0.1 * self.lr 152 | for p in self.optimizer.param_groups: 153 | p['lr'] = lr_now 154 | 155 | def save_model(self, env_name, number, seed, total_steps): 156 | torch.save(self.ac.state_dict(), "./model/PPO_actor_env_{}_number_{}_seed_{}_step_{}k.pth".format(env_name, number, seed, int(total_steps / 1000))) 157 | 158 | def load_model(self, env_name, number, seed, step): 159 | self.ac.load_state_dict(torch.load("./model/PPO_actor_env_{}_number_{}_seed_{}_step_{}k.pth".format(env_name, number, seed, step))) 160 | 161 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/replaybuffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import copy 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, args): 8 | self.gamma = args.gamma 9 | self.lamda = args.lamda 10 | self.use_adv_norm = args.use_adv_norm 11 | self.state_dim = args.state_dim 12 | self.action_dim = args.action_dim 13 | self.episode_limit = args.episode_limit 14 | self.batch_size = args.batch_size 15 | self.episode_num = 0 16 | self.max_episode_len = 0 17 | self.buffer = None 18 | self.reset_buffer() 19 | 20 | def reset_buffer(self): 21 | self.buffer = {'s': np.zeros([self.batch_size, self.episode_limit, self.state_dim]), 22 | 'v': np.zeros([self.batch_size, self.episode_limit + 1]), 23 | 'a': np.zeros([self.batch_size, self.episode_limit]), 24 | 'a_logprob': np.zeros([self.batch_size, self.episode_limit]), 25 | 'r': np.zeros([self.batch_size, self.episode_limit]), 26 | 'dw': np.ones([self.batch_size, self.episode_limit]), # Note: We use 'np.ones' to initialize 'dw' 27 | 'active': np.zeros([self.batch_size, self.episode_limit]) 28 | } 29 | self.episode_num = 0 30 | self.max_episode_len = 0 31 | 32 | def store_transition(self, episode_step, s, v, a, 
a_logprob, r, dw): 33 | self.buffer['s'][self.episode_num][episode_step] = s 34 | self.buffer['v'][self.episode_num][episode_step] = v 35 | self.buffer['a'][self.episode_num][episode_step] = a 36 | self.buffer['a_logprob'][self.episode_num][episode_step] = a_logprob 37 | self.buffer['r'][self.episode_num][episode_step] = r 38 | self.buffer['dw'][self.episode_num][episode_step] = dw 39 | 40 | self.buffer['active'][self.episode_num][episode_step] = 1.0 41 | 42 | def store_last_value(self, episode_step, v): 43 | self.buffer['v'][self.episode_num][episode_step] = v 44 | self.episode_num += 1 45 | # Record max_episode_len 46 | if episode_step > self.max_episode_len: 47 | self.max_episode_len = episode_step 48 | 49 | def get_adv(self): 50 | # Calculate the advantage using GAE 51 | v = self.buffer['v'][:, :self.max_episode_len] 52 | v_next = self.buffer['v'][:, 1:self.max_episode_len + 1] 53 | r = self.buffer['r'][:, :self.max_episode_len] 54 | dw = self.buffer['dw'][:, :self.max_episode_len] 55 | active = self.buffer['active'][:, :self.max_episode_len] 56 | adv = np.zeros_like(r) # adv.shape=(batch_size,max_episode_len) 57 | gae = 0 58 | with torch.no_grad(): # adv and v_target have no gradient 59 | # deltas.shape=(batch_size,max_episode_len) 60 | deltas = r + self.gamma * v_next * (1 - dw) - v 61 | for t in reversed(range(self.max_episode_len)): 62 | gae = deltas[:, t] + self.gamma * self.lamda * gae # gae.shape=(batch_size) 63 | adv[:, t] = gae 64 | v_target = adv + v # v_target.shape(batch_size,max_episode_len) 65 | if self.use_adv_norm: # Trick 1:advantage normalization 66 | adv_copy = copy.deepcopy(adv) 67 | adv_copy[active == 0] = np.nan # 忽略掉active=0的那些adv 68 | adv = ((adv - np.nanmean(adv_copy)) / (np.nanstd(adv_copy) + 1e-5)) 69 | return adv, v_target 70 | 71 | def get_training_data(self): 72 | adv, v_target = self.get_adv() 73 | batch = {'s': torch.tensor(self.buffer['s'][:, :self.max_episode_len], dtype=torch.float32), 74 | 'a': torch.tensor(self.buffer['a'][:, :self.max_episode_len], dtype=torch.long), # 动作a的类型必须是long 75 | 'a_logprob': torch.tensor(self.buffer['a_logprob'][:, :self.max_episode_len], dtype=torch.float32), 76 | 'active': torch.tensor(self.buffer['active'][:, :self.max_episode_len], dtype=torch.float32), 77 | 'adv': torch.tensor(adv, dtype=torch.float32), 78 | 'v_target': torch.tensor(v_target, dtype=torch.float32)} 79 | 80 | return batch 81 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_0/events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_0/events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_10/events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_10/events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1 -------------------------------------------------------------------------------- 
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_100/events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_100/events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_0/events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_0/events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_10/events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_10/events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_100/events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_100/events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_0/events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_0/events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_10/events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_10/events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_100/events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_100/events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2 
-------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_0/events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_0/events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_10/events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_10/events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_100/events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_100/events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Lizhi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DRL-code-pytorch 2 | Concise pytorch implementations of DRL algorithms, including REINFORCE, A2C, Rainbow DQN, PPO(discrete and continuous), DDPG, TD3, SAC, PPO-discrete-RNN(LSTM/GRU). 3 | 4 | 5 | # Dependencies 6 | python==3.7.9
7 | numpy==1.19.4
8 | pytorch==1.12.0
9 | tensorboard==0.6.0
10 | gym==0.21.0
11 | --------------------------------------------------------------------------------