├── 1.REINFORCE
├── README.md
├── REINFORCE.py
├── REINFORCE_baseline.py
├── data_train
│ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy
│ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy
│ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy
│ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy
│ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy
│ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy
│ ├── REINFORCE_env_CartPole-v0_number_2_seed_0.npy
│ ├── REINFORCE_env_CartPole-v0_number_2_seed_10.npy
│ ├── REINFORCE_env_CartPole-v0_number_2_seed_100.npy
│ ├── REINFORCE_env_CartPole-v1_number_2_seed_0.npy
│ ├── REINFORCE_env_CartPole-v1_number_2_seed_10.npy
│ └── REINFORCE_env_CartPole-v1_number_2_seed_100.npy
├── runs
│ └── REINFORCE
│ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_0
│ │ └── events.out.tfevents.1648121668.李智.23156.0
│ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_10
│ │ └── events.out.tfevents.1648121786.李智.23156.1
│ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_100
│ │ └── events.out.tfevents.1648121899.李智.23156.2
│ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_0
│ │ └── events.out.tfevents.1648121670.李智.15096.0
│ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_10
│ │ └── events.out.tfevents.1648121797.李智.15096.1
│ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_100
│ │ └── events.out.tfevents.1648121918.李智.15096.2
│ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_0
│ │ └── events.out.tfevents.1648121512.李智.32424.0
│ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_10
│ │ └── events.out.tfevents.1648121583.李智.32424.1
│ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_100
│ │ └── events.out.tfevents.1648121655.李智.32424.2
│ │ ├── REINFORCE_env_CartPole-v1_number_2_seed_0
│ │ └── events.out.tfevents.1648121526.李智.11256.0
│ │ ├── REINFORCE_env_CartPole-v1_number_2_seed_10
│ │ └── events.out.tfevents.1648121607.李智.11256.1
│ │ └── REINFORCE_env_CartPole-v1_number_2_seed_100
│ │ └── events.out.tfevents.1648121688.李智.11256.2
└── training results.png
├── 2.Actor-Critic
├── A2C.py
├── A2C_results.png
├── README.md
├── data_train
│ ├── A2C_env_CartPole-v0_number_9_seed_0.npy
│ ├── A2C_env_CartPole-v0_number_9_seed_10.npy
│ ├── A2C_env_CartPole-v0_number_9_seed_100.npy
│ ├── A2C_env_CartPole-v1_number_9_seed_0.npy
│ ├── A2C_env_CartPole-v1_number_9_seed_10.npy
│ └── A2C_env_CartPole-v1_number_9_seed_100.npy
└── runs
│ └── A2C
│ ├── A2C_env_CartPole-v0_number_9_seed_0
│ └── events.out.tfevents.1648553119.李智.62564.0
│ ├── A2C_env_CartPole-v0_number_9_seed_10
│ └── events.out.tfevents.1648553543.李智.62564.1
│ ├── A2C_env_CartPole-v0_number_9_seed_100
│ └── events.out.tfevents.1648554019.李智.62564.2
│ ├── A2C_env_CartPole-v1_number_9_seed_0
│ └── events.out.tfevents.1648553122.李智.63460.0
│ ├── A2C_env_CartPole-v1_number_9_seed_10
│ └── events.out.tfevents.1648553561.李智.63460.1
│ └── A2C_env_CartPole-v1_number_9_seed_100
│ └── events.out.tfevents.1648554055.李智.63460.2
├── 3.Rainbow_DQN
├── README.md
├── Rainbow_DQN_main.py
├── __pycache__
│ ├── network.cpython-37.pyc
│ ├── rainbow_dqn.cpython-37.pyc
│ ├── replay_buffer.cpython-37.pyc
│ └── sum_tree.cpython-37.pyc
├── data_train
│ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy
│ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy
│ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy
│ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy
│ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy
│ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy
│ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy
│ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy
│ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy
│ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy
│ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy
│ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy
│ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy
│ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy
│ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy
│ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy
│ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy
│ └── Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy
├── drawing_Rainbow_DQN.py
├── network.py
├── rainbow_dqn.py
├── rainbow_dqn_result.png
├── replay_buffer.py
├── runs
│ └── DQN
│ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0
│ │ └── events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0
│ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10
│ │ └── events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1
│ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100
│ │ └── events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2
│ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0
│ │ └── events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0
│ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10
│ │ └── events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1
│ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100
│ │ └── events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2
│ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0
│ │ └── events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0
│ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10
│ │ └── events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1
│ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100
│ │ └── events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2
│ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0
│ │ └── events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0
│ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10
│ │ └── events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1
│ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100
│ │ └── events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2
│ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0
│ │ └── events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0
│ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10
│ │ └── events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1
│ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100
│ │ └── events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2
│ │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_0
│ │ └── events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0
│ │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_10
│ │ └── events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0
│ │ └── Rainbow_DQN_env_LunarLander-v2_number_1_seed_100
│ │ └── events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0
└── sum_tree.py
├── 4.PPO-discrete
├── PPO_discrete_main.py
├── README.md
├── __pycache__
│ ├── normalization.cpython-37.pyc
│ ├── ppo_discrete.cpython-37.pyc
│ └── replaybuffer.cpython-37.pyc
├── data_train
│ └── readme.txt
├── normalization.py
├── ppo_discrete.py
├── replaybuffer.py
├── runs
│ └── readme.txt
└── training_result.png
├── 5.PPO-continuous
├── PPO_continuous_main.py
├── README.md
├── __pycache__
│ ├── normalization.cpython-37.pyc
│ ├── ppo_continuous.cpython-37.pyc
│ └── replaybuffer.cpython-37.pyc
├── data_train
│ └── readme.txt
├── normalization.py
├── ppo_continuous.py
├── replaybuffer.py
├── runs
│ └── readme.txt
└── training_result.png
├── 6.DDPG
└── DDPG.py
├── 7.TD3
├── README.md
├── TD3.py
├── TD3_result.png
├── data_train
│ ├── TD3_env_BipedalWalker-v3_number_1_seed_0.npy
│ ├── TD3_env_BipedalWalker-v3_number_1_seed_10.npy
│ ├── TD3_env_BipedalWalker-v3_number_1_seed_100.npy
│ ├── TD3_env_HalfCheetah-v2_number_1_seed_0.npy
│ ├── TD3_env_HalfCheetah-v2_number_1_seed_10.npy
│ ├── TD3_env_HalfCheetah-v2_number_1_seed_100.npy
│ ├── TD3_env_Hopper-v2_number_1_seed_0.npy
│ ├── TD3_env_Hopper-v2_number_1_seed_10.npy
│ ├── TD3_env_Hopper-v2_number_1_seed_100.npy
│ ├── TD3_env_Pendulum-v1_number_1_seed_0.npy
│ ├── TD3_env_Pendulum-v1_number_1_seed_10.npy
│ ├── TD3_env_Pendulum-v1_number_1_seed_100.npy
│ ├── TD3_env_Walker2d-v2_number_1_seed_0.npy
│ ├── TD3_env_Walker2d-v2_number_1_seed_10.npy
│ └── TD3_env_Walker2d-v2_number_1_seed_100.npy
└── runs
│ └── TD3
│ ├── TD3_env_BipedalWalker-v3_number_1_seed_0
│ └── events.out.tfevents.1648952137.李智.93956.0
│ ├── TD3_env_BipedalWalker-v3_number_1_seed_10
│ └── events.out.tfevents.1648882414.李智.81744.0
│ ├── TD3_env_BipedalWalker-v3_number_1_seed_100
│ └── events.out.tfevents.1648925401.李智.81744.1
│ ├── TD3_env_HalfCheetah-v2_number_1_seed_0
│ └── events.out.tfevents.1648909506.李智.60360.2
│ ├── TD3_env_HalfCheetah-v2_number_1_seed_10
│ └── events.out.tfevents.1648800524.李智.60360.0
│ ├── TD3_env_HalfCheetah-v2_number_1_seed_100
│ └── events.out.tfevents.1648852975.李智.60360.1
│ ├── TD3_env_Hopper-v2_number_1_seed_0
│ └── events.out.tfevents.1649010066.李智.85868.2
│ ├── TD3_env_Hopper-v2_number_1_seed_10
│ └── events.out.tfevents.1648901654.李智.85868.0
│ ├── TD3_env_Hopper-v2_number_1_seed_100
│ └── events.out.tfevents.1648956951.李智.85868.1
│ ├── TD3_env_Pendulum-v1_number_1_seed_0
│ └── events.out.tfevents.1649065960.李智.18392.2
│ ├── TD3_env_Pendulum-v1_number_1_seed_10
│ └── events.out.tfevents.1649057339.李智.18392.0
│ ├── TD3_env_Pendulum-v1_number_1_seed_100
│ └── events.out.tfevents.1649061632.李智.18392.1
│ ├── TD3_env_Walker2d-v2_number_1_seed_0
│ └── events.out.tfevents.1648846023.李智.76672.2
│ ├── TD3_env_Walker2d-v2_number_1_seed_10
│ └── events.out.tfevents.1648735005.李智.76672.0
│ └── TD3_env_Walker2d-v2_number_1_seed_100
│ └── events.out.tfevents.1648793243.李智.76672.1
├── 8.SAC
└── SAC-continuous.py
├── 9.PPO-discrete-RNN
├── PPO+RNN.png
├── PPO_discrete_rnn_main.py
├── README.md
├── __pycache__
│ ├── normalization.cpython-37.pyc
│ ├── ppo_discrete_rnn.cpython-37.pyc
│ └── replaybuffer.cpython-37.pyc
├── data_train
│ ├── PPO_env_CartPole-v1_number_3_seed_0.npy
│ ├── PPO_env_CartPole-v1_number_3_seed_10.npy
│ ├── PPO_env_CartPole-v1_number_3_seed_100.npy
│ ├── PPO_env_CartPole-v1_number_5_seed_0.npy
│ ├── PPO_env_CartPole-v1_number_5_seed_10.npy
│ ├── PPO_env_CartPole-v1_number_5_seed_100.npy
│ ├── PPO_env_LunarLander-v2_number_3_seed_0.npy
│ ├── PPO_env_LunarLander-v2_number_3_seed_10.npy
│ ├── PPO_env_LunarLander-v2_number_3_seed_100.npy
│ ├── PPO_env_LunarLander-v2_number_5_seed_0.npy
│ ├── PPO_env_LunarLander-v2_number_5_seed_10.npy
│ └── PPO_env_LunarLander-v2_number_5_seed_100.npy
├── normalization.py
├── ppo_discrete_rnn.py
├── replaybuffer.py
└── runs
│ └── PPO_discrete
│ ├── env_CartPole-v1_number_3_seed_0
│ └── events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0
│ ├── env_CartPole-v1_number_3_seed_10
│ └── events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1
│ ├── env_CartPole-v1_number_3_seed_100
│ └── events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2
│ ├── env_CartPole-v1_number_5_seed_0
│ └── events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0
│ ├── env_CartPole-v1_number_5_seed_10
│ └── events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1
│ ├── env_CartPole-v1_number_5_seed_100
│ └── events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2
│ ├── env_LunarLander-v2_number_3_seed_0
│ └── events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0
│ ├── env_LunarLander-v2_number_3_seed_10
│ └── events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1
│ ├── env_LunarLander-v2_number_3_seed_100
│ └── events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2
│ ├── env_LunarLander-v2_number_5_seed_0
│ └── events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0
│ ├── env_LunarLander-v2_number_5_seed_10
│ └── events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0
│ └── env_LunarLander-v2_number_5_seed_100
│ └── events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0
├── LICENSE
└── README.md
/1.REINFORCE/README.md:
--------------------------------------------------------------------------------
1 | # REINFORCE
2 | This is a concise PyTorch implementation of REINFORCE.
3 | REINFORCE.py is an implementation of REINFORCE without a baseline.
4 | REINFORCE_baseline.py is an implementation of REINFORCE with a baseline.
5 |
6 | ## How to use my code?
7 | You can directly run REINFORCE.py and REINFORCE_baseline.py in your own IDE.
8 |
9 | ### Training environments
10 | You can set 'env_index' in the code to change the environment.
11 | env_index=0 represents 'CartPole-v0'
12 | env_index=1 represents 'CartPole-v1'
13 |
14 | ### How to see the training results?
15 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
16 | The reward data are saved as NumPy arrays ('.npy' files) in the 'data_train' folder; a minimal loading/plotting sketch is given at the end of this README.
17 | The training curves shown below are smoothed by averaging over a window of 10 steps.
18 | The solid line and the shaded area represent the mean and standard deviation over three random seeds (seed=0, 10, 100), respectively.
19 |
20 | 
21 |
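22 | ### Loading the saved rewards (optional)
23 | The snippet below is a minimal sketch (not part of the training scripts) showing one way to load the saved '.npy' reward curves and plot the mean and standard deviation over the three seeds. It assumes the three curves have the same length and uses the 'number=2' label of the files in 'data_train'.
24 | ```python
25 | import numpy as np
26 | import matplotlib.pyplot as plt
27 |
28 | env_name = 'CartPole-v0'
29 | number = 2  # matches the file names in ./data_train
30 | seeds = [0, 10, 100]
31 |
32 | # Stack one reward curve per seed into a (n_seeds, n_evaluations) array
33 | rewards = np.stack([np.load('./data_train/REINFORCE_env_{}_number_{}_seed_{}.npy'.format(env_name, number, seed)) for seed in seeds])
34 | mean, std = rewards.mean(axis=0), rewards.std(axis=0)
35 |
36 | steps = np.arange(1, mean.shape[0] + 1) * 1e3  # one evaluation roughly every 1e3 training steps
37 | plt.plot(steps, mean)
38 | plt.fill_between(steps, mean - std, mean + std, alpha=0.3)
39 | plt.xlabel('training steps')
40 | plt.ylabel('evaluation reward')
41 | plt.show()
42 | ```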
--------------------------------------------------------------------------------
/1.REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | from torch.utils.tensorboard import SummaryWriter
7 |
8 |
9 | class Policy(nn.Module):
10 | def __init__(self, state_dim, action_dim, hidden_width):
11 | super(Policy, self).__init__()
12 | self.l1 = nn.Linear(state_dim, hidden_width)
13 | self.l2 = nn.Linear(hidden_width, action_dim)
14 |
15 | def forward(self, s):
16 | s = F.relu(self.l1(s))
17 | a_prob = F.softmax(self.l2(s), dim=1)
18 | return a_prob
19 |
20 |
21 | class REINFORCE(object):
22 | def __init__(self, state_dim, action_dim):
23 | self.state_dim = state_dim
24 | self.action_dim = action_dim
25 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network
26 | self.lr = 4e-4 # learning rate
27 | self.GAMMA = 0.99 # discount factor
28 | self.episode_s, self.episode_a, self.episode_r = [], [], []
29 |
30 | self.policy = Policy(state_dim, action_dim, self.hidden_width)
31 | self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr)
32 |
33 | def choose_action(self, s, deterministic):
34 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
35 | prob_weights = self.policy(s).detach().numpy().flatten() # probability distribution(numpy)
36 | if deterministic: # We use the deterministic policy during evaluation
37 | a = np.argmax(prob_weights) # Select the action with the highest probability
38 | return a
39 | else: # We use the stochastic policy during training
40 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution
41 | return a
42 |
43 | def store(self, s, a, r):
44 | self.episode_s.append(s)
45 | self.episode_a.append(a)
46 | self.episode_r.append(r)
47 |
48 | def learn(self, ):
49 | G = []
50 | g = 0
51 | for r in reversed(self.episode_r): # compute the discounted returns G in reverse order
52 | g = self.GAMMA * g + r
53 | G.insert(0, g)
54 |
55 | for t in range(len(self.episode_r)):
56 | s = torch.unsqueeze(torch.tensor(self.episode_s[t], dtype=torch.float), 0)
57 | a = self.episode_a[t]
58 | g = G[t]
59 |
60 | a_prob = self.policy(s).flatten()
61 | policy_loss = -pow(self.GAMMA, t) * g * torch.log(a_prob[a]) # REINFORCE loss: -gamma^t * G_t * log pi(a_t|s_t)
62 | self.policy_optimizer.zero_grad()
63 | policy_loss.backward()
64 | self.policy_optimizer.step()
65 |
66 | # Clean the buffer
67 | self.episode_s, self.episode_a, self.episode_r = [], [], []
68 |
69 |
70 | def evaluate_policy(env, agent):
71 | times = 3 # Perform three evaluations and calculate the average
72 | evaluate_reward = 0
73 | for _ in range(times):
74 | s = env.reset()
75 | done = False
76 | episode_reward = 0
77 | while not done:
78 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during evaluation
79 | s_, r, done, _ = env.step(a)
80 | episode_reward += r
81 | s = s_
82 | evaluate_reward += episode_reward
83 |
84 | return int(evaluate_reward / times)
85 |
86 |
87 | if __name__ == '__main__':
88 | env_name = ['CartPole-v0', 'CartPole-v1']
89 | env_index = 0 # The index of the environments above
90 | env = gym.make(env_name[env_index])
91 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
92 | number = 1
93 | seed = 0
94 | env.seed(seed)
95 | env_evaluate.seed(seed)
96 | np.random.seed(seed)
97 | torch.manual_seed(seed)
98 |
99 | state_dim = env.observation_space.shape[0]
100 | action_dim = env.action_space.n
101 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
102 | print("state_dim={}".format(state_dim))
103 | print("action_dim={}".format(action_dim))
104 | print("max_episode_steps={}".format(max_episode_steps))
105 |
106 | agent = REINFORCE(state_dim, action_dim)
107 | writer = SummaryWriter(log_dir='runs/REINFORCE/REINFORCE_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # build a tensorboard
108 |
109 | max_train_steps = 1e5 # Maximum number of training steps
110 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps
111 | evaluate_num = 0 # Record the number of evaluations
112 | evaluate_rewards = [] # Record the rewards during evaluation
113 | total_steps = 0 # Record the total steps during the training
114 |
115 | while total_steps < max_train_steps:
116 | episode_steps = 0
117 | s = env.reset()
118 | done = False
119 | while not done:
120 | episode_steps += 1
121 | a = agent.choose_action(s, deterministic=False)
122 | s_, r, done, _ = env.step(a)
123 | agent.store(s, a, r)
124 | s = s_
125 |
126 | # Evaluate the policy every 'evaluate_freq' steps
127 | if (total_steps + 1) % evaluate_freq == 0:
128 | evaluate_num += 1
129 | evaluate_reward = evaluate_policy(env_evaluate, agent)
130 | evaluate_rewards.append(evaluate_reward)
131 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward))
132 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
133 | if evaluate_num % 10 == 0:
134 | np.save('./data_train/REINFORCE_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards))
135 |
136 | total_steps += 1
137 |
138 | # An episode is over, then update the policy
139 | agent.learn()
140 |
--------------------------------------------------------------------------------
/1.REINFORCE/REINFORCE_baseline.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | from torch.utils.tensorboard import SummaryWriter
7 |
8 |
9 | class Policy(nn.Module):
10 | def __init__(self, state_dim, action_dim, hidden_width):
11 | super(Policy, self).__init__()
12 | self.l1 = nn.Linear(state_dim, hidden_width)
13 | self.l2 = nn.Linear(hidden_width, action_dim)
14 |
15 | def forward(self, s):
16 | s = F.relu(self.l1(s))
17 | a_prob = F.softmax(self.l2(s), dim=1)
18 | return a_prob
19 |
20 |
21 | class Value(nn.Module):
22 | def __init__(self, state_dim, hidden_width):
23 | super(Value, self).__init__()
24 | self.l1 = nn.Linear(state_dim, hidden_width)
25 | self.l2 = nn.Linear(hidden_width, 1)
26 |
27 | def forward(self, s):
28 | s = F.relu(self.l1(s))
29 | v_s = self.l2(s)
30 | return v_s
31 |
32 |
33 | class REINFORCE(object):
34 | def __init__(self, state_dim, action_dim):
35 | self.state_dim = state_dim
36 | self.action_dim = action_dim
37 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network
38 | self.lr = 4e-4 # learning rate
39 | self.GAMMA = 0.99 # discount factor
40 | self.episode_s, self.episode_a, self.episode_r = [], [], []
41 |
42 | self.policy = Policy(state_dim, action_dim, self.hidden_width)
43 | self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr)
44 |
45 | self.value = Value(state_dim, self.hidden_width)
46 | self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=self.lr)
47 |
48 | def choose_action(self, s, deterministic):
49 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
50 | prob_weights = self.policy(s).detach().numpy().flatten() # probability distribution(numpy)
51 | if deterministic: # We use the deterministic policy during evaluation
52 | a = np.argmax(prob_weights) # Select the action with the highest probability
53 | return a
54 | else: # We use the stochastic policy during training
55 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution
56 | return a
57 |
58 | def store(self, s, a, r):
59 | self.episode_s.append(s)
60 | self.episode_a.append(a)
61 | self.episode_r.append(r)
62 |
63 | def learn(self, ):
64 | G = []
65 | g = 0
66 | for r in reversed(self.episode_r): # compute the discounted returns G in reverse order
67 | g = self.GAMMA * g + r
68 | G.insert(0, g)
69 |
70 | for t in range(len(self.episode_r)):
71 | s = torch.unsqueeze(torch.tensor(self.episode_s[t], dtype=torch.float), 0)
72 | a = self.episode_a[t]
73 | g = G[t]
74 | v_s = self.value(s).flatten()
75 |
76 | # Update policy
77 | a_prob = self.policy(s).flatten()
78 | policy_loss = -pow(self.GAMMA, t) * ((g - v_s).detach()) * torch.log(a_prob[a]) # REINFORCE with baseline: -gamma^t * (G_t - v(s_t)) * log pi(a_t|s_t)
79 | self.policy_optimizer.zero_grad()
80 | policy_loss.backward()
81 | self.policy_optimizer.step()
82 |
83 | # Update value function
84 | value_loss = (g - v_s) ** 2
85 | self.value_optimizer.zero_grad()
86 | value_loss.backward()
87 | self.value_optimizer.step()
88 |
89 | # Clean the buffer
90 | self.episode_s, self.episode_a, self.episode_r = [], [], []
91 |
92 |
93 | def evaluate_policy(env, agent):
94 | times = 3 # Perform three evaluations and calculate the average
95 | evaluate_reward = 0
96 | for _ in range(times):
97 | s = env.reset()
98 | done = False
99 | episode_reward = 0
100 | while not done:
101 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during evaluation
102 | s_, r, done, _ = env.step(a)
103 | episode_reward += r
104 | s = s_
105 | evaluate_reward += episode_reward
106 |
107 | return int(evaluate_reward / times)
108 |
109 |
110 | if __name__ == '__main__':
111 | env_name = ['CartPole-v0', 'CartPole-v1']
112 | env_index = 0 # The index of the environments above
113 | env = gym.make(env_name[env_index])
114 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
115 | number = 1
116 | seed = 0
117 | env.seed(seed)
118 | env_evaluate.seed(seed)
119 | np.random.seed(seed)
120 | torch.manual_seed(seed)
121 |
122 | state_dim = env.observation_space.shape[0]
123 | action_dim = env.action_space.n
124 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
125 | print("state_dim={}".format(state_dim))
126 | print("action_dim={}".format(action_dim))
127 | print("max_episode_steps={}".format(max_episode_steps))
128 |
129 | agent = REINFORCE(state_dim, action_dim)
130 | writer = SummaryWriter(log_dir='runs/REINFORCE/REINFORCE_baseline_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # build a tensorboard
131 |
132 | max_train_steps = 1e5 # Maximum number of training steps
133 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps
134 | evaluate_num = 0 # Record the number of evaluations
135 | evaluate_rewards = [] # Record the rewards during evaluation
136 | total_steps = 0 # Record the total steps during the training
137 |
138 | while total_steps < max_train_steps:
139 | episode_steps = 0
140 | s = env.reset()
141 | done = False
142 | while not done:
143 | episode_steps += 1
144 | a = agent.choose_action(s, deterministic=False)
145 | s_, r, done, _ = env.step(a)
146 | agent.store(s, a, r)
147 | s = s_
148 |
149 | # Evaluate the policy every 'evaluate_freq' steps
150 | if (total_steps + 1) % evaluate_freq == 0:
151 | evaluate_num += 1
152 | evaluate_reward = evaluate_policy(env_evaluate, agent)
153 | evaluate_rewards.append(evaluate_reward)
154 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward))
155 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
156 | if evaluate_num % 10 == 0:
157 | np.save('./data_train/REINFORCE_baseline_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards))
158 |
159 | total_steps += 1
160 |
161 | # An episode is over, then update the policy and the value function
162 | agent.learn()
163 |
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_10.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_100.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_0.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_10.npy
--------------------------------------------------------------------------------
/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_100.npy
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121668.李智.23156.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121668.李智.23156.0
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121786.李智.23156.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121786.李智.23156.1
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121899.李智.23156.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121899.李智.23156.2
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121670.李智.15096.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121670.李智.15096.0
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121797.李智.15096.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121797.李智.15096.1
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121918.李智.15096.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121918.李智.15096.2
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121512.李智.32424.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121512.李智.32424.0
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121583.李智.32424.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121583.李智.32424.1
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121655.李智.32424.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121655.李智.32424.2
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121526.李智.11256.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121526.李智.11256.0
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121607.李智.11256.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121607.李智.11256.1
--------------------------------------------------------------------------------
/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121688.李智.11256.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121688.李智.11256.2
--------------------------------------------------------------------------------
/1.REINFORCE/training results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/training results.png
--------------------------------------------------------------------------------
/2.Actor-Critic/A2C.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | from torch.utils.tensorboard import SummaryWriter
7 |
8 |
9 | # The network of the actor
10 | class Actor(nn.Module):
11 | def __init__(self, state_dim, action_dim, hidden_width):
12 | super(Actor, self).__init__()
13 | self.l1 = nn.Linear(state_dim, hidden_width)
14 | self.l2 = nn.Linear(hidden_width, action_dim)
15 |
16 | def forward(self, s):
17 | s = F.relu(self.l1(s))
18 | a_prob = F.softmax(self.l2(s), dim=1)
19 | return a_prob
20 |
21 |
22 | # The network of the critic
23 | class Critic(nn.Module):
24 | def __init__(self, state_dim, hidden_width):
25 | super(Critic, self).__init__()
26 | self.l1 = nn.Linear(state_dim, hidden_width)
27 | self.l2 = nn.Linear(hidden_width, 1)
28 |
29 | def forward(self, s):
30 | s = F.relu(self.l1(s))
31 | v_s = self.l2(s)
32 | return v_s
33 |
34 |
35 | class A2C(object):
36 | def __init__(self, state_dim, action_dim):
37 | self.state_dim = state_dim
38 | self.action_dim = action_dim
39 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network
40 | self.lr = 5e-4 # learning rate
41 | self.GAMMA = 0.99 # discount factor
42 | self.I = 1
43 |
44 | self.actor = Actor(state_dim, action_dim, self.hidden_width)
45 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
46 |
47 | self.critic = Critic(state_dim, self.hidden_width)
48 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)
49 |
50 | def choose_action(self, s, deterministic):
51 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
52 | prob_weights = self.actor(s).detach().numpy().flatten() # probability distribution(numpy)
53 | if deterministic: # We use the deterministic policy during evaluation
54 | a = np.argmax(prob_weights) # Select the action with the highest probability
55 | return a
56 | else: # We use the stochastic policy during training
57 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution
58 | return a
59 |
60 | def learn(self, s, a, r, s_, dw):
61 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
62 | s_ = torch.unsqueeze(torch.tensor(s_, dtype=torch.float), 0)
63 | v_s = self.critic(s).flatten() # v(s)
64 | v_s_ = self.critic(s_).flatten() # v(s')
65 |
66 | with torch.no_grad(): # td_target has no gradient
67 | td_target = r + self.GAMMA * (1 - dw) * v_s_
68 |
69 | # Update actor
70 | log_pi = torch.log(self.actor(s).flatten()[a]) # log pi(a|s)
71 | actor_loss = -self.I * ((td_target - v_s).detach()) * log_pi # Only calculate the derivative of log_pi
72 | self.actor_optimizer.zero_grad()
73 | actor_loss.backward()
74 | self.actor_optimizer.step()
75 |
76 | # Update critic
77 | critic_loss = (td_target - v_s) ** 2 # Only calculate the derivative of v(s)
78 | self.critic_optimizer.zero_grad()
79 | critic_loss.backward()
80 | self.critic_optimizer.step()
81 |
82 | self.I *= self.GAMMA # Represents the gamma^t factor in the policy gradient theorem
83 |
84 |
85 | def evaluate_policy(env, agent):
86 | times = 3 # Perform three evaluations and calculate the average
87 | evaluate_reward = 0
88 | for _ in range(times):
89 | s = env.reset()
90 | done = False
91 | episode_reward = 0
92 | while not done:
93 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during evaluation
94 | s_, r, done, _ = env.step(a)
95 | episode_reward += r
96 | s = s_
97 | evaluate_reward += episode_reward
98 |
99 | return int(evaluate_reward / times)
100 |
101 |
102 | if __name__ == '__main__':
103 | env_name = ['CartPole-v0', 'CartPole-v1']
104 | env_index = 0
105 | env = gym.make(env_name[env_index])
106 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
107 | number = 9
108 | # Set random seed
109 | seed = 0
110 | env.seed(seed)
111 | env.action_space.seed(seed)
112 | env_evaluate.seed(seed)
113 | env_evaluate.action_space.seed(seed)
114 | np.random.seed(seed)
115 | torch.manual_seed(seed)
116 |
117 | state_dim = env.observation_space.shape[0]
118 | action_dim = env.action_space.n
119 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
120 | print("state_dim={}".format(state_dim))
121 | print("action_dim={}".format(action_dim))
122 | print("max_episode_steps={}".format(max_episode_steps))
123 |
124 | agent = A2C(state_dim, action_dim)
125 | writer = SummaryWriter(log_dir='runs/A2C/A2C_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # Build a tensorboard
126 |
127 | max_train_steps = 3e5 # Maximum number of training steps
128 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps
129 | evaluate_rewards = [] # Record the rewards during evaluation
130 | evaluate_num = 0 # Record the number of evaluations
131 | total_steps = 0 # Record the total steps during the training
132 |
133 | while total_steps < max_train_steps:
134 | episode_steps = 0
135 | s = env.reset()
136 | done = False
137 | agent.I = 1
138 | while not done:
139 | episode_steps += 1
140 | a = agent.choose_action(s, deterministic=False)
141 | s_, r, done, _ = env.step(a)
142 |
143 | # When the agent dies, wins, or reaches max_episode_steps, done will be True, and we need to distinguish these cases;
144 | # dw means dead or win, in which case there is no next state s';
145 | # but when max_episode_steps is reached, there actually is a next state s'.
146 | if done and episode_steps != max_episode_steps:
147 | dw = True
148 | else:
149 | dw = False
150 |
151 | agent.learn(s, a, r, s_, dw)
152 | s = s_
153 |
154 | # Evaluate the policy every 'evaluate_freq' steps
155 | if (total_steps + 1) % evaluate_freq == 0:
156 | evaluate_num += 1
157 | evaluate_reward = evaluate_policy(env_evaluate, agent)
158 | evaluate_rewards.append(evaluate_reward)
159 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward))
160 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
161 | # Save the rewards
162 | if evaluate_num % 10 == 0:
163 | np.save('./data_train/A2C_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards))
164 |
165 | total_steps += 1
166 |
--------------------------------------------------------------------------------
/2.Actor-Critic/A2C_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/A2C_results.png
--------------------------------------------------------------------------------
/2.Actor-Critic/README.md:
--------------------------------------------------------------------------------
1 | # Actor-Critic (A2C)
2 | This is a concise PyTorch implementation of Advantage Actor-Critic (A2C).
3 |
4 | ## How to use my code?
5 | You can directly run A2C.py in your own IDE.
6 |
7 | ### Training environments
8 | You can set 'env_index' in the code to change the environment.
9 | env_index=0 represents 'CartPole-v0'
10 | env_index=1 represents 'CartPole-v1'
11 |
12 | ### How to see the training results?
13 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder; a short sketch for reading these logs directly in Python is given at the end of this README.
14 | The reward data are saved as NumPy arrays ('.npy' files) in the 'data_train' folder.
15 | The training curves shown below are smoothed by averaging over a window of 10 steps.
16 | The solid line and the shaded area represent the mean and standard deviation over three random seeds (seed=0, 10, 100), respectively.
17 | 
18 |
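19 | ### Reading the TensorBoard logs in Python (optional)
20 | Besides launching TensorBoard on the 'runs' folder, the sketch below shows one way to read a run's scalars directly with the event-file reader that ships with TensorBoard. The run directory is just an example; replace it with any run under 'runs/A2C'.
21 | ```python
22 | from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
23 |
24 | run_dir = 'runs/A2C/A2C_env_CartPole-v0_number_9_seed_0'  # example run directory
25 | acc = EventAccumulator(run_dir)
26 | acc.Reload()  # parse the events file
27 |
28 | tag = 'step_rewards_CartPole-v0'  # the tag written by writer.add_scalar in A2C.py
29 | events = acc.Scalars(tag)
30 | steps = [e.step for e in events]
31 | rewards = [e.value for e in events]
32 | print(steps[:5], rewards[:5])
33 | ```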
--------------------------------------------------------------------------------
/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_0.npy
--------------------------------------------------------------------------------
/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_10.npy
--------------------------------------------------------------------------------
/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_100.npy
--------------------------------------------------------------------------------
/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_0.npy
--------------------------------------------------------------------------------
/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_10.npy
--------------------------------------------------------------------------------
/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_100.npy
--------------------------------------------------------------------------------
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_0/events.out.tfevents.1648553119.李智.62564.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_0/events.out.tfevents.1648553119.李智.62564.0
--------------------------------------------------------------------------------
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_10/events.out.tfevents.1648553543.李智.62564.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_10/events.out.tfevents.1648553543.李智.62564.1
--------------------------------------------------------------------------------
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_100/events.out.tfevents.1648554019.李智.62564.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_100/events.out.tfevents.1648554019.李智.62564.2
--------------------------------------------------------------------------------
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_0/events.out.tfevents.1648553122.李智.63460.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_0/events.out.tfevents.1648553122.李智.63460.0
--------------------------------------------------------------------------------
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_10/events.out.tfevents.1648553561.李智.63460.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_10/events.out.tfevents.1648553561.李智.63460.1
--------------------------------------------------------------------------------
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_100/events.out.tfevents.1648554055.李智.63460.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_100/events.out.tfevents.1648554055.李智.63460.2
--------------------------------------------------------------------------------
/3.Rainbow_DQN/README.md:
--------------------------------------------------------------------------------
1 | # Rainbow DQN
2 | This is a concise PyTorch implementation of Rainbow DQN, including Double Q-learning, Dueling networks, Noisy networks, Prioritized Experience Replay (PER), and n-step Q-learning (a short illustration of the n-step target is given at the end of this README).
3 |
4 | ## How to use my code?
5 | You can directly run Rainbow_DQN_main.py in your own IDE.
6 |
7 | ### Training environments
8 | You can set 'env_index' in the code to change the environment.
9 | env_index=0 represents 'CartPole-v1'
10 | env_index=1 represents 'LunarLander-v2'
11 |
12 | ### How to see the training results?
13 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
14 | The reward data are saved as NumPy arrays ('.npy' files) in the 'data_train' folder.
15 | The training curves are shown below.
16 | The right-hand plot is smoothed by averaging over a window of 10 steps. The solid line and the shaded area represent the mean and standard deviation over three random seeds (seed=0, 10, 100), respectively.
17 | 
18 |
19 | ## Reference
20 | [1] Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep reinforcement learning[J]. nature, 2015, 518(7540): 529-533.
21 | [2] Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double q-learning[C]//Proceedings of the AAAI conference on artificial intelligence. 2016, 30(1).
22 | [3] Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep reinforcement learning[C]//International conference on machine learning. PMLR, 2016: 1995-2003.
23 | [4] Fortunato M, Azar M G, Piot B, et al. Noisy networks for exploration[J]. arXiv preprint arXiv:1706.10295, 2017.
24 | [5] Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv preprint arXiv:1511.05952, 2015.
25 | [6] Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining improvements in deep reinforcement learning[C]//Thirty-second AAAI conference on artificial intelligence. 2018.
26 |
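27 | ## A note on the n-step target (illustration)
28 | For readers unfamiliar with the 'N_steps' component, the snippet below is a standalone illustration with made-up numbers (not code from replay_buffer.py) of the n-step target that n-step Q-learning bootstraps on.
29 | ```python
30 | def n_step_target(rewards, q_next, gamma=0.99, n=5):
31 |     # G_t^(n) = sum_{k=0}^{n-1} gamma^k * r_{t+k} + gamma^n * max_a Q(s_{t+n}, a)
32 |     g = sum((gamma ** k) * r for k, r in enumerate(rewards[:n]))
33 |     return g + (gamma ** n) * q_next
34 |
35 | # Five consecutive rewards and a bootstrap value q_next = max_a Q(s_{t+5}, a)
36 | print(n_step_target([1.0, 0.0, -1.0, 0.5, 1.0], q_next=10.0))
37 | ```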
--------------------------------------------------------------------------------
/3.Rainbow_DQN/Rainbow_DQN_main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import gym
4 | from torch.utils.tensorboard import SummaryWriter
5 | from replay_buffer import *
6 | from rainbow_dqn import DQN
7 | import argparse
8 |
9 |
10 | class Runner:
11 | def __init__(self, args, env_name, number, seed):
12 | self.args = args
13 | self.env_name = env_name
14 | self.number = number
15 | self.seed = seed
16 |
17 | self.env = gym.make(env_name)
18 | self.env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment
19 | self.env.seed(seed)
20 | self.env.action_space.seed(seed)
21 | self.env_evaluate.seed(seed)
22 | self.env_evaluate.action_space.seed(seed)
23 | np.random.seed(seed)
24 | torch.manual_seed(seed)
25 |
26 | self.args.state_dim = self.env.observation_space.shape[0]
27 | self.args.action_dim = self.env.action_space.n
28 | self.args.episode_limit = self.env._max_episode_steps # Maximum number of steps per episode
29 | print("env={}".format(self.env_name))
30 | print("state_dim={}".format(self.args.state_dim))
31 | print("action_dim={}".format(self.args.action_dim))
32 | print("episode_limit={}".format(self.args.episode_limit))
33 |
34 | if args.use_per and args.use_n_steps:
35 | self.replay_buffer = N_Steps_Prioritized_ReplayBuffer(args)
36 | elif args.use_per:
37 | self.replay_buffer = Prioritized_ReplayBuffer(args)
38 | elif args.use_n_steps:
39 | self.replay_buffer = N_Steps_ReplayBuffer(args)
40 | else:
41 | self.replay_buffer = ReplayBuffer(args)
42 | self.agent = DQN(args)
43 |
44 | self.algorithm = 'DQN'
45 | if args.use_double and args.use_dueling and args.use_noisy and args.use_per and args.use_n_steps:
46 | self.algorithm = 'Rainbow_' + self.algorithm
47 | else:
48 | if args.use_double:
49 | self.algorithm += '_Double'
50 | if args.use_dueling:
51 | self.algorithm += '_Dueling'
52 | if args.use_noisy:
53 | self.algorithm += '_Noisy'
54 | if args.use_per:
55 | self.algorithm += '_PER'
56 | if args.use_n_steps:
57 | self.algorithm += "_N_steps"
58 |
59 | self.writer = SummaryWriter(log_dir='runs/DQN/{}_env_{}_number_{}_seed_{}'.format(self.algorithm, env_name, number, seed))
60 |
61 | self.evaluate_num = 0 # Record the number of evaluations
62 | self.evaluate_rewards = [] # Record the rewards during the evaluating
63 | self.total_steps = 0 # Record the total steps during the training
64 | if args.use_noisy: # If Noisy Net is used, epsilon-greedy exploration is not needed
65 | self.epsilon = 0
66 | else:
67 | self.epsilon = self.args.epsilon_init
68 | self.epsilon_min = self.args.epsilon_min
69 | self.epsilon_decay = (self.args.epsilon_init - self.args.epsilon_min) / self.args.epsilon_decay_steps
70 |
71 | def run(self, ):
72 | self.evaluate_policy()
73 | while self.total_steps < self.args.max_train_steps:
74 | state = self.env.reset()
75 | done = False
76 | episode_steps = 0
77 | while not done:
78 | action = self.agent.choose_action(state, epsilon=self.epsilon)
79 | next_state, reward, done, _ = self.env.step(action)
80 | episode_steps += 1
81 | self.total_steps += 1
82 |
83 | if not self.args.use_noisy: # Decay epsilon
84 | self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon - self.epsilon_decay > self.epsilon_min else self.epsilon_min
85 |
86 | # When the agent dies, wins, or reaches max_episode_steps, done will be True, and we need to distinguish these cases;
87 | # terminal means dead or win, in which case there is no next state s';
88 | # but when max_episode_steps is reached, there actually is a next state s'.
89 | if done and episode_steps != self.args.episode_limit:
90 | if self.env_name == 'LunarLander-v2':
91 | if reward <= -100: reward = -1 # replace the large crash penalty with -1, which empirically helps on LunarLander
92 | terminal = True
93 | else:
94 | terminal = False
95 |
96 | self.replay_buffer.store_transition(state, action, reward, next_state, terminal, done) # Store the transition
97 | state = next_state
98 |
99 | if self.replay_buffer.current_size >= self.args.batch_size:
100 | self.agent.learn(self.replay_buffer, self.total_steps)
101 |
102 | if self.total_steps % self.args.evaluate_freq == 0:
103 | self.evaluate_policy()
104 | # Save reward
105 | np.save('./data_train/{}_env_{}_number_{}_seed_{}.npy'.format(self.algorithm, self.env_name, self.number, self.seed), np.array(self.evaluate_rewards))
106 |
107 | def evaluate_policy(self, ):
108 | evaluate_reward = 0
109 | self.agent.net.eval()
110 | for _ in range(self.args.evaluate_times):
111 | state = self.env_evaluate.reset()
112 | done = False
113 | episode_reward = 0
114 | while not done:
115 | action = self.agent.choose_action(state, epsilon=0)
116 | next_state, reward, done, _ = self.env_evaluate.step(action)
117 | episode_reward += reward
118 | state = next_state
119 | evaluate_reward += episode_reward
120 | self.agent.net.train()
121 | evaluate_reward /= self.args.evaluate_times
122 | self.evaluate_rewards.append(evaluate_reward)
123 | print("total_steps:{} \t evaluate_reward:{} \t epsilon:{}".format(self.total_steps, evaluate_reward, self.epsilon))
124 | self.writer.add_scalar('step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps)
125 |
126 |
127 | if __name__ == '__main__':
128 | parser = argparse.ArgumentParser("Hyperparameter Setting for DQN")
129 | parser.add_argument("--max_train_steps", type=int, default=int(4e5), help=" Maximum number of training steps")
130 | parser.add_argument("--evaluate_freq", type=float, default=1e3, help="Evaluate the policy every 'evaluate_freq' steps")
131 | parser.add_argument("--evaluate_times", type=float, default=3, help="Evaluate times")
132 |
133 | parser.add_argument("--buffer_capacity", type=int, default=int(1e5), help="The maximum replay-buffer capacity ")
134 | parser.add_argument("--batch_size", type=int, default=256, help="batch size")
135 | parser.add_argument("--hidden_dim", type=int, default=256, help="The number of neurons in hidden layers of the neural network")
136 | parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
137 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
138 | parser.add_argument("--epsilon_init", type=float, default=0.5, help="Initial epsilon")
139 | parser.add_argument("--epsilon_min", type=float, default=0.1, help="Minimum epsilon")
140 | parser.add_argument("--epsilon_decay_steps", type=int, default=int(1e5), help="How many steps before the epsilon decays to the minimum")
141 | parser.add_argument("--tau", type=float, default=0.005, help="soft update the target network")
142 | parser.add_argument("--use_soft_update", type=bool, default=True, help="Whether to use soft update")
143 | parser.add_argument("--target_update_freq", type=int, default=200, help="Update frequency of the target network(hard update)")
144 | parser.add_argument("--n_steps", type=int, default=5, help="n_steps")
145 | parser.add_argument("--alpha", type=float, default=0.6, help="PER parameter")
146 | parser.add_argument("--beta_init", type=float, default=0.4, help="Important sampling parameter in PER")
147 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Learning rate Decay")
148 | parser.add_argument("--grad_clip", type=float, default=10.0, help="Gradient clip")
149 |
150 | parser.add_argument("--use_double", type=bool, default=True, help="Whether to use double Q-learning")
151 | parser.add_argument("--use_dueling", type=bool, default=True, help="Whether to use dueling network")
152 | parser.add_argument("--use_noisy", type=bool, default=True, help="Whether to use noisy network")
153 | parser.add_argument("--use_per", type=bool, default=True, help="Whether to use PER")
154 | parser.add_argument("--use_n_steps", type=bool, default=True, help="Whether to use n_steps Q-learning")
155 |
156 | args = parser.parse_args()
157 |
158 | env_names = ['CartPole-v1', 'LunarLander-v2']
159 | env_index = 1
160 | for seed in [0, 10, 100]:
161 | runner = Runner(args=args, env_name=env_names[env_index], number=1, seed=seed)
162 | runner.run()
163 |
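As a quick, standalone sanity check of the linear epsilon schedule used in Rainbow_DQN_main.py above, the following minimal sketch reproduces the per-step decay with the argparse defaults from this file (epsilon_init=0.5, epsilon_min=0.1, epsilon_decay_steps=1e5); the helper name is made up for illustration.

```python
def epsilon_at(step, epsilon_init=0.5, epsilon_min=0.1, epsilon_decay_steps=int(1e5)):
    # Linear decay mirroring Runner.run(): subtract a fixed amount per environment step,
    # floored at epsilon_min.
    decay = (epsilon_init - epsilon_min) / epsilon_decay_steps
    return max(epsilon_init - decay * step, epsilon_min)

print(epsilon_at(0))        # 0.5
print(epsilon_at(50_000))   # ~0.3, halfway through the decay
print(epsilon_at(200_000))  # 0.1, floored at epsilon_min
```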
--------------------------------------------------------------------------------
/3.Rainbow_DQN/__pycache__/network.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/network.cpython-37.pyc
--------------------------------------------------------------------------------
/3.Rainbow_DQN/__pycache__/rainbow_dqn.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/rainbow_dqn.cpython-37.pyc
--------------------------------------------------------------------------------
/3.Rainbow_DQN/__pycache__/replay_buffer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/replay_buffer.cpython-37.pyc
--------------------------------------------------------------------------------
/3.Rainbow_DQN/__pycache__/sum_tree.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/sum_tree.cpython-37.pyc
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/3.Rainbow_DQN/drawing_Rainbow_DQN.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import seaborn as sns
4 |
5 |
6 | def smooth(reward):
7 | smooth_reward = []
8 | for i in range(reward.shape[0]):
9 | if i == 0:
10 | smooth_reward.append(reward[i])
11 | else:
12 | smooth_reward.append(smooth_reward[-1] * 0.9 + reward[i] * 0.1)
13 | return np.array(smooth_reward)
14 |
15 |
16 | env_name = ['CartPole-v1', 'LunarLander-v2']
17 | colors = ['r', 'darkorange', 'dodgerblue', 'limegreen', 'yellow', 'magenta', 'chocolate', 'indigo', 'gray', 'aqua', 'g', 'black']
18 |
19 |
20 | def get_data(algorithm, env_index, number):
21 | reward1 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_0.npy'.format(algorithm, env_name[env_index], number)))
22 | reward2 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_10.npy'.format(algorithm, env_name[env_index], number)))
23 | reward3 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_100.npy'.format(algorithm, env_name[env_index], number)))
24 | reward = np.stack((reward1, reward2, reward3), axis=0)
25 | num_points = reward1.shape[0]
26 |
27 | return reward, num_points
28 |
29 |
30 | def drawing_CP(plt, algorithm, number, color, label):
31 | reward, num_points = get_data(algorithm=algorithm, env_index=0, number=number)
32 | sns.tsplot(time=np.arange(num_points), data=reward, color=color, linestyle='-')  # color=darkorange dodgerblue
33 | plt.plot(reward.mean(0), color=color, label=label)
34 | plt.title("CartPole-v1", size=14)
35 | plt.xlabel("Steps", size=14)
36 | plt.ylabel("Reward", size=14)
37 | plt.xticks([0, 50, 100, 150], ['0', '50k', '100k', '150k'], size=14)
38 | plt.yticks(size=14)
39 | plt.ylim([0, 510])
40 | plt.legend(loc='lower right', fontsize=14)
41 |
42 |
43 | def drawing_LL(plt, algorithm, number, color, label):
44 | reward, num_points = get_data(algorithm=algorithm, env_index=1, number=number)
45 | sns.tsplot(time=np.arange(num_points), data=reward, color=color, linestyle='-')  # color=darkorange dodgerblue
46 | plt.plot(reward.mean(0), color=color, label=label)
47 | plt.title("LunarLander-v2", size=14)
48 | plt.xlabel("Steps", size=14)
49 | plt.ylabel("Reward", size=14)
50 | plt.xticks([0, 100, 200, 300, 400], ['0', '100k', '200k', '300k', '400k'], size=14)
51 | plt.yticks(size=14)
52 | plt.ylim([-300, 300])
53 | plt.legend(loc='lower right', fontsize=14)
54 |
55 |
56 | sns.set_style('darkgrid')
57 | plt.figure()
58 | drawing_LL(plt, algorithm='Rainbow_DQN', number=1, color=colors[0], label='Rainbow_DQN')
59 |
60 | drawing_LL(plt, algorithm='DQN_Dueling_Noisy_PER_N_steps', number=1, color=colors[1], label='Rainbow_DQN without Double')
61 |
62 | drawing_LL(plt, algorithm='DQN_Double_Noisy_PER_N_steps', number=1, color=colors[2], label='Rainbow_DQN without Dueling')
63 |
64 | drawing_LL(plt, algorithm='DQN_Double_Dueling_Noisy_N_steps', number=1, color=colors[3], label='Rainbow_DQN without PER')
65 |
66 | drawing_LL(plt, algorithm='DQN_Double_Dueling_Noisy_PER', number=1, color=colors[4], label='Rainbow_DQN without N-steps')
67 |
68 | drawing_LL(plt, algorithm='DQN_Double_Dueling_PER_N_steps', number=1, color=colors[9], label='Rainbow_DQN without Noisy')
69 |
70 |
71 | plt.show()
72 |
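Note that sns.tsplot has been removed in recent seaborn releases, so the script above may fail on a newer install. A minimal replacement sketch for the same (n_seeds, n_points) reward arrays, using only matplotlib (the function name is hypothetical):

```python
import numpy as np
import matplotlib.pyplot as plt

def plot_seed_band(reward, color, label):
    # reward: array of shape (n_seeds, n_points), as returned by get_data() above
    steps = np.arange(reward.shape[1])
    mean = reward.mean(axis=0)
    std = reward.std(axis=0)
    plt.plot(steps, mean, color=color, label=label)              # mean over seeds
    plt.fill_between(steps, mean - std, mean + std, color=color, alpha=0.2)  # +/- 1 std band
```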
--------------------------------------------------------------------------------
/3.Rainbow_DQN/network.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import math
5 |
6 |
7 | class Dueling_Net(nn.Module):
8 | def __init__(self, args):
9 | super(Dueling_Net, self).__init__()
10 | self.fc1 = nn.Linear(args.state_dim, args.hidden_dim)
11 | self.fc2 = nn.Linear(args.hidden_dim, args.hidden_dim)
12 | if args.use_noisy:
13 | self.V = NoisyLinear(args.hidden_dim, 1)
14 | self.A = NoisyLinear(args.hidden_dim, args.action_dim)
15 | else:
16 | self.V = nn.Linear(args.hidden_dim, 1)
17 | self.A = nn.Linear(args.hidden_dim, args.action_dim)
18 |
19 | def forward(self, s):
20 | s = torch.relu(self.fc1(s))
21 | s = torch.relu(self.fc2(s))
22 | V = self.V(s) # batch_size X 1
23 | A = self.A(s) # batch_size X action_dim
24 | Q = V + (A - torch.mean(A, dim=-1, keepdim=True)) # Q(s,a)=V(s)+A(s,a)-mean(A(s,a))
25 | return Q
26 |
27 |
28 | class Net(nn.Module):
29 | def __init__(self, args):
30 | super(Net, self).__init__()
31 | self.fc1 = nn.Linear(args.state_dim, args.hidden_dim)
32 | self.fc2 = nn.Linear(args.hidden_dim, args.hidden_dim)
33 | if args.use_noisy:
34 | self.fc3 = NoisyLinear(args.hidden_dim, args.action_dim)
35 | else:
36 | self.fc3 = nn.Linear(args.hidden_dim, args.action_dim)
37 |
38 | def forward(self, s):
39 | s = torch.relu(self.fc1(s))
40 | s = torch.relu(self.fc2(s))
41 | Q = self.fc3(s)
42 | return Q
43 |
44 |
45 | class NoisyLinear(nn.Module):
46 | def __init__(self, in_features, out_features, sigma_init=0.5):
47 | super(NoisyLinear, self).__init__()
48 | self.in_features = in_features
49 | self.out_features = out_features
50 | self.sigma_init = sigma_init
51 |
52 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features))
53 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features))
54 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features))
55 |
56 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
57 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
58 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))
59 |
60 | self.reset_parameters()
61 | self.reset_noise()
62 |
63 | def forward(self, x):
64 | if self.training:
65 | self.reset_noise()
66 | weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon)  # .mul() is element-wise multiplication
67 | bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon)
68 |
69 | else:
70 | weight = self.weight_mu
71 | bias = self.bias_mu
72 |
73 | return F.linear(x, weight, bias)
74 |
75 | def reset_parameters(self):
76 | mu_range = 1 / math.sqrt(self.in_features)
77 | self.weight_mu.data.uniform_(-mu_range, mu_range)
78 | self.bias_mu.data.uniform_(-mu_range, mu_range)
79 |
80 | self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features))
81 | self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.out_features))  # note: divide by out_features here
82 |
83 | def reset_noise(self):
84 | epsilon_i = self.scale_noise(self.in_features)
85 | epsilon_j = self.scale_noise(self.out_features)
86 | self.weight_epsilon.copy_(torch.ger(epsilon_j, epsilon_i))
87 | self.bias_epsilon.copy_(epsilon_j)
88 |
89 | def scale_noise(self, size):
90 | x = torch.randn(size)  # torch.randn samples from a standard Gaussian distribution
91 | x = x.sign().mul(x.abs().sqrt())
92 | return x
93 |
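A small usage sketch of the NoisyLinear layer defined above: in training mode the factorized Gaussian noise is resampled on every forward pass, while in eval mode only the mu parameters are used. The tensor shapes below are arbitrary.

```python
import torch
from network import NoisyLinear  # the class defined above

layer = NoisyLinear(in_features=4, out_features=2)
x = torch.randn(8, 4)

layer.train()
y1, y2 = layer(x), layer(x)    # noise is resampled on each call
print(torch.allclose(y1, y2))  # False (with overwhelming probability)

layer.eval()
y3, y4 = layer(x), layer(x)    # deterministic: only weight_mu / bias_mu are used
print(torch.allclose(y3, y4))  # True
```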
--------------------------------------------------------------------------------
/3.Rainbow_DQN/rainbow_dqn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import copy
4 | from network import Dueling_Net, Net
5 |
6 |
7 | class DQN(object):
8 | def __init__(self, args):
9 | self.action_dim = args.action_dim
10 | self.batch_size = args.batch_size # batch size
11 | self.max_train_steps = args.max_train_steps
12 | self.lr = args.lr # learning rate
13 | self.gamma = args.gamma # discount factor
14 | self.tau = args.tau # Soft update
15 | self.use_soft_update = args.use_soft_update
16 | self.target_update_freq = args.target_update_freq # hard update
17 | self.update_count = 0
18 |
19 | self.grad_clip = args.grad_clip
20 | self.use_lr_decay = args.use_lr_decay
21 | self.use_double = args.use_double
22 | self.use_dueling = args.use_dueling
23 | self.use_per = args.use_per
24 | self.use_n_steps = args.use_n_steps
25 | if self.use_n_steps:
26 | self.gamma = self.gamma ** args.n_steps
27 |
28 | if self.use_dueling: # Whether to use the 'dueling network'
29 | self.net = Dueling_Net(args)
30 | else:
31 | self.net = Net(args)
32 |
33 | self.target_net = copy.deepcopy(self.net) # Copy the online_net to the target_net
34 |
35 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr)
36 |
37 | def choose_action(self, state, epsilon):
38 | with torch.no_grad():
39 | state = torch.unsqueeze(torch.tensor(state, dtype=torch.float), 0)
40 | q = self.net(state)
41 | if np.random.uniform() > epsilon:
42 | action = q.argmax(dim=-1).item()
43 | else:
44 | action = np.random.randint(0, self.action_dim)
45 | return action
46 |
47 | def learn(self, replay_buffer, total_steps):
48 | batch, batch_index, IS_weight = replay_buffer.sample(total_steps)
49 |
50 | with torch.no_grad(): # q_target has no gradient
51 | if self.use_double: # Whether to use the 'double q-learning'
52 | # Use online_net to select the action
53 | a_argmax = self.net(batch['next_state']).argmax(dim=-1, keepdim=True) # shape:(batch_size,1)
54 | # Use target_net to estimate the q_target
55 | q_target = batch['reward'] + self.gamma * (1 - batch['terminal']) * self.target_net(batch['next_state']).gather(-1, a_argmax).squeeze(-1) # shape:(batch_size,)
56 | else:
57 | q_target = batch['reward'] + self.gamma * (1 - batch['terminal']) * self.target_net(batch['next_state']).max(dim=-1)[0] # shape:(batch_size,)
58 |
59 | q_current = self.net(batch['state']).gather(-1, batch['action']).squeeze(-1) # shape:(batch_size,)
60 | td_errors = q_current - q_target # shape:(batch_size,)
61 |
62 | if self.use_per:
63 | loss = (IS_weight * (td_errors ** 2)).mean()
64 | replay_buffer.update_batch_priorities(batch_index, td_errors.detach().numpy())
65 | else:
66 | loss = (td_errors ** 2).mean()
67 |
68 | self.optimizer.zero_grad()
69 | loss.backward()
70 | if self.grad_clip:
71 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_clip)
72 | self.optimizer.step()
73 |
74 | if self.use_soft_update: # soft update
75 | for param, target_param in zip(self.net.parameters(), self.target_net.parameters()):
76 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
77 | else: # hard update
78 | self.update_count += 1
79 | if self.update_count % self.target_update_freq == 0:
80 | self.target_net.load_state_dict(self.net.state_dict())
81 |
82 | if self.use_lr_decay: # learning rate Decay
83 | self.lr_decay(total_steps)
84 |
85 | def lr_decay(self, total_steps):
86 | lr_now = 0.9 * self.lr * (1 - total_steps / self.max_train_steps) + 0.1 * self.lr
87 | for p in self.optimizer.param_groups:
88 | p['lr'] = lr_now
89 |
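For reference, the double-Q target computed in learn() above (the online net selects the action, the target net evaluates it) can be checked in isolation with toy tensors; the Q-values, rewards and gamma below are made-up numbers, not taken from the repo.

```python
import torch

q_online = torch.tensor([[1.0, 3.0], [2.0, 0.5]])      # stand-in for self.net(next_state)
q_target_net = torch.tensor([[0.9, 2.5], [1.8, 0.4]])  # stand-in for self.target_net(next_state)
reward = torch.tensor([1.0, 0.0])
terminal = torch.tensor([0.0, 1.0])
gamma = 0.99

a_argmax = q_online.argmax(dim=-1, keepdim=True)  # actions chosen by the online net, shape (2, 1)
q_target = reward + gamma * (1 - terminal) * q_target_net.gather(-1, a_argmax).squeeze(-1)
print(q_target)  # tensor([3.4750, 0.0000]): 1 + 0.99*2.5 for the first row, 0 for the terminal row
```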
--------------------------------------------------------------------------------
/3.Rainbow_DQN/rainbow_dqn_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/rainbow_dqn_result.png
--------------------------------------------------------------------------------
/3.Rainbow_DQN/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from collections import deque
4 | from sum_tree import SumTree
5 |
6 |
7 | class ReplayBuffer(object):
8 | def __init__(self, args):
9 | self.batch_size = args.batch_size
10 | self.buffer_capacity = args.buffer_capacity
11 | self.current_size = 0
12 | self.count = 0
13 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)),
14 | 'action': np.zeros((self.buffer_capacity, 1)),
15 | 'reward': np.zeros(self.buffer_capacity),
16 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)),
17 | 'terminal': np.zeros(self.buffer_capacity),
18 | }
19 |
20 | def store_transition(self, state, action, reward, next_state, terminal, done):
21 | self.buffer['state'][self.count] = state
22 | self.buffer['action'][self.count] = action
23 | self.buffer['reward'][self.count] = reward
24 | self.buffer['next_state'][self.count] = next_state
25 | self.buffer['terminal'][self.count] = terminal
26 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0.
27 | self.current_size = min(self.current_size + 1, self.buffer_capacity)
28 |
29 | def sample(self, total_steps):
30 | index = np.random.randint(0, self.current_size, size=self.batch_size)
31 | batch = {}
32 | for key in self.buffer.keys(): # numpy->tensor
33 | if key == 'action':
34 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.long)
35 | else:
36 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.float32)
37 |
38 | return batch, None, None
39 |
40 |
41 | class N_Steps_ReplayBuffer(object):
42 | def __init__(self, args):
43 | self.gamma = args.gamma
44 | self.batch_size = args.batch_size
45 | self.buffer_capacity = args.buffer_capacity
46 | self.current_size = 0
47 | self.count = 0
48 | self.n_steps = args.n_steps
49 | self.n_steps_deque = deque(maxlen=self.n_steps)
50 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)),
51 | 'action': np.zeros((self.buffer_capacity, 1)),
52 | 'reward': np.zeros(self.buffer_capacity),
53 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)),
54 | 'terminal': np.zeros(self.buffer_capacity),
55 | }
56 |
57 | def store_transition(self, state, action, reward, next_state, terminal, done):
58 | transition = (state, action, reward, next_state, terminal, done)
59 | self.n_steps_deque.append(transition)
60 | if len(self.n_steps_deque) == self.n_steps:
61 | state, action, n_steps_reward, next_state, terminal = self.get_n_steps_transition()
62 | self.buffer['state'][self.count] = state
63 | self.buffer['action'][self.count] = action
64 | self.buffer['reward'][self.count] = n_steps_reward
65 | self.buffer['next_state'][self.count] = next_state
66 | self.buffer['terminal'][self.count] = terminal
67 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0.
68 | self.current_size = min(self.current_size + 1, self.buffer_capacity)
69 |
70 | def get_n_steps_transition(self):
71 | state, action = self.n_steps_deque[0][:2]
72 | next_state, terminal = self.n_steps_deque[-1][3:5]
73 | n_steps_reward = 0
74 | for i in reversed(range(self.n_steps)):
75 | r, s_, ter, d = self.n_steps_deque[i][2:]
76 | n_steps_reward = r + self.gamma * (1 - d) * n_steps_reward
77 | if d:
78 | next_state, terminal = s_, ter
79 |
80 | return state, action, n_steps_reward, next_state, terminal
81 |
82 | def sample(self, total_steps):
83 | index = np.random.randint(0, self.current_size, size=self.batch_size)
84 | batch = {}
85 | for key in self.buffer.keys(): # numpy->tensor
86 | if key == 'action':
87 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.long)
88 | else:
89 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.float32)
90 |
91 | return batch, None, None
92 |
93 |
94 | class Prioritized_ReplayBuffer(object):
95 | def __init__(self, args):
96 | self.max_train_steps = args.max_train_steps
97 | self.alpha = args.alpha
98 | self.beta_init = args.beta_init
99 | self.beta = args.beta_init
100 | self.batch_size = args.batch_size
101 | self.buffer_capacity = args.buffer_capacity
102 | self.sum_tree = SumTree(self.buffer_capacity)
103 | self.current_size = 0
104 | self.count = 0
105 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)),
106 | 'action': np.zeros((self.buffer_capacity, 1)),
107 | 'reward': np.zeros(self.buffer_capacity),
108 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)),
109 | 'terminal': np.zeros(self.buffer_capacity),
110 | }
111 |
112 | def store_transition(self, state, action, reward, next_state, terminal, done):
113 | self.buffer['state'][self.count] = state
114 | self.buffer['action'][self.count] = action
115 | self.buffer['reward'][self.count] = reward
116 | self.buffer['next_state'][self.count] = next_state
117 | self.buffer['terminal'][self.count] = terminal
118 | # For the first transition, initialize the priority to 1.0; otherwise, assign a new transition the current maximum priority
119 | priority = 1.0 if self.current_size == 0 else self.sum_tree.priority_max
120 | self.sum_tree.update(data_index=self.count, priority=priority)  # Update this transition's priority in the sum_tree
121 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0.
122 | self.current_size = min(self.current_size + 1, self.buffer_capacity)
123 |
124 | def sample(self, total_steps):
125 | batch_index, IS_weight = self.sum_tree.get_batch_index(current_size=self.current_size, batch_size=self.batch_size, beta=self.beta)
126 | self.beta = self.beta_init + (1 - self.beta_init) * (total_steps / self.max_train_steps) # beta:beta_init->1.0
127 | batch = {}
128 | for key in self.buffer.keys(): # numpy->tensor
129 | if key == 'action':
130 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.long)
131 | else:
132 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.float32)
133 |
134 | return batch, batch_index, IS_weight
135 |
136 | def update_batch_priorities(self, batch_index, td_errors):  # Update the priorities of the transitions at batch_index according to the given td_errors
137 | priorities = (np.abs(td_errors) + 0.01) ** self.alpha
138 | for index, priority in zip(batch_index, priorities):
139 | self.sum_tree.update(data_index=index, priority=priority)
140 |
141 |
142 | class N_Steps_Prioritized_ReplayBuffer(object):
143 | def __init__(self, args):
144 | self.max_train_steps = args.max_train_steps
145 | self.alpha = args.alpha
146 | self.beta_init = args.beta_init
147 | self.beta = args.beta_init
148 | self.gamma = args.gamma
149 | self.batch_size = args.batch_size
150 | self.buffer_capacity = args.buffer_capacity
151 | self.sum_tree = SumTree(self.buffer_capacity)
152 | self.n_steps = args.n_steps
153 | self.n_steps_deque = deque(maxlen=self.n_steps)
154 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)),
155 | 'action': np.zeros((self.buffer_capacity, 1)),
156 | 'reward': np.zeros(self.buffer_capacity),
157 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)),
158 | 'terminal': np.zeros(self.buffer_capacity),
159 | }
160 | self.current_size = 0
161 | self.count = 0
162 |
163 | def store_transition(self, state, action, reward, next_state, terminal, done):
164 | transition = (state, action, reward, next_state, terminal, done)
165 | self.n_steps_deque.append(transition)
166 | if len(self.n_steps_deque) == self.n_steps:
167 | state, action, n_steps_reward, next_state, terminal = self.get_n_steps_transition()
168 | self.buffer['state'][self.count] = state
169 | self.buffer['action'][self.count] = action
170 | self.buffer['reward'][self.count] = n_steps_reward
171 | self.buffer['next_state'][self.count] = next_state
172 | self.buffer['terminal'][self.count] = terminal
173 | # For the first transition in the buffer, set the priority to 1.0; otherwise, assign a new transition the current maximum priority
174 | priority = 1.0 if self.current_size == 0 else self.sum_tree.priority_max
175 | self.sum_tree.update(data_index=self.count, priority=priority)  # Update this transition's priority in the sum_tree
176 | self.count = (self.count + 1) % self.buffer_capacity # When 'count' reaches buffer_capacity, it will be reset to 0.
177 | self.current_size = min(self.current_size + 1, self.buffer_capacity)
178 |
179 | def sample(self, total_steps):
180 | batch_index, IS_weight = self.sum_tree.get_batch_index(current_size=self.current_size, batch_size=self.batch_size, beta=self.beta)
181 | self.beta = self.beta_init + (1 - self.beta_init) * (total_steps / self.max_train_steps) # beta:beta_init->1.0
182 | batch = {}
183 | for key in self.buffer.keys(): # numpy->tensor
184 | if key == 'action':
185 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.long)
186 | else:
187 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.float32)
188 |
189 | return batch, batch_index, IS_weight
190 |
191 | def get_n_steps_transition(self):
192 | state, action = self.n_steps_deque[0][:2]  # s and a of the first transition in the deque
193 | next_state, terminal = self.n_steps_deque[-1][3:5]  # s' and terminal of the last transition in the deque
194 | n_steps_reward = 0
195 | for i in reversed(range(self.n_steps)):  # accumulate the n-step reward in reverse order
196 | r, s_, ter, d = self.n_steps_deque[i][2:]
197 | n_steps_reward = r + self.gamma * (1 - d) * n_steps_reward
198 | if d:  # if done=True, an episode ends here; use this transition's s' and terminal as the next_state and terminal of the n-step transition
199 | next_state, terminal = s_, ter
200 |
201 | return state, action, n_steps_reward, next_state, terminal
202 |
203 | def update_batch_priorities(self, batch_index, td_errors):  # Update the priorities of the transitions at batch_index according to the given td_errors
204 | priorities = (np.abs(td_errors) + 0.01) ** self.alpha
205 | for index, priority in zip(batch_index, priorities):
206 | self.sum_tree.update(data_index=index, priority=priority)
207 |
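The reversed loop in get_n_steps_transition() above accumulates the discounted n-step return and truncates it at episode boundaries. A minimal standalone check with hand-picked numbers (the rewards, state names and gamma below are made up for illustration; each tuple mirrors the (reward, next_state, terminal, done) slice taken from the deque):

```python
gamma = 0.99
# (reward, next_state, terminal, done) for n_steps = 5 consecutive transitions;
# the third transition ends an episode, so later rewards must not leak in.
deque_tail = [
    (1.0, 's1', False, False),
    (1.0, 's2', False, False),
    (1.0, 's3', True, True),    # done=True truncates the return here
    (5.0, 's4', False, False),  # belongs to the next episode, must be ignored
    (5.0, 's5', False, False),
]

n_steps_reward = 0.0
next_state, terminal = deque_tail[-1][1], deque_tail[-1][2]
for r, s_, ter, d in reversed(deque_tail):
    n_steps_reward = r + gamma * (1 - d) * n_steps_reward
    if d:
        next_state, terminal = s_, ter

print(round(n_steps_reward, 4), next_state, terminal)  # 2.9701 s3 True
```

This matches 1 + 0.99 * (1 + 0.99 * 1): only the three rewards before the episode boundary contribute, and the bootstrap state is the one where done became True.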
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0
--------------------------------------------------------------------------------
/3.Rainbow_DQN/sum_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | class SumTree(object):
5 | """
6 | Store the data with its priority in the tree.
7 | Tree structure and array storage:
8 |
9 | Tree index:
10 | 0 -> storing priority sum
11 | / \
12 | 1 2
13 | / \ / \
14 | 3 4 5 6 -> storing priority for transitions
15 |
16 | Array type for storing:
17 | [0,1,2,3,4,5,6]
18 | """
19 |
20 | def __init__(self, buffer_capacity):
21 | self.buffer_capacity = buffer_capacity  # capacity of the buffer
22 | self.tree_capacity = 2 * buffer_capacity - 1  # capacity of the sum_tree
23 | self.tree = np.zeros(self.tree_capacity)
24 |
25 | def update(self, data_index, priority):
26 | # data_index is the index of this transition in the buffer
27 | # tree_index is the index of this transition in the sum_tree
28 | tree_index = data_index + self.buffer_capacity - 1  # convert the buffer index into the sum_tree index
29 | change = priority - self.tree[tree_index]  # change in this transition's priority
30 | self.tree[tree_index] = priority  # update the priority of the leaf node
31 | # then propagate the change through the tree
32 | while tree_index != 0:  # update the priorities of the parent nodes, all the way up to the root
33 | tree_index = (tree_index - 1) // 2
34 | self.tree[tree_index] += change
35 |
36 | def get_index(self, v):
37 | parent_idx = 0  # start from the root of the tree
38 | while True:
39 | child_left_idx = 2 * parent_idx + 1  # indices of the left and right children below the parent node
40 | child_right_idx = child_left_idx + 1
41 | if child_left_idx >= self.tree_capacity: # reach bottom, end search
42 | tree_index = parent_idx  # tree_index is the index of the sampled transition in the sum_tree
43 | break
44 | else: # downward search, always search for a higher priority node
45 | if v <= self.tree[child_left_idx]:
46 | parent_idx = child_left_idx
47 | else:
48 | v -= self.tree[child_left_idx]
49 | parent_idx = child_right_idx
50 |
51 | data_index = tree_index - self.buffer_capacity + 1 # tree_index->data_index
52 | return data_index, self.tree[tree_index]  # return the sampled transition's index in the buffer and its priority
53 |
54 | def get_batch_index(self, current_size, batch_size, beta):
55 | batch_index = np.zeros(batch_size, dtype=np.int64)
56 | IS_weight = torch.zeros(batch_size, dtype=torch.float32)
57 | segment = self.priority_sum / batch_size  # divide [0, priority_sum] into batch_size equal segments and sample one value uniformly from each
58 | for i in range(batch_size):
59 | a = segment * i
60 | b = segment * (i + 1)
61 | v = np.random.uniform(a, b)
62 | index, priority = self.get_index(v)
63 | batch_index[i] = index
64 | prob = priority / self.priority_sum  # probability of this transition being sampled
65 | IS_weight[i] = (current_size * prob) ** (-beta)
66 | IS_weight /= IS_weight.max() # normalization
67 |
68 | return batch_index, IS_weight
69 |
70 | @property
71 | def priority_sum(self):
72 | return self.tree[0]  # the root of the tree stores the sum of all priorities
73 |
74 | @property
75 | def priority_max(self):
76 | return self.tree[self.buffer_capacity - 1:].max()  # only the leaf layer of the tree stores the per-transition priorities
77 |
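A quick interactive check of the SumTree above; the capacity and priorities are made-up values for illustration.

```python
from sum_tree import SumTree  # the class defined above

tree = SumTree(buffer_capacity=4)
tree.update(data_index=0, priority=1.0)
tree.update(data_index=1, priority=2.0)
tree.update(data_index=2, priority=3.0)

print(tree.priority_sum)    # 6.0: the root holds the total priority
print(tree.priority_max)    # 3.0: max over the leaf layer
print(tree.get_index(4.5))  # data_index 2, priority 3.0: v=4.5 falls in the third leaf's segment
```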
--------------------------------------------------------------------------------
/4.PPO-discrete/PPO_discrete_main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch.utils.tensorboard import SummaryWriter
4 | import gym
5 | import argparse
6 | from normalization import Normalization, RewardScaling
7 | from replaybuffer import ReplayBuffer
8 | from ppo_discrete import PPO_discrete
9 |
10 |
11 | def evaluate_policy(args, env, agent, state_norm):
12 | times = 3
13 | evaluate_reward = 0
14 | for _ in range(times):
15 | s = env.reset()
16 | if args.use_state_norm:  # During evaluation, update=False
17 | s = state_norm(s, update=False)
18 | done = False
19 | episode_reward = 0
20 | while not done:
21 | a = agent.evaluate(s)  # We use the deterministic policy during evaluation
22 | s_, r, done, _ = env.step(a)
23 | if args.use_state_norm:
24 | s_ = state_norm(s_, update=False)
25 | episode_reward += r
26 | s = s_
27 | evaluate_reward += episode_reward
28 |
29 | return evaluate_reward / times
30 |
31 |
32 | def main(args, env_name, number, seed):
33 | env = gym.make(env_name)
34 | env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment
35 | # Set random seed
36 | env.seed(seed)
37 | env.action_space.seed(seed)
38 | env_evaluate.seed(seed)
39 | env_evaluate.action_space.seed(seed)
40 | np.random.seed(seed)
41 | torch.manual_seed(seed)
42 |
43 | args.state_dim = env.observation_space.shape[0]
44 | args.action_dim = env.action_space.n
45 | args.max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
46 | print("env={}".format(env_name))
47 | print("state_dim={}".format(args.state_dim))
48 | print("action_dim={}".format(args.action_dim))
49 | print("max_episode_steps={}".format(args.max_episode_steps))
50 |
51 | evaluate_num = 0 # Record the number of evaluations
52 | evaluate_rewards = []  # Record the rewards during evaluation
53 | total_steps = 0 # Record the total steps during the training
54 |
55 | replay_buffer = ReplayBuffer(args)
56 | agent = PPO_discrete(args)
57 |
58 | # Build a tensorboard
59 | writer = SummaryWriter(log_dir='runs/PPO_discrete/env_{}_number_{}_seed_{}'.format(env_name, number, seed))
60 |
61 | state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization
62 | if args.use_reward_norm: # Trick 3:reward normalization
63 | reward_norm = Normalization(shape=1)
64 | elif args.use_reward_scaling: # Trick 4:reward scaling
65 | reward_scaling = RewardScaling(shape=1, gamma=args.gamma)
66 |
67 | while total_steps < args.max_train_steps:
68 | s = env.reset()
69 | if args.use_state_norm:
70 | s = state_norm(s)
71 | if args.use_reward_scaling:
72 | reward_scaling.reset()
73 | episode_steps = 0
74 | done = False
75 | while not done:
76 | episode_steps += 1
77 | a, a_logprob = agent.choose_action(s) # Action and the corresponding log probability
78 | s_, r, done, _ = env.step(a)
79 |
80 | if args.use_state_norm:
81 | s_ = state_norm(s_)
82 | if args.use_reward_norm:
83 | r = reward_norm(r)
84 | elif args.use_reward_scaling:
85 | r = reward_scaling(r)
86 |
87 | # When dead or win or reaching the max_episode_steps, done will be True, and we need to distinguish these cases;
88 | # dw means dead or win, i.e. there is no next state s';
89 | # but when reaching the max_episode_steps, there actually is a next state s'.
90 | if done and episode_steps != args.max_episode_steps:
91 | dw = True
92 | else:
93 | dw = False
94 |
95 | replay_buffer.store(s, a, a_logprob, r, s_, dw, done)
96 | s = s_
97 | total_steps += 1
98 |
99 | # When the number of transitions in the buffer reaches batch_size, update the policy
100 | if replay_buffer.count == args.batch_size:
101 | agent.update(replay_buffer, total_steps)
102 | replay_buffer.count = 0
103 |
104 | # Evaluate the policy every 'evaluate_freq' steps
105 | if total_steps % args.evaluate_freq == 0:
106 | evaluate_num += 1
107 | evaluate_reward = evaluate_policy(args, env_evaluate, agent, state_norm)
108 | evaluate_rewards.append(evaluate_reward)
109 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward))
110 | writer.add_scalar('step_rewards_{}'.format(env_name), evaluate_rewards[-1], global_step=total_steps)
111 | # Save the rewards
112 | if evaluate_num % args.save_freq == 0:
113 | np.save('./data_train/PPO_discrete_env_{}_number_{}_seed_{}.npy'.format(env_name, number, seed), np.array(evaluate_rewards))
114 |
115 |
116 | if __name__ == '__main__':
117 | parser = argparse.ArgumentParser("Hyperparameter Setting for PPO-discrete")
118 | parser.add_argument("--max_train_steps", type=int, default=int(2e5), help=" Maximum number of training steps")
119 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps")
120 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency")
121 | parser.add_argument("--batch_size", type=int, default=2048, help="Batch size")
122 | parser.add_argument("--mini_batch_size", type=int, default=64, help="Minibatch size")
123 | parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network")
124 | parser.add_argument("--lr_a", type=float, default=3e-4, help="Learning rate of actor")
125 | parser.add_argument("--lr_c", type=float, default=3e-4, help="Learning rate of critic")
126 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
127 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter")
128 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter")
129 | parser.add_argument("--K_epochs", type=int, default=10, help="PPO parameter")
130 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization")
131 | parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2:state normalization")
132 | parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3:reward normalization")
133 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling")
134 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy")
135 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay")
136 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip")
137 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization")
138 | parser.add_argument("--set_adam_eps", type=bool, default=True, help="Trick 9: set Adam epsilon=1e-5")
139 | parser.add_argument("--use_tanh", type=bool, default=True, help="Trick 10: tanh activation function")
140 |
141 | args = parser.parse_args()
142 |
143 | env_name = ['CartPole-v1', 'LunarLander-v2']
144 | env_index = 1
145 | main(args, env_name=env_name[env_index], number=1, seed=0)
146 |
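The dw/done distinction in the training loop above (genuine termination vs. time-limit truncation) can be illustrated with a toy check; the episode lengths and the limit below are made-up numbers standing in for env._max_episode_steps.

```python
max_episode_steps = 200  # hypothetical limit, playing the role of args.max_episode_steps

def dead_or_win(done, episode_steps):
    # dw is True only when the episode really terminated (dead or win),
    # not when gym cut it off at the step limit.
    return done and episode_steps != max_episode_steps

print(dead_or_win(done=True, episode_steps=137))  # True: real termination, no next state s'
print(dead_or_win(done=True, episode_steps=200))  # False: time-limit truncation, s' still exists
print(dead_or_win(done=False, episode_steps=42))  # False: episode still running
```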
--------------------------------------------------------------------------------
/4.PPO-discrete/README.md:
--------------------------------------------------------------------------------
1 | # PPO-discrete
2 | This is a concise PyTorch implementation of PPO for discrete action spaces, with 10 implementation tricks.
3 |
4 | ## 10 tricks
5 | Trick 1—Advantage Normalization.
6 | Trick 2—State Normalization.
7 | Trick 3 & Trick 4—Reward Normalization & Reward Scaling.
8 | Trick 5—Policy Entropy.
9 | Trick 6—Learning Rate Decay.
10 | Trick 7—Gradient clip.
11 | Trick 8—Orthogonal Initialization.
12 | Trick 9—Adam Optimizer Epsilon Parameter.
13 | Trick 10—Tanh Activation Function.
14 |
15 | ## How to use my code?
16 | You can directly run 'PPO_discrete_main.py' in your own IDE.
17 |
18 | ## Training environments
19 | You can set 'env_index' in the code to change the environment. Here, we train our code in 2 environments.
20 | env_index=0 represents 'CartPole-v1'
21 | env_index=1 represents 'LunarLander-v2'
22 |
23 | ## Training result
24 | 
25 |
26 | ## Tutorial
27 | If you can read Chinese, you can get more information from this blog: https://zhuanlan.zhihu.com/p/512327050
28 |
--------------------------------------------------------------------------------
/4.PPO-discrete/__pycache__/normalization.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/normalization.cpython-37.pyc
--------------------------------------------------------------------------------
/4.PPO-discrete/__pycache__/ppo_discrete.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/ppo_discrete.cpython-37.pyc
--------------------------------------------------------------------------------
/4.PPO-discrete/__pycache__/replaybuffer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/replaybuffer.cpython-37.pyc
--------------------------------------------------------------------------------
/4.PPO-discrete/data_train/readme.txt:
--------------------------------------------------------------------------------
1 | This is a file used to store the training reward data.
--------------------------------------------------------------------------------
/4.PPO-discrete/normalization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class RunningMeanStd:
5 | # Dynamically calculate mean and std
6 | def __init__(self, shape): # shape:the dimension of input data
7 | self.n = 0
8 | self.mean = np.zeros(shape)
9 | self.S = np.zeros(shape)
10 | self.std = np.sqrt(self.S)
11 |
12 | def update(self, x):
13 | x = np.array(x)
14 | self.n += 1
15 | if self.n == 1:
16 | self.mean = x
17 | self.std = x
18 | else:
19 | old_mean = self.mean.copy()
20 | self.mean = old_mean + (x - old_mean) / self.n
21 | self.S = self.S + (x - old_mean) * (x - self.mean)
22 | self.std = np.sqrt(self.S / self.n)
23 |
24 |
25 | class Normalization:
26 | def __init__(self, shape):
27 | self.running_ms = RunningMeanStd(shape=shape)
28 |
29 | def __call__(self, x, update=True):
30 | # Whether to update the mean and std; during evaluation, update=False
31 | if update:
32 | self.running_ms.update(x)
33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8)
34 |
35 | return x
36 |
37 |
38 | class RewardScaling:
39 | def __init__(self, shape, gamma):
40 | self.shape = shape # reward shape=1
41 | self.gamma = gamma # discount factor
42 | self.running_ms = RunningMeanStd(shape=self.shape)
43 | self.R = np.zeros(self.shape)
44 |
45 | def __call__(self, x):
46 | self.R = self.gamma * self.R + x
47 | self.running_ms.update(self.R)
48 | x = x / (self.running_ms.std + 1e-8) # Only divide by the std, do not subtract the mean
49 | return x
50 |
51 | def reset(self): # When an episode is done,we should reset 'self.R'
52 | self.R = np.zeros(self.shape)
53 |
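As a usage note for the module above, here is a minimal sketch (not part of the repository) of how `Normalization` and `RewardScaling` are typically applied inside a rollout loop. It assumes the classic Gym API used throughout this repo (`env.reset()` returns the state, `env.step()` returns a 4-tuple) and that this file is importable, e.g. when run from the 4.PPO-discrete folder.

```python
import gym
from normalization import Normalization, RewardScaling

env = gym.make('CartPole-v1')
state_norm = Normalization(shape=env.observation_space.shape[0])  # Trick 2: state normalization
reward_scaling = RewardScaling(shape=1, gamma=0.99)               # Trick 4: reward scaling

s = state_norm(env.reset())   # running mean/std are updated during training
reward_scaling.reset()        # reset the running return R at the start of each episode
done = False
while not done:
    a = env.action_space.sample()          # random policy, just to exercise the wrappers
    s_, r, done, _ = env.step(a)
    s_ = state_norm(s_)
    r = reward_scaling(r)                  # only divides by the std of the running return
    s = s_
```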
--------------------------------------------------------------------------------
/4.PPO-discrete/ppo_discrete.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
6 | from torch.distributions import Categorical
7 |
8 |
9 | # Trick 8: orthogonal initialization
10 | def orthogonal_init(layer, gain=1.0):
11 | nn.init.orthogonal_(layer.weight, gain=gain)
12 | nn.init.constant_(layer.bias, 0)
13 |
14 |
15 | class Actor(nn.Module):
16 | def __init__(self, args):
17 | super(Actor, self).__init__()
18 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width)
19 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width)
20 | self.fc3 = nn.Linear(args.hidden_width, args.action_dim)
21 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh
22 |
23 | if args.use_orthogonal_init:
24 | print("------use_orthogonal_init------")
25 | orthogonal_init(self.fc1)
26 | orthogonal_init(self.fc2)
27 | orthogonal_init(self.fc3, gain=0.01)
28 |
29 | def forward(self, s):
30 | s = self.activate_func(self.fc1(s))
31 | s = self.activate_func(self.fc2(s))
32 | a_prob = torch.softmax(self.fc3(s), dim=1)
33 | return a_prob
34 |
35 |
36 | class Critic(nn.Module):
37 | def __init__(self, args):
38 | super(Critic, self).__init__()
39 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width)
40 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width)
41 | self.fc3 = nn.Linear(args.hidden_width, 1)
42 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh
43 |
44 | if args.use_orthogonal_init:
45 | print("------use_orthogonal_init------")
46 | orthogonal_init(self.fc1)
47 | orthogonal_init(self.fc2)
48 | orthogonal_init(self.fc3)
49 |
50 | def forward(self, s):
51 | s = self.activate_func(self.fc1(s))
52 | s = self.activate_func(self.fc2(s))
53 | v_s = self.fc3(s)
54 | return v_s
55 |
56 |
57 | class PPO_discrete:
58 | def __init__(self, args):
59 | self.batch_size = args.batch_size
60 | self.mini_batch_size = args.mini_batch_size
61 | self.max_train_steps = args.max_train_steps
62 | self.lr_a = args.lr_a # Learning rate of actor
63 | self.lr_c = args.lr_c # Learning rate of critic
64 | self.gamma = args.gamma # Discount factor
65 | self.lamda = args.lamda # GAE parameter
66 | self.epsilon = args.epsilon # PPO clip parameter
67 | self.K_epochs = args.K_epochs # PPO parameter
68 | self.entropy_coef = args.entropy_coef # Entropy coefficient
69 | self.set_adam_eps = args.set_adam_eps
70 | self.use_grad_clip = args.use_grad_clip
71 | self.use_lr_decay = args.use_lr_decay
72 | self.use_adv_norm = args.use_adv_norm
73 |
74 | self.actor = Actor(args)
75 | self.critic = Critic(args)
76 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5
77 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a, eps=1e-5)
78 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c, eps=1e-5)
79 | else:
80 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a)
81 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c)
82 |
83 | def evaluate(self, s): # When evaluating the policy, we select the action with the highest probability
84 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
85 | a_prob = self.actor(s).detach().numpy().flatten()
86 | a = np.argmax(a_prob)
87 | return a
88 |
89 | def choose_action(self, s):
90 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
91 | with torch.no_grad():
92 | dist = Categorical(probs=self.actor(s))
93 | a = dist.sample()
94 | a_logprob = dist.log_prob(a)
95 | return a.numpy()[0], a_logprob.numpy()[0]
96 |
97 | def update(self, replay_buffer, total_steps):
98 | s, a, a_logprob, r, s_, dw, done = replay_buffer.numpy_to_tensor() # Get training data
99 | """
100 | Calculate the advantage using GAE
101 | 'dw=True' means dead or win, so there is no next state s'
102 | 'done=True' marks the end of an episode (dead or win or reaching max_episode_steps). When calculating the advantage, gae is reset to 0 wherever done=True
103 | """
104 | adv = []
105 | gae = 0
106 | with torch.no_grad(): # adv and v_target have no gradient
107 | vs = self.critic(s)
108 | vs_ = self.critic(s_)
109 | deltas = r + self.gamma * (1.0 - dw) * vs_ - vs
110 | for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())):
111 | gae = delta + self.gamma * self.lamda * gae * (1.0 - d)
112 | adv.insert(0, gae)
113 | adv = torch.tensor(adv, dtype=torch.float).view(-1, 1)
114 | v_target = adv + vs
115 | if self.use_adv_norm: # Trick 1:advantage normalization
116 | adv = ((adv - adv.mean()) / (adv.std() + 1e-5))
117 |
118 | # Optimize policy for K epochs:
119 | for _ in range(self.K_epochs):
120 | # Random sampling without repetition. 'False' means the last mini-batch is still used for training even if it contains fewer than mini_batch_size samples
121 | for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.mini_batch_size, False):
122 | dist_now = Categorical(probs=self.actor(s[index]))
123 | dist_entropy = dist_now.entropy().view(-1, 1) # shape(mini_batch_size X 1)
124 | a_logprob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1) # shape(mini_batch_size X 1)
125 | # a/b=exp(log(a)-log(b))
126 | ratios = torch.exp(a_logprob_now - a_logprob[index]) # shape(mini_batch_size X 1)
127 |
128 | surr1 = ratios * adv[index] # Only calculate the gradient of 'a_logprob_now' in ratios
129 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index]
130 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # shape(mini_batch_size X 1)
131 | # Update actor
132 | self.optimizer_actor.zero_grad()
133 | actor_loss.mean().backward()
134 | if self.use_grad_clip: # Trick 7: Gradient clip
135 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
136 | self.optimizer_actor.step()
137 |
138 | v_s = self.critic(s[index])
139 | critic_loss = F.mse_loss(v_target[index], v_s)
140 | # Update critic
141 | self.optimizer_critic.zero_grad()
142 | critic_loss.backward()
143 | if self.use_grad_clip: # Trick 7: Gradient clip
144 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
145 | self.optimizer_critic.step()
146 |
147 | if self.use_lr_decay: # Trick 6:learning rate Decay
148 | self.lr_decay(total_steps)
149 |
150 | def lr_decay(self, total_steps):
151 | lr_a_now = self.lr_a * (1 - total_steps / self.max_train_steps)
152 | lr_c_now = self.lr_c * (1 - total_steps / self.max_train_steps)
153 | for p in self.optimizer_actor.param_groups:
154 | p['lr'] = lr_a_now
155 | for p in self.optimizer_critic.param_groups:
156 | p['lr'] = lr_c_now
157 |
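To make the GAE loop in `update()` above concrete, here is a minimal, self-contained sketch (not part of the repository) on a made-up 4-step trajectory; the numbers are purely illustrative.

```python
import numpy as np

gamma, lamda = 0.99, 0.95
r    = np.array([1.0, 1.0, 1.0, 1.0])   # rewards
vs   = np.array([0.5, 0.6, 0.7, 0.8])   # V(s_t) from the critic
vs_  = np.array([0.6, 0.7, 0.8, 0.0])   # V(s_{t+1}); the episode ends at the last step
dw   = np.array([0.0, 0.0, 0.0, 1.0])   # dead or win: no next state at the last step
done = np.array([0.0, 0.0, 0.0, 1.0])   # episode boundary: gae is reset to 0 here

deltas = r + gamma * (1.0 - dw) * vs_ - vs
adv, gae = [], 0.0
for delta, d in zip(reversed(deltas), reversed(done)):
    gae = delta + gamma * lamda * gae * (1.0 - d)
    adv.insert(0, gae)
adv = np.array(adv)
v_target = adv + vs          # the critic's regression target
print(adv, v_target)
```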
--------------------------------------------------------------------------------
/4.PPO-discrete/replaybuffer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | class ReplayBuffer:
6 | def __init__(self, args):
7 | self.s = np.zeros((args.batch_size, args.state_dim))
8 | self.a = np.zeros((args.batch_size, 1))
9 | self.a_logprob = np.zeros((args.batch_size, 1))
10 | self.r = np.zeros((args.batch_size, 1))
11 | self.s_ = np.zeros((args.batch_size, args.state_dim))
12 | self.dw = np.zeros((args.batch_size, 1))
13 | self.done = np.zeros((args.batch_size, 1))
14 | self.count = 0
15 |
16 | def store(self, s, a, a_logprob, r, s_, dw, done):
17 | self.s[self.count] = s
18 | self.a[self.count] = a
19 | self.a_logprob[self.count] = a_logprob
20 | self.r[self.count] = r
21 | self.s_[self.count] = s_
22 | self.dw[self.count] = dw
23 | self.done[self.count] = done
24 | self.count += 1
25 |
26 | def numpy_to_tensor(self):
27 | s = torch.tensor(self.s, dtype=torch.float)
28 | a = torch.tensor(self.a, dtype=torch.long) # In discrete action space, 'a' needs to be torch.long
29 | a_logprob = torch.tensor(self.a_logprob, dtype=torch.float)
30 | r = torch.tensor(self.r, dtype=torch.float)
31 | s_ = torch.tensor(self.s_, dtype=torch.float)
32 | dw = torch.tensor(self.dw, dtype=torch.float)
33 | done = torch.tensor(self.done, dtype=torch.float)
34 |
35 | return s, a, a_logprob, r, s_, dw, done
36 |
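A minimal sketch (not part of the repository) that fills the on-policy buffer above with dummy CartPole-sized transitions and converts it to tensors. The argparse-style `args` object is faked with `SimpleNamespace` purely for illustration, and the import assumes this file is importable, e.g. when run from the 4.PPO-discrete folder.

```python
import numpy as np
from types import SimpleNamespace
from replaybuffer import ReplayBuffer

args = SimpleNamespace(batch_size=4, state_dim=4)
buffer = ReplayBuffer(args)
for _ in range(args.batch_size):
    buffer.store(s=np.random.randn(4), a=1, a_logprob=-0.69, r=1.0,
                 s_=np.random.randn(4), dw=False, done=False)

s, a, a_logprob, r, s_, dw, done = buffer.numpy_to_tensor()
print(s.shape, a.dtype)   # expected: torch.Size([4, 4]) torch.int64 -- 'a' is long for discrete actions
```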
--------------------------------------------------------------------------------
/4.PPO-discrete/runs/readme.txt:
--------------------------------------------------------------------------------
1 | This folder is used to save the tensorboard data.
--------------------------------------------------------------------------------
/4.PPO-discrete/training_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/training_result.png
--------------------------------------------------------------------------------
/5.PPO-continuous/PPO_continuous_main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch.utils.tensorboard import SummaryWriter
4 | import gym
5 | import argparse
6 | from normalization import Normalization, RewardScaling
7 | from replaybuffer import ReplayBuffer
8 | from ppo_continuous import PPO_continuous
9 |
10 |
11 | def evaluate_policy(args, env, agent, state_norm):
12 | times = 3
13 | evaluate_reward = 0
14 | for _ in range(times):
15 | s = env.reset()
16 | if args.use_state_norm:
17 | s = state_norm(s, update=False) # During the evaluating,update=False
18 | done = False
19 | episode_reward = 0
20 | while not done:
21 | a = agent.evaluate(s) # We use the deterministic policy during the evaluating
22 | if args.policy_dist == "Beta":
23 | action = 2 * (a - 0.5) * args.max_action # [0,1]->[-max,max]
24 | else:
25 | action = a
26 | s_, r, done, _ = env.step(action)
27 | if args.use_state_norm:
28 | s_ = state_norm(s_, update=False)
29 | episode_reward += r
30 | s = s_
31 | evaluate_reward += episode_reward
32 |
33 | return evaluate_reward / times
34 |
35 |
36 | def main(args, env_name, number, seed):
37 | env = gym.make(env_name)
38 | env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment
39 | # Set random seed
40 | env.seed(seed)
41 | env.action_space.seed(seed)
42 | env_evaluate.seed(seed)
43 | env_evaluate.action_space.seed(seed)
44 | np.random.seed(seed)
45 | torch.manual_seed(seed)
46 |
47 | args.state_dim = env.observation_space.shape[0]
48 | args.action_dim = env.action_space.shape[0]
49 | args.max_action = float(env.action_space.high[0])
50 | args.max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
51 | print("env={}".format(env_name))
52 | print("state_dim={}".format(args.state_dim))
53 | print("action_dim={}".format(args.action_dim))
54 | print("max_action={}".format(args.max_action))
55 | print("max_episode_steps={}".format(args.max_episode_steps))
56 |
57 | evaluate_num = 0 # Record the number of evaluations
58 | evaluate_rewards = [] # Record the rewards during the evaluating
59 | total_steps = 0 # Record the total steps during the training
60 |
61 | replay_buffer = ReplayBuffer(args)
62 | agent = PPO_continuous(args)
63 |
64 | # Build a tensorboard
65 | writer = SummaryWriter(log_dir='runs/PPO_continuous/env_{}_{}_number_{}_seed_{}'.format(env_name, args.policy_dist, number, seed))
66 |
67 | state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization
68 | if args.use_reward_norm: # Trick 3:reward normalization
69 | reward_norm = Normalization(shape=1)
70 | elif args.use_reward_scaling: # Trick 4:reward scaling
71 | reward_scaling = RewardScaling(shape=1, gamma=args.gamma)
72 |
73 | while total_steps < args.max_train_steps:
74 | s = env.reset()
75 | if args.use_state_norm:
76 | s = state_norm(s)
77 | if args.use_reward_scaling:
78 | reward_scaling.reset()
79 | episode_steps = 0
80 | done = False
81 | while not done:
82 | episode_steps += 1
83 | a, a_logprob = agent.choose_action(s) # Action and the corresponding log probability
84 | if args.policy_dist == "Beta":
85 | action = 2 * (a - 0.5) * args.max_action # [0,1]->[-max,max]
86 | else:
87 | action = a
88 | s_, r, done, _ = env.step(action)
89 |
90 | if args.use_state_norm:
91 | s_ = state_norm(s_)
92 | if args.use_reward_norm:
93 | r = reward_norm(r)
94 | elif args.use_reward_scaling:
95 | r = reward_scaling(r)
96 |
97 | # When dead or win or reaching the max_episode_steps, done will be True, and we need to distinguish these cases;
98 | # dw means dead or win, in which case there is no next state s';
99 | # but when reaching the max_episode_steps, there actually is a next state s'.
100 | if done and episode_steps != args.max_episode_steps:
101 | dw = True
102 | else:
103 | dw = False
104 |
105 | # Take the 'action',but store the original 'a'(especially for Beta)
106 | replay_buffer.store(s, a, a_logprob, r, s_, dw, done)
107 | s = s_
108 | total_steps += 1
109 |
110 | # When the number of transitions in the buffer reaches batch_size, update the networks
111 | if replay_buffer.count == args.batch_size:
112 | agent.update(replay_buffer, total_steps)
113 | replay_buffer.count = 0
114 |
115 | # Evaluate the policy every 'evaluate_freq' steps
116 | if total_steps % args.evaluate_freq == 0:
117 | evaluate_num += 1
118 | evaluate_reward = evaluate_policy(args, env_evaluate, agent, state_norm)
119 | evaluate_rewards.append(evaluate_reward)
120 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward))
121 | writer.add_scalar('step_rewards_{}'.format(env_name), evaluate_rewards[-1], global_step=total_steps)
122 | # Save the rewards
123 | if evaluate_num % args.save_freq == 0:
124 | np.save('./data_train/PPO_continuous_{}_env_{}_number_{}_seed_{}.npy'.format(args.policy_dist, env_name, number, seed), np.array(evaluate_rewards))
125 |
126 |
127 | if __name__ == '__main__':
128 | parser = argparse.ArgumentParser("Hyperparameters Setting for PPO-continuous")
129 | parser.add_argument("--max_train_steps", type=int, default=int(3e6), help=" Maximum number of training steps")
130 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps")
131 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency")
132 | parser.add_argument("--policy_dist", type=str, default="Gaussian", help="Beta or Gaussian")
133 | parser.add_argument("--batch_size", type=int, default=2048, help="Batch size")
134 | parser.add_argument("--mini_batch_size", type=int, default=64, help="Minibatch size")
135 | parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network")
136 | parser.add_argument("--lr_a", type=float, default=3e-4, help="Learning rate of actor")
137 | parser.add_argument("--lr_c", type=float, default=3e-4, help="Learning rate of critic")
138 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
139 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter")
140 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter")
141 | parser.add_argument("--K_epochs", type=int, default=10, help="PPO parameter")
142 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization")
143 | parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2:state normalization")
144 | parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3:reward normalization")
145 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling")
146 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy")
147 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay")
148 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip")
149 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization")
150 | parser.add_argument("--set_adam_eps", type=bool, default=True, help="Trick 9: set Adam epsilon=1e-5")
151 | parser.add_argument("--use_tanh", type=bool, default=True, help="Trick 10: tanh activation function")
152 |
153 | args = parser.parse_args()
154 |
155 | env_name = ['BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
156 | env_index = 1
157 | main(args, env_name=env_name[env_index], number=1, seed=10)
158 |
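A minimal sketch (not part of the repository) of the Beta-policy action mapping used above: a Beta sample lies in [0, 1] and is rescaled to [-max_action, max_action] before `env.step`, while the raw sample `a` is what gets stored in the replay buffer.

```python
import numpy as np

max_action = 2.0
a = np.array([0.0, 0.25, 0.5, 1.0])       # stand-ins for samples from a Beta distribution
action = 2 * (a - 0.5) * max_action       # [0, 1] -> [-max_action, max_action]
print(action)                             # [-2. -1.  0.  2.]
```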
--------------------------------------------------------------------------------
/5.PPO-continuous/README.md:
--------------------------------------------------------------------------------
1 | # PPO-continuous
2 | This is a concise Pytorch implementation of PPO on continuous action space with 10 tricks.
3 |
4 | ## 10 tricks
5 | Trick 1—Advantage Normalization.
6 | Trick 2—State Normalization.
7 | Trick 3 & Trick 4—Reward Normalization & Reward Scaling.
8 | Trick 5—Policy Entropy.
9 | Trick 6—Learning Rate Decay.
10 | Trick 7—Gradient clip.
11 | Trick 8—Orthogonal Initialization.
12 | Trick 9—Adam Optimizer Epsilon Parameter.
13 | Trick 10—Tanh Activation Function.
14 |
15 | ## How to use my code?
16 | You can directly run 'PPO_continuous_main.py' in your own IDE.
17 |
18 | ## Training environments
19 | You can set 'env_index' in the code to change the environment. Here, we train our code in 4 environments.
20 | env_index=0 represents 'BipedalWalker-v3'
21 | env_index=1 represents 'HalfCheetah-v2'
22 | env_index=2 represents 'Hopper-v2'
23 | env_index=3 represents 'Walker2d-v2'
24 |
25 | ## Training result
26 | 
27 |
28 | ## Tutorial
29 | If you can read Chinese, you can get more information from this blog: https://zhuanlan.zhihu.com/p/512327050
30 |
--------------------------------------------------------------------------------
/5.PPO-continuous/__pycache__/normalization.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/normalization.cpython-37.pyc
--------------------------------------------------------------------------------
/5.PPO-continuous/__pycache__/ppo_continuous.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/ppo_continuous.cpython-37.pyc
--------------------------------------------------------------------------------
/5.PPO-continuous/__pycache__/replaybuffer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/replaybuffer.cpython-37.pyc
--------------------------------------------------------------------------------
/5.PPO-continuous/data_train/readme.txt:
--------------------------------------------------------------------------------
1 | This folder is used to store the training reward data.
--------------------------------------------------------------------------------
/5.PPO-continuous/normalization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class RunningMeanStd:
5 | # Dynamically calculate mean and std
6 | def __init__(self, shape): # shape:the dimension of input data
7 | self.n = 0
8 | self.mean = np.zeros(shape)
9 | self.S = np.zeros(shape)
10 | self.std = np.sqrt(self.S)
11 |
12 | def update(self, x):
13 | x = np.array(x)
14 | self.n += 1
15 | if self.n == 1:
16 | self.mean = x
17 | self.std = x
18 | else:
19 | old_mean = self.mean.copy()
20 | self.mean = old_mean + (x - old_mean) / self.n
21 | self.S = self.S + (x - old_mean) * (x - self.mean)
22 | self.std = np.sqrt(self.S / self.n)
23 |
24 |
25 | class Normalization:
26 | def __init__(self, shape):
27 | self.running_ms = RunningMeanStd(shape=shape)
28 |
29 | def __call__(self, x, update=True):
30 | # Whether to update the mean and std; during evaluation, update=False
31 | if update:
32 | self.running_ms.update(x)
33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8)
34 |
35 | return x
36 |
37 |
38 | class RewardScaling:
39 | def __init__(self, shape, gamma):
40 | self.shape = shape # reward shape=1
41 | self.gamma = gamma # discount factor
42 | self.running_ms = RunningMeanStd(shape=self.shape)
43 | self.R = np.zeros(self.shape)
44 |
45 | def __call__(self, x):
46 | self.R = self.gamma * self.R + x
47 | self.running_ms.update(self.R)
48 | x = x / (self.running_ms.std + 1e-8) # Only divide by the std, do not subtract the mean
49 | return x
50 |
51 | def reset(self): # When an episode is done,we should reset 'self.R'
52 | self.R = np.zeros(self.shape)
53 |
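As a quick sanity check (not part of the repository): the Welford-style update in `RunningMeanStd` above should agree with NumPy's batch statistics after seeing the same data. The import assumes this file is importable, e.g. when run from the 5.PPO-continuous folder.

```python
import numpy as np
from normalization import RunningMeanStd

data = np.random.randn(1000, 3)
rms = RunningMeanStd(shape=3)
for x in data:
    rms.update(x)

print(np.allclose(rms.mean, data.mean(axis=0)))   # expected: True
print(np.allclose(rms.std, data.std(axis=0)))     # expected: True (population std, ddof=0)
```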
--------------------------------------------------------------------------------
/5.PPO-continuous/ppo_continuous.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
4 | import torch.nn as nn
5 | from torch.distributions import Beta, Normal
6 |
7 |
8 | # Trick 8: orthogonal initialization
9 | def orthogonal_init(layer, gain=1.0):
10 | nn.init.orthogonal_(layer.weight, gain=gain)
11 | nn.init.constant_(layer.bias, 0)
12 |
13 |
14 | class Actor_Beta(nn.Module):
15 | def __init__(self, args):
16 | super(Actor_Beta, self).__init__()
17 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width)
18 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width)
19 | self.alpha_layer = nn.Linear(args.hidden_width, args.action_dim)
20 | self.beta_layer = nn.Linear(args.hidden_width, args.action_dim)
21 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh
22 |
23 | if args.use_orthogonal_init:
24 | print("------use_orthogonal_init------")
25 | orthogonal_init(self.fc1)
26 | orthogonal_init(self.fc2)
27 | orthogonal_init(self.alpha_layer, gain=0.01)
28 | orthogonal_init(self.beta_layer, gain=0.01)
29 |
30 | def forward(self, s):
31 | s = self.activate_func(self.fc1(s))
32 | s = self.activate_func(self.fc2(s))
33 | # alpha and beta need to be larger than 1, so we use 'softplus' as the activation function and then add 1
34 | alpha = F.softplus(self.alpha_layer(s)) + 1.0
35 | beta = F.softplus(self.beta_layer(s)) + 1.0
36 | return alpha, beta
37 |
38 | def get_dist(self, s):
39 | alpha, beta = self.forward(s)
40 | dist = Beta(alpha, beta)
41 | return dist
42 |
43 | def mean(self, s):
44 | alpha, beta = self.forward(s)
45 | mean = alpha / (alpha + beta) # The mean of the beta distribution
46 | return mean
47 |
48 |
49 | class Actor_Gaussian(nn.Module):
50 | def __init__(self, args):
51 | super(Actor_Gaussian, self).__init__()
52 | self.max_action = args.max_action
53 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width)
54 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width)
55 | self.mean_layer = nn.Linear(args.hidden_width, args.action_dim)
56 | self.log_std = nn.Parameter(torch.zeros(1, args.action_dim)) # We use 'nn.Parameter' to train log_std automatically
57 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh
58 |
59 | if args.use_orthogonal_init:
60 | print("------use_orthogonal_init------")
61 | orthogonal_init(self.fc1)
62 | orthogonal_init(self.fc2)
63 | orthogonal_init(self.mean_layer, gain=0.01)
64 |
65 | def forward(self, s):
66 | s = self.activate_func(self.fc1(s))
67 | s = self.activate_func(self.fc2(s))
68 | mean = self.max_action * torch.tanh(self.mean_layer(s)) # [-1,1]->[-max_action,max_action]
69 | return mean
70 |
71 | def get_dist(self, s):
72 | mean = self.forward(s)
73 | log_std = self.log_std.expand_as(mean) # To make 'log_std' have the same dimension as 'mean'
74 | std = torch.exp(log_std) # The reason we train the 'log_std' is to ensure std=exp(log_std)>0
75 | dist = Normal(mean, std) # Get the Gaussian distribution
76 | return dist
77 |
78 |
79 | class Critic(nn.Module):
80 | def __init__(self, args):
81 | super(Critic, self).__init__()
82 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width)
83 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width)
84 | self.fc3 = nn.Linear(args.hidden_width, 1)
85 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh
86 |
87 | if args.use_orthogonal_init:
88 | print("------use_orthogonal_init------")
89 | orthogonal_init(self.fc1)
90 | orthogonal_init(self.fc2)
91 | orthogonal_init(self.fc3)
92 |
93 | def forward(self, s):
94 | s = self.activate_func(self.fc1(s))
95 | s = self.activate_func(self.fc2(s))
96 | v_s = self.fc3(s)
97 | return v_s
98 |
99 |
100 | class PPO_continuous():
101 | def __init__(self, args):
102 | self.policy_dist = args.policy_dist
103 | self.max_action = args.max_action
104 | self.batch_size = args.batch_size
105 | self.mini_batch_size = args.mini_batch_size
106 | self.max_train_steps = args.max_train_steps
107 | self.lr_a = args.lr_a # Learning rate of actor
108 | self.lr_c = args.lr_c # Learning rate of critic
109 | self.gamma = args.gamma # Discount factor
110 | self.lamda = args.lamda # GAE parameter
111 | self.epsilon = args.epsilon # PPO clip parameter
112 | self.K_epochs = args.K_epochs # PPO parameter
113 | self.entropy_coef = args.entropy_coef # Entropy coefficient
114 | self.set_adam_eps = args.set_adam_eps
115 | self.use_grad_clip = args.use_grad_clip
116 | self.use_lr_decay = args.use_lr_decay
117 | self.use_adv_norm = args.use_adv_norm
118 |
119 | if self.policy_dist == "Beta":
120 | self.actor = Actor_Beta(args)
121 | else:
122 | self.actor = Actor_Gaussian(args)
123 | self.critic = Critic(args)
124 |
125 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5
126 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a, eps=1e-5)
127 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c, eps=1e-5)
128 | else:
129 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a)
130 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c)
131 |
132 | def evaluate(self, s): # When evaluating the policy, we only use the mean
133 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
134 | if self.policy_dist == "Beta":
135 | a = self.actor.mean(s).detach().numpy().flatten()
136 | else:
137 | a = self.actor(s).detach().numpy().flatten()
138 | return a
139 |
140 | def choose_action(self, s):
141 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
142 | if self.policy_dist == "Beta":
143 | with torch.no_grad():
144 | dist = self.actor.get_dist(s)
145 | a = dist.sample() # Sample the action according to the probability distribution
146 | a_logprob = dist.log_prob(a) # The log probability density of the action
147 | else:
148 | with torch.no_grad():
149 | dist = self.actor.get_dist(s)
150 | a = dist.sample() # Sample the action according to the probability distribution
151 | a = torch.clamp(a, -self.max_action, self.max_action) # [-max,max]
152 | a_logprob = dist.log_prob(a) # The log probability density of the action
153 | return a.numpy().flatten(), a_logprob.numpy().flatten()
154 |
155 | def update(self, replay_buffer, total_steps):
156 | s, a, a_logprob, r, s_, dw, done = replay_buffer.numpy_to_tensor() # Get training data
157 | """
158 | Calculate the advantage using GAE
159 | 'dw=True' means dead or win, so there is no next state s'
160 | 'done=True' marks the end of an episode (dead or win or reaching max_episode_steps). When calculating the advantage, gae is reset to 0 wherever done=True
161 | """
162 | adv = []
163 | gae = 0
164 | with torch.no_grad(): # adv and v_target have no gradient
165 | vs = self.critic(s)
166 | vs_ = self.critic(s_)
167 | deltas = r + self.gamma * (1.0 - dw) * vs_ - vs
168 | for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())):
169 | gae = delta + self.gamma * self.lamda * gae * (1.0 - d)
170 | adv.insert(0, gae)
171 | adv = torch.tensor(adv, dtype=torch.float).view(-1, 1)
172 | v_target = adv + vs
173 | if self.use_adv_norm: # Trick 1:advantage normalization
174 | adv = ((adv - adv.mean()) / (adv.std() + 1e-5))
175 |
176 | # Optimize policy for K epochs:
177 | for _ in range(self.K_epochs):
178 | # Random sampling without repetition. 'False' means the last mini-batch is still used for training even if it contains fewer than mini_batch_size samples
179 | for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.mini_batch_size, False):
180 | dist_now = self.actor.get_dist(s[index])
181 | dist_entropy = dist_now.entropy().sum(1, keepdim=True) # shape(mini_batch_size X 1)
182 | a_logprob_now = dist_now.log_prob(a[index])
183 | # a/b=exp(log(a)-log(b)); in a multi-dimensional continuous action space, we need to sum the log_prob over action dimensions
184 | ratios = torch.exp(a_logprob_now.sum(1, keepdim=True) - a_logprob[index].sum(1, keepdim=True)) # shape(mini_batch_size X 1)
185 |
186 | surr1 = ratios * adv[index] # Only calculate the gradient of 'a_logprob_now' in ratios
187 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index]
188 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # Trick 5: policy entropy
189 | # Update actor
190 | self.optimizer_actor.zero_grad()
191 | actor_loss.mean().backward()
192 | if self.use_grad_clip: # Trick 7: Gradient clip
193 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
194 | self.optimizer_actor.step()
195 |
196 | v_s = self.critic(s[index])
197 | critic_loss = F.mse_loss(v_target[index], v_s)
198 | # Update critic
199 | self.optimizer_critic.zero_grad()
200 | critic_loss.backward()
201 | if self.use_grad_clip: # Trick 7: Gradient clip
202 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
203 | self.optimizer_critic.step()
204 |
205 | if self.use_lr_decay: # Trick 6:learning rate Decay
206 | self.lr_decay(total_steps)
207 |
208 | def lr_decay(self, total_steps):
209 | lr_a_now = self.lr_a * (1 - total_steps / self.max_train_steps)
210 | lr_c_now = self.lr_c * (1 - total_steps / self.max_train_steps)
211 | for p in self.optimizer_actor.param_groups:
212 | p['lr'] = lr_a_now
213 | for p in self.optimizer_critic.param_groups:
214 | p['lr'] = lr_c_now
215 |
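A minimal sketch (not part of the repository) of why `update()` above sums log-probabilities over action dimensions: with one independent `Normal` per dimension, the joint log-density of the action vector is the sum of the per-dimension log-densities, and that sum is what enters the PPO ratio.

```python
import torch
from torch.distributions import Normal

mean = torch.zeros(1, 3)                 # a 3-dimensional action
std = torch.ones(1, 3)
dist = Normal(mean, std)
a = dist.sample()
per_dim = dist.log_prob(a)               # shape (1, 3): one log-density per dimension
joint = per_dim.sum(1, keepdim=True)     # shape (1, 1): log-density of the whole action vector
print(per_dim.shape, joint.shape)
```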
--------------------------------------------------------------------------------
/5.PPO-continuous/replaybuffer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | class ReplayBuffer:
6 | def __init__(self, args):
7 | self.s = np.zeros((args.batch_size, args.state_dim))
8 | self.a = np.zeros((args.batch_size, args.action_dim))
9 | self.a_logprob = np.zeros((args.batch_size, args.action_dim))
10 | self.r = np.zeros((args.batch_size, 1))
11 | self.s_ = np.zeros((args.batch_size, args.state_dim))
12 | self.dw = np.zeros((args.batch_size, 1))
13 | self.done = np.zeros((args.batch_size, 1))
14 | self.count = 0
15 |
16 | def store(self, s, a, a_logprob, r, s_, dw, done):
17 | self.s[self.count] = s
18 | self.a[self.count] = a
19 | self.a_logprob[self.count] = a_logprob
20 | self.r[self.count] = r
21 | self.s_[self.count] = s_
22 | self.dw[self.count] = dw
23 | self.done[self.count] = done
24 | self.count += 1
25 |
26 | def numpy_to_tensor(self):
27 | s = torch.tensor(self.s, dtype=torch.float)
28 | a = torch.tensor(self.a, dtype=torch.float)
29 | a_logprob = torch.tensor(self.a_logprob, dtype=torch.float)
30 | r = torch.tensor(self.r, dtype=torch.float)
31 | s_ = torch.tensor(self.s_, dtype=torch.float)
32 | dw = torch.tensor(self.dw, dtype=torch.float)
33 | done = torch.tensor(self.done, dtype=torch.float)
34 |
35 | return s, a, a_logprob, r, s_, dw, done
36 |
--------------------------------------------------------------------------------
/5.PPO-continuous/runs/readme.txt:
--------------------------------------------------------------------------------
1 | This folder is used to save the tensorboard data.
--------------------------------------------------------------------------------
/5.PPO-continuous/training_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/training_result.png
--------------------------------------------------------------------------------
/6.DDPG/DDPG.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | import copy
7 | from torch.utils.tensorboard import SummaryWriter
8 |
9 |
10 | class Actor(nn.Module):
11 | def __init__(self, state_dim, action_dim, hidden_width, max_action):
12 | super(Actor, self).__init__()
13 | self.max_action = max_action
14 | self.l1 = nn.Linear(state_dim, hidden_width)
15 | self.l2 = nn.Linear(hidden_width, hidden_width)
16 | self.l3 = nn.Linear(hidden_width, action_dim)
17 |
18 | def forward(self, s):
19 | s = F.relu(self.l1(s))
20 | s = F.relu(self.l2(s))
21 | a = self.max_action * torch.tanh(self.l3(s)) # [-max,max]
22 | return a
23 |
24 |
25 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a)
26 | def __init__(self, state_dim, action_dim, hidden_width):
27 | super(Critic, self).__init__()
28 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width)
29 | self.l2 = nn.Linear(hidden_width, hidden_width)
30 | self.l3 = nn.Linear(hidden_width, 1)
31 |
32 | def forward(self, s, a):
33 | q = F.relu(self.l1(torch.cat([s, a], 1)))
34 | q = F.relu(self.l2(q))
35 | q = self.l3(q)
36 | return q
37 |
38 |
39 | class ReplayBuffer(object):
40 | def __init__(self, state_dim, action_dim):
41 | self.max_size = int(1e6)
42 | self.count = 0
43 | self.size = 0
44 | self.s = np.zeros((self.max_size, state_dim))
45 | self.a = np.zeros((self.max_size, action_dim))
46 | self.r = np.zeros((self.max_size, 1))
47 | self.s_ = np.zeros((self.max_size, state_dim))
48 | self.dw = np.zeros((self.max_size, 1))
49 |
50 | def store(self, s, a, r, s_, dw):
51 | self.s[self.count] = s
52 | self.a[self.count] = a
53 | self.r[self.count] = r
54 | self.s_[self.count] = s_
55 | self.dw[self.count] = dw
56 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0.
57 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions
58 |
59 | def sample(self, batch_size):
60 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling
61 | batch_s = torch.tensor(self.s[index], dtype=torch.float)
62 | batch_a = torch.tensor(self.a[index], dtype=torch.float)
63 | batch_r = torch.tensor(self.r[index], dtype=torch.float)
64 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float)
65 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float)
66 |
67 | return batch_s, batch_a, batch_r, batch_s_, batch_dw
68 |
69 |
70 | class DDPG(object):
71 | def __init__(self, state_dim, action_dim, max_action):
72 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network
73 | self.batch_size = 256 # batch size
74 | self.GAMMA = 0.99 # discount factor
75 | self.TAU = 0.005 # Softly update the target network
76 | self.lr = 3e-4 # learning rate
77 |
78 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action)
79 | self.actor_target = copy.deepcopy(self.actor)
80 | self.critic = Critic(state_dim, action_dim, self.hidden_width)
81 | self.critic_target = copy.deepcopy(self.critic)
82 |
83 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
84 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)
85 |
86 | self.MseLoss = nn.MSELoss()
87 |
88 | def choose_action(self, s):
89 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
90 | a = self.actor(s).data.numpy().flatten()
91 | return a
92 |
93 | def learn(self, replay_buffer):
94 | batch_s, batch_a, batch_r, batch_s_, batch_dw = replay_buffer.sample(self.batch_size) # Sample a batch
95 |
96 | # Compute the target Q
97 | with torch.no_grad(): # target_Q has no gradient
98 | Q_ = self.critic_target(batch_s_, self.actor_target(batch_s_))
99 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * Q_
100 |
101 | # Compute the current Q and the critic loss
102 | current_Q = self.critic(batch_s, batch_a)
103 | critic_loss = self.MseLoss(target_Q, current_Q)
104 | # Optimize the critic
105 | self.critic_optimizer.zero_grad()
106 | critic_loss.backward()
107 | self.critic_optimizer.step()
108 |
109 | # Freeze critic networks so you don't waste computational effort
110 | for params in self.critic.parameters():
111 | params.requires_grad = False
112 |
113 | # Compute the actor loss
114 | actor_loss = -self.critic(batch_s, self.actor(batch_s)).mean()
115 | # Optimize the actor
116 | self.actor_optimizer.zero_grad()
117 | actor_loss.backward()
118 | self.actor_optimizer.step()
119 |
120 | # Unfreeze critic networks
121 | for params in self.critic.parameters():
122 | params.requires_grad = True
123 |
124 | # Softly update the target networks
125 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
126 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
127 |
128 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
129 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
130 |
131 |
132 | def evaluate_policy(env, agent):
133 | times = 3 # Perform three evaluations and calculate the average
134 | evaluate_reward = 0
135 | for _ in range(times):
136 | s = env.reset()
137 | done = False
138 | episode_reward = 0
139 | while not done:
140 | a = agent.choose_action(s) # We do not add noise when evaluating
141 | s_, r, done, _ = env.step(a)
142 | episode_reward += r
143 | s = s_
144 | evaluate_reward += episode_reward
145 |
146 | return int(evaluate_reward / times)
147 |
148 |
149 | def reward_adapter(r, env_index):
150 | if env_index == 0: # Pendulum-v1
151 | r = (r + 8) / 8
152 | elif env_index == 1: # BipedalWalker-v3
153 | if r <= -100:
154 | r = -1
155 | return r
156 |
157 |
158 | if __name__ == '__main__':
159 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
160 | env_index = 0
161 | env = gym.make(env_name[env_index])
162 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
163 | number = 1
164 | # Set random seed
165 | seed = 0
166 | env.seed(seed)
167 | env.action_space.seed(seed)
168 | env_evaluate.seed(seed)
169 | env_evaluate.action_space.seed(seed)
170 | np.random.seed(seed)
171 | torch.manual_seed(seed)
172 |
173 | state_dim = env.observation_space.shape[0]
174 | action_dim = env.action_space.shape[0]
175 | max_action = float(env.action_space.high[0])
176 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
177 | print("env={}".format(env_name[env_index]))
178 | print("state_dim={}".format(state_dim))
179 | print("action_dim={}".format(action_dim))
180 | print("max_action={}".format(max_action))
181 | print("max_episode_steps={}".format(max_episode_steps))
182 |
183 | agent = DDPG(state_dim, action_dim, max_action)
184 | replay_buffer = ReplayBuffer(state_dim, action_dim)
185 | # Build a tensorboard
186 | writer = SummaryWriter(log_dir='runs/DDPG/DDPG_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed))
187 |
188 | noise_std = 0.1 * max_action # the std of Gaussian noise for exploration
189 | max_train_steps = 3e6 # Maximum number of training steps
190 | random_steps = 25e3 # Take random actions at the beginning for better exploration
191 | update_freq = 50 # Take 50 steps,then update the networks 50 times
192 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps
193 | evaluate_num = 0 # Record the number of evaluations
194 | evaluate_rewards = [] # Record the rewards during the evaluating
195 | total_steps = 0 # Record the total steps during the training
196 |
197 | while total_steps < max_train_steps:
198 | s = env.reset()
199 | episode_steps = 0
200 | done = False
201 | while not done:
202 | episode_steps += 1
203 | if total_steps < random_steps: # Take random actions at the beginning for better exploration
204 | a = env.action_space.sample()
205 | else:
206 | # Add Gaussian noise to actions for exploration
207 | a = agent.choose_action(s)
208 | a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action)
209 | s_, r, done, _ = env.step(a)
210 | r = reward_adapter(r, env_index) # Adjust rewards for better performance
211 | # When dead or win or reaching the max_episode_steps, done will be True, and we need to distinguish these cases;
212 | # dw means dead or win, in which case there is no next state s';
213 | # but when reaching the max_episode_steps, there actually is a next state s'.
214 | if done and episode_steps != max_episode_steps:
215 | dw = True
216 | else:
217 | dw = False
218 | replay_buffer.store(s, a, r, s_, dw) # Store the transition
219 | s = s_
220 |
221 | # Take 50 steps,then update the networks 50 times
222 | if total_steps >= random_steps and total_steps % update_freq == 0:
223 | for _ in range(update_freq):
224 | agent.learn(replay_buffer)
225 |
226 | # Evaluate the policy every 'evaluate_freq' steps
227 | if (total_steps + 1) % evaluate_freq == 0:
228 | evaluate_num += 1
229 | evaluate_reward = evaluate_policy(env_evaluate, agent)
230 | evaluate_rewards.append(evaluate_reward)
231 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward))
232 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
233 | # Save the rewards
234 | if evaluate_num % 10 == 0:
235 | np.save('./data_train/DDPG_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards))
236 |
237 | total_steps += 1
238 |
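A minimal, self-contained sketch (not part of the repository) of the soft (Polyak) target-network update performed at the end of `learn()` above: target <- TAU * online + (1 - TAU) * target. The `nn.Linear` module is just a stand-in for the actor or critic.

```python
import copy
import torch
import torch.nn as nn

TAU = 0.005
online = nn.Linear(4, 2)                 # stand-in for the actor or critic network
target = copy.deepcopy(online)

with torch.no_grad():
    for param, target_param in zip(online.parameters(), target.parameters()):
        # Move the target parameters a small step towards the online parameters
        target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)
```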
--------------------------------------------------------------------------------
/7.TD3/README.md:
--------------------------------------------------------------------------------
1 | # TD3
2 | This is a concise Pytorch implementation of TD3 (Twin Delayed DDPG) on continuous action space.
3 |
4 |
5 | ## How to use my code?
6 | You can directly run 'TD3.py' in your own IDE.
7 |
8 | ### Training environments
9 | You can set 'env_index' in the code to change the environment. Here, we train our code in 5 environments.
10 | env_index=0 represents 'Pendulum-v1'
11 | env_index=1 represents 'BipedalWalker-v3'
12 | env_index=2 represents 'HalfCheetah-v2'
13 | env_index=3 represents 'Hopper-v2'
14 | env_index=4 represents 'Walker2d-v2'
15 |
16 | ### How to see the training results?
17 | You can use the tensorboard to visualize the training curves, which are saved in the file 'runs'.
18 | The rewards data are saved as numpy in the file 'data_train'.
19 | The training curves are shown below, which are smoothed by averaging over a window of 10 steps.
20 | The solid line and the shadow respectively represent the average and standard deviation over three different random seeds. (seed=0, 10, 100)
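
A minimal sketch (not part of the repository) of the kind of smoothing described above: a moving average over a window of 10 evaluation points, applied to one of the saved reward curves (the path is an example and assumes the script is run from the 7.TD3 folder).

```python
import numpy as np

rewards = np.load('./data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy')
window = 10
# Moving average; 'valid' drops the edges where the window does not fully fit
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
```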
21 |
22 | 
23 |
24 | ## Reference
25 | [1] Fujimoto S, Hoof H, Meger D. Addressing function approximation error in actor-critic methods[C]//International conference on machine learning. PMLR, 2018: 1587-1596.
26 |
--------------------------------------------------------------------------------
/7.TD3/TD3.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | import copy
7 | from torch.utils.tensorboard import SummaryWriter
8 |
9 |
10 | class Actor(nn.Module):
11 | def __init__(self, state_dim, action_dim, hidden_width, max_action):
12 | super(Actor, self).__init__()
13 | self.max_action = max_action
14 | self.l1 = nn.Linear(state_dim, hidden_width)
15 | self.l2 = nn.Linear(hidden_width, hidden_width)
16 | self.l3 = nn.Linear(hidden_width, action_dim)
17 |
18 | def forward(self, s):
19 | s = F.relu(self.l1(s))
20 | s = F.relu(self.l2(s))
21 | a = self.max_action * torch.tanh(self.l3(s)) # [-max,max]
22 | return a
23 |
24 |
25 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a)
26 | def __init__(self, state_dim, action_dim, hidden_width):
27 | super(Critic, self).__init__()
28 | # Q1
29 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width)
30 | self.l2 = nn.Linear(hidden_width, hidden_width)
31 | self.l3 = nn.Linear(hidden_width, 1)
32 | # Q2
33 | self.l4 = nn.Linear(state_dim + action_dim, hidden_width)
34 | self.l5 = nn.Linear(hidden_width, hidden_width)
35 | self.l6 = nn.Linear(hidden_width, 1)
36 |
37 | def forward(self, s, a):
38 | s_a = torch.cat([s, a], 1)
39 | q1 = F.relu(self.l1(s_a))
40 | q1 = F.relu(self.l2(q1))
41 | q1 = self.l3(q1)
42 |
43 | q2 = F.relu(self.l4(s_a))
44 | q2 = F.relu(self.l5(q2))
45 | q2 = self.l6(q2)
46 |
47 | return q1, q2
48 |
49 | def Q1(self, s, a):
50 | s_a = torch.cat([s, a], 1)
51 | q1 = F.relu(self.l1(s_a))
52 | q1 = F.relu(self.l2(q1))
53 | q1 = self.l3(q1)
54 |
55 | return q1
56 |
57 |
58 | class ReplayBuffer(object):
59 | def __init__(self, state_dim, action_dim):
60 | self.max_size = int(1e6)
61 | self.count = 0
62 | self.size = 0
63 | self.s = np.zeros((self.max_size, state_dim))
64 | self.a = np.zeros((self.max_size, action_dim))
65 | self.r = np.zeros((self.max_size, 1))
66 | self.s_ = np.zeros((self.max_size, state_dim))
67 | self.dw = np.zeros((self.max_size, 1))
68 |
69 | def store(self, s, a, r, s_, dw):
70 | self.s[self.count] = s
71 | self.a[self.count] = a
72 | self.r[self.count] = r
73 | self.s_[self.count] = s_
74 | self.dw[self.count] = dw
75 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0.
76 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions
77 |
78 | def sample(self, batch_size):
79 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling
80 | batch_s = torch.tensor(self.s[index], dtype=torch.float)
81 | batch_a = torch.tensor(self.a[index], dtype=torch.float)
82 | batch_r = torch.tensor(self.r[index], dtype=torch.float)
83 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float)
84 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float)
85 |
86 | return batch_s, batch_a, batch_r, batch_s_, batch_dw
87 |
88 |
89 | class TD3(object):
90 | def __init__(self, state_dim, action_dim, max_action):
91 | self.max_action = max_action
92 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network
93 | self.batch_size = 256 # batch size
94 | self.GAMMA = 0.99 # discount factor
95 | self.TAU = 0.005 # Softly update the target network
96 | self.lr = 3e-4 # learning rate
97 | self.policy_noise = 0.2 * max_action # The noise for the trick 'target policy smoothing'
98 | self.noise_clip = 0.5 * max_action # Clip the noise
99 | self.policy_freq = 2 # The frequency of policy updates
100 | self.actor_pointer = 0
101 |
102 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action)
103 | self.actor_target = copy.deepcopy(self.actor)
104 | self.critic = Critic(state_dim, action_dim, self.hidden_width)
105 | self.critic_target = copy.deepcopy(self.critic)
106 |
107 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
108 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)
109 |
110 | def choose_action(self, s):
111 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
112 | a = self.actor(s).data.numpy().flatten()
113 | return a
114 |
115 | def learn(self, replay_buffer):
116 | self.actor_pointer += 1
117 | batch_s, batch_a, batch_r, batch_s_, batch_dw = replay_buffer.sample(self.batch_size) # Sample a batch
118 |
119 | # Compute the target Q
120 | with torch.no_grad(): # target_Q has no gradient
121 | # Trick 1:target policy smoothing
122 | # torch.randn_like can generate random numbers sampled from N(0,1),which have the same size as 'batch_a'
123 | noise = (torch.randn_like(batch_a) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
124 | next_action = (self.actor_target(batch_s_) + noise).clamp(-self.max_action, self.max_action)
125 |
126 | # Trick 2:clipped double Q-learning
127 | target_Q1, target_Q2 = self.critic_target(batch_s_, next_action)
128 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * torch.min(target_Q1, target_Q2)
129 |
130 | # Get the current Q
131 | current_Q1, current_Q2 = self.critic(batch_s, batch_a)
132 | # Compute the critic loss
133 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
134 | # Optimize the critic
135 | self.critic_optimizer.zero_grad()
136 | critic_loss.backward()
137 | self.critic_optimizer.step()
138 |
139 | # Trick 3:delayed policy updates
140 | if self.actor_pointer % self.policy_freq == 0:
141 | # Freeze critic networks so you don't waste computational effort
142 | for params in self.critic.parameters():
143 | params.requires_grad = False
144 |
145 | # Compute actor loss
146 | actor_loss = -self.critic.Q1(batch_s, self.actor(batch_s)).mean() # Only use Q1
147 | # Optimize the actor
148 | self.actor_optimizer.zero_grad()
149 | actor_loss.backward()
150 | self.actor_optimizer.step()
151 |
152 | # Unfreeze critic networks
153 | for params in self.critic.parameters():
154 | params.requires_grad = True
155 |
156 | # Softly update the target networks
157 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
158 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
159 |
160 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
161 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
162 |
163 |
164 | def evaluate_policy(env, agent):
165 | times = 3 # Perform three evaluations and calculate the average
166 | evaluate_reward = 0
167 | for _ in range(times):
168 | s = env.reset()
169 | done = False
170 | episode_reward = 0
171 | while not done:
172 | a = agent.choose_action(s) # We do not add noise when evaluating
173 | s_, r, done, _ = env.step(a)
174 | episode_reward += r
175 | s = s_
176 | evaluate_reward += episode_reward
177 |
178 | return int(evaluate_reward / times)
179 |
180 |
181 | def reward_adapter(r, env_index):
182 | if env_index == 0: # Pendulum-v1
183 | r = (r + 8) / 8
184 | elif env_index == 1: # BipedalWalker-v3
185 | if r <= -100:
186 | r = -1
187 | return r
188 |
189 |
190 | if __name__ == '__main__':
191 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
192 | env_index = 0
193 | env = gym.make(env_name[env_index])
194 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
195 | number = 1
196 | # Set random seed
197 | seed = 0
198 | env.seed(seed)
199 | env.action_space.seed(seed)
200 | env_evaluate.seed(seed)
201 | env_evaluate.action_space.seed(seed)
202 | np.random.seed(seed)
203 | torch.manual_seed(seed)
204 |
205 | state_dim = env.observation_space.shape[0]
206 | action_dim = env.action_space.shape[0]
207 | max_action = float(env.action_space.high[0])
208 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
209 | print("env={}".format(env_name[env_index]))
210 | print("state_dim={}".format(state_dim))
211 | print("action_dim={}".format(action_dim))
212 | print("max_action={}".format(max_action))
213 | print("max_episode_steps={}".format(max_episode_steps))
214 |
215 | agent = TD3(state_dim, action_dim, max_action)
216 | replay_buffer = ReplayBuffer(state_dim, action_dim)
217 | # Build a tensorboard
218 | writer = SummaryWriter(log_dir='runs/TD3/TD3_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed))
219 |
220 | noise_std = 0.1 * max_action # the std of Gaussian noise for exploration
221 | max_train_steps = 3e6 # Maximum number of training steps
222 | random_steps = 25e3 # Take random actions at the beginning for better exploration
223 | evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps
224 | evaluate_num = 0 # Record the number of evaluations
225 | evaluate_rewards = [] # Record the rewards during the evaluating
226 | total_steps = 0 # Record the total steps during the training
227 |
228 | while total_steps < max_train_steps:
229 | s = env.reset()
230 | episode_steps = 0
231 | done = False
232 | while not done:
233 | episode_steps += 1
234 | if total_steps < random_steps: # Take random actions at the beginning for better exploration
235 | a = env.action_space.sample()
236 | else:
237 | # Add Gaussian noise to action for exploration
238 | a = agent.choose_action(s)
239 | a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action)
240 | s_, r, done, _ = env.step(a)
241 | r = reward_adapter(r, env_index) # Adjust rewards for better performance
242 | # When dead or win or reaching the max_episode_steps, done will be True, and we need to distinguish these cases;
243 | # dw means dead or win, in which case there is no next state s';
244 | # but when reaching the max_episode_steps, there actually is a next state s'.
245 | if done and episode_steps != max_episode_steps:
246 | dw = True
247 | else:
248 | dw = False
249 | replay_buffer.store(s, a, r, s_, dw) # Store the transition
250 | s = s_
251 |
252 | # Update one step
253 | if total_steps >= random_steps:
254 | agent.learn(replay_buffer)
255 |
256 | # Evaluate the policy every 'evaluate_freq' steps
257 | if (total_steps + 1) % evaluate_freq == 0:
258 | evaluate_num += 1
259 | evaluate_reward = evaluate_policy(env_evaluate, agent)
260 | evaluate_rewards.append(evaluate_reward)
261 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward))
262 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
263 | # Save the rewards
264 | if evaluate_num % 10 == 0:
265 | np.save('./data_train/TD3_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards))
266 |
267 | total_steps += 1
268 |
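A minimal, self-contained sketch (not part of the repository) of TD3's target computation from `learn()` above on dummy tensors: Trick 1 (target policy smoothing) adds clipped noise to the target actor's action, and Trick 2 (clipped double Q-learning) takes the smaller of the two target critics.

```python
import torch

max_action = 1.0
policy_noise, noise_clip, GAMMA = 0.2 * max_action, 0.5 * max_action, 0.99

batch_a_ = torch.rand(5, 2) * 2 - 1                       # target actor's next actions in [-1, 1]
noise = (torch.randn_like(batch_a_) * policy_noise).clamp(-noise_clip, noise_clip)
next_action = (batch_a_ + noise).clamp(-max_action, max_action)   # Trick 1: smoothed target action

q1, q2 = torch.randn(5, 1), torch.randn(5, 1)             # stand-ins for the two target critics
batch_r, batch_dw = torch.ones(5, 1), torch.zeros(5, 1)
target_Q = batch_r + GAMMA * (1 - batch_dw) * torch.min(q1, q2)   # Trick 2: the smaller Q limits overestimation
print(target_Q.shape)
```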
--------------------------------------------------------------------------------
/7.TD3/TD3_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/TD3_result.png
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_0.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_10.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_100.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_10.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_100.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_0.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_10.npy
--------------------------------------------------------------------------------
/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_100.npy
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_0/events.out.tfevents.1648952137.李智.93956.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_0/events.out.tfevents.1648952137.李智.93956.0
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_10/events.out.tfevents.1648882414.李智.81744.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_10/events.out.tfevents.1648882414.李智.81744.0
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_100/events.out.tfevents.1648925401.李智.81744.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_100/events.out.tfevents.1648925401.李智.81744.1
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_0/events.out.tfevents.1648909506.李智.60360.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_0/events.out.tfevents.1648909506.李智.60360.2
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_10/events.out.tfevents.1648800524.李智.60360.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_10/events.out.tfevents.1648800524.李智.60360.0
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_100/events.out.tfevents.1648852975.李智.60360.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_100/events.out.tfevents.1648852975.李智.60360.1
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_0/events.out.tfevents.1649010066.李智.85868.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_0/events.out.tfevents.1649010066.李智.85868.2
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_10/events.out.tfevents.1648901654.李智.85868.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_10/events.out.tfevents.1648901654.李智.85868.0
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_100/events.out.tfevents.1648956951.李智.85868.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_100/events.out.tfevents.1648956951.李智.85868.1
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_0/events.out.tfevents.1649065960.李智.18392.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_0/events.out.tfevents.1649065960.李智.18392.2
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_10/events.out.tfevents.1649057339.李智.18392.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_10/events.out.tfevents.1649057339.李智.18392.0
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_100/events.out.tfevents.1649061632.李智.18392.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_100/events.out.tfevents.1649061632.李智.18392.1
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_0/events.out.tfevents.1648846023.李智.76672.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_0/events.out.tfevents.1648846023.李智.76672.2
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_10/events.out.tfevents.1648735005.李智.76672.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_10/events.out.tfevents.1648735005.李智.76672.0
--------------------------------------------------------------------------------
/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_100/events.out.tfevents.1648793243.李智.76672.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_100/events.out.tfevents.1648793243.李智.76672.1
--------------------------------------------------------------------------------
/8.SAC/SAC-continuous.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | import copy
7 | from torch.utils.tensorboard import SummaryWriter
8 | from torch.distributions import Normal
9 |
10 |
11 | class Actor(nn.Module):
12 | def __init__(self, state_dim, action_dim, hidden_width, max_action):
13 | super(Actor, self).__init__()
14 | self.max_action = max_action
15 | self.l1 = nn.Linear(state_dim, hidden_width)
16 | self.l2 = nn.Linear(hidden_width, hidden_width)
17 | self.mean_layer = nn.Linear(hidden_width, action_dim)
18 | self.log_std_layer = nn.Linear(hidden_width, action_dim)
19 |
20 | def forward(self, x, deterministic=False, with_logprob=True):
21 | x = F.relu(self.l1(x))
22 | x = F.relu(self.l2(x))
23 | mean = self.mean_layer(x)
24 | log_std = self.log_std_layer(x) # We output the log_std to ensure that std=exp(log_std)>0
25 | log_std = torch.clamp(log_std, -20, 2)
26 | std = torch.exp(log_std)
27 |
28 | dist = Normal(mean, std) # Generate a Gaussian distribution
29 | if deterministic: # When evaluating, we use the deterministic policy
30 | a = mean
31 | else:
32 | a = dist.rsample() # reparameterization trick: mean+std*N(0,1)
33 |
34 | if with_logprob: # This tanh-squashing correction for log_prob follows OpenAI Spinning Up; it is numerically more stable than computing log(1 - tanh(a)^2) directly.
35 | log_pi = dist.log_prob(a).sum(dim=1, keepdim=True)
36 | log_pi -= (2 * (np.log(2) - a - F.softplus(-2 * a))).sum(dim=1, keepdim=True)
37 | else:
38 | log_pi = None
39 |
40 | a = self.max_action * torch.tanh(a) # Use tanh to compress the unbounded Gaussian distribution into a bounded action interval.
41 |
42 | return a, log_pi
43 |
44 |
45 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a)
46 | def __init__(self, state_dim, action_dim, hidden_width):
47 | super(Critic, self).__init__()
48 | # Q1
49 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width)
50 | self.l2 = nn.Linear(hidden_width, hidden_width)
51 | self.l3 = nn.Linear(hidden_width, 1)
52 | # Q2
53 | self.l4 = nn.Linear(state_dim + action_dim, hidden_width)
54 | self.l5 = nn.Linear(hidden_width, hidden_width)
55 | self.l6 = nn.Linear(hidden_width, 1)
56 |
57 | def forward(self, s, a):
58 | s_a = torch.cat([s, a], 1)
59 | q1 = F.relu(self.l1(s_a))
60 | q1 = F.relu(self.l2(q1))
61 | q1 = self.l3(q1)
62 |
63 | q2 = F.relu(self.l4(s_a))
64 | q2 = F.relu(self.l5(q2))
65 | q2 = self.l6(q2)
66 |
67 | return q1, q2
68 |
69 |
70 | class ReplayBuffer(object):
71 | def __init__(self, state_dim, action_dim):
72 | self.max_size = int(1e6)
73 | self.count = 0
74 | self.size = 0
75 | self.s = np.zeros((self.max_size, state_dim))
76 | self.a = np.zeros((self.max_size, action_dim))
77 | self.r = np.zeros((self.max_size, 1))
78 | self.s_ = np.zeros((self.max_size, state_dim))
79 | self.dw = np.zeros((self.max_size, 1))
80 |
81 | def store(self, s, a, r, s_, dw):
82 | self.s[self.count] = s
83 | self.a[self.count] = a
84 | self.r[self.count] = r
85 | self.s_[self.count] = s_
86 | self.dw[self.count] = dw
87 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0.
88 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions
89 |
90 | def sample(self, batch_size):
91 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling
92 | batch_s = torch.tensor(self.s[index], dtype=torch.float)
93 | batch_a = torch.tensor(self.a[index], dtype=torch.float)
94 | batch_r = torch.tensor(self.r[index], dtype=torch.float)
95 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float)
96 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float)
97 |
98 | return batch_s, batch_a, batch_r, batch_s_, batch_dw
99 |
100 |
101 | class SAC(object):
102 | def __init__(self, state_dim, action_dim, max_action):
103 | self.max_action = max_action
104 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network
105 | self.batch_size = 256 # batch size
106 | self.GAMMA = 0.99 # discount factor
107 | self.TAU = 0.005 # Softly update the target network
108 | self.lr = 3e-4 # learning rate
109 | self.adaptive_alpha = True # Whether to automatically learn the temperature alpha
110 | if self.adaptive_alpha:
111 | # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
112 | self.target_entropy = -action_dim
113 | # We learn log_alpha instead of alpha to ensure that alpha=exp(log_alpha)>0
114 | self.log_alpha = torch.zeros(1, requires_grad=True)
115 | self.alpha = self.log_alpha.exp()
116 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.lr)
117 | else:
118 | self.alpha = 0.2
119 |
120 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action)
121 | self.critic = Critic(state_dim, action_dim, self.hidden_width)
122 | self.critic_target = copy.deepcopy(self.critic)
123 |
124 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
125 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)
126 |
127 | def choose_action(self, s, deterministic=False):
128 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
129 | a, _ = self.actor(s, deterministic, False) # When choosing actions, we do not need to compute log_pi
130 | return a.data.numpy().flatten()
131 |
132 | def learn(self, replay_buffer):
133 | batch_s, batch_a, batch_r, batch_s_, batch_dw = replay_buffer.sample(self.batch_size) # Sample a batch
134 |
135 | with torch.no_grad():
136 | batch_a_, log_pi_ = self.actor(batch_s_) # a' from the current policy
137 | # Compute target Q
138 | target_Q1, target_Q2 = self.critic_target(batch_s_, batch_a_)
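# Soft Bellman target: y = r + GAMMA * (1 - dw) * (min(Q1', Q2') - alpha * log_pi(a'|s')); the -alpha*log_pi term is SAC's entropy bonus on top of the twin-Q minimum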
139 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * (torch.min(target_Q1, target_Q2) - self.alpha * log_pi_)
140 |
141 | # Compute current Q
142 | current_Q1, current_Q2 = self.critic(batch_s, batch_a)
143 | # Compute critic loss
144 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
145 | # Optimize the critic
146 | self.critic_optimizer.zero_grad()
147 | critic_loss.backward()
148 | self.critic_optimizer.step()
149 |
150 | # Freeze the critic networks so we don't waste computation on their gradients during the actor update
151 | for params in self.critic.parameters():
152 | params.requires_grad = False
153 |
154 | # Compute actor loss
155 | a, log_pi = self.actor(batch_s)
156 | Q1, Q2 = self.critic(batch_s, a)
157 | Q = torch.min(Q1, Q2)
158 | actor_loss = (self.alpha * log_pi - Q).mean()
159 |
160 | # Optimize the actor
161 | self.actor_optimizer.zero_grad()
162 | actor_loss.backward()
163 | self.actor_optimizer.step()
164 |
165 | # Unfreeze critic networks
166 | for params in self.critic.parameters():
167 | params.requires_grad = True
168 |
169 | # Update alpha
170 | if self.adaptive_alpha:
171 | # We learn log_alpha instead of alpha to ensure that alpha=exp(log_alpha)>0
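# Minimizing alpha_loss increases alpha when the policy entropy falls below the target entropy (-action_dim) and decreases it otherwise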
172 | alpha_loss = -(self.log_alpha.exp() * (log_pi + self.target_entropy).detach()).mean()
173 | self.alpha_optimizer.zero_grad()
174 | alpha_loss.backward()
175 | self.alpha_optimizer.step()
176 | self.alpha = self.log_alpha.exp()
177 |
178 | # Softly update target networks
179 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
180 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
181 |
182 |
183 | def evaluate_policy(env, agent):
184 | times = 3 # Perform three evaluations and calculate the average
185 | evaluate_reward = 0
186 | for _ in range(times):
187 | s = env.reset()
188 | done = False
189 | episode_reward = 0
190 | while not done:
191 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during evaluation
192 | s_, r, done, _ = env.step(a)
193 | episode_reward += r
194 | s = s_
195 | evaluate_reward += episode_reward
196 |
197 | return int(evaluate_reward / times)
198 |
199 |
200 | def reward_adapter(r, env_index):
201 | if env_index == 0: # Pendulum-v1
202 | r = (r + 8) / 8
203 | elif env_index == 1: # BipedalWalker-v3
204 | if r <= -100:
205 | r = -1
206 | return r
207 |
208 |
209 | if __name__ == '__main__':
210 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
211 | env_index = 0
212 | env = gym.make(env_name[env_index])
213 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
214 | number = 1
215 | seed = 0
216 | # Set random seed
217 | env.seed(seed)
218 | env.action_space.seed(seed)
219 | env_evaluate.seed(seed)
220 | env_evaluate.action_space.seed(seed)
221 | np.random.seed(seed)
222 | torch.manual_seed(seed)
223 |
224 | state_dim = env.observation_space.shape[0]
225 | action_dim = env.action_space.shape[0]
226 | max_action = float(env.action_space.high[0])
227 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode
228 | print("env={}".format(env_name[env_index]))
229 | print("state_dim={}".format(state_dim))
230 | print("action_dim={}".format(action_dim))
231 | print("max_action={}".format(max_action))
232 | print("max_episode_steps={}".format(max_episode_steps))
233 |
234 | agent = SAC(state_dim, action_dim, max_action)
235 | replay_buffer = ReplayBuffer(state_dim, action_dim)
236 | # Build a tensorboard
237 | writer = SummaryWriter(log_dir='runs/SAC/SAC_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed))
238 |
239 | max_train_steps = 3e6 # Maximum number of training steps
240 | random_steps = 25e3 # Take random actions at the beginning for better exploration
241 | evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps
242 | evaluate_num = 0 # Record the number of evaluations
243 | evaluate_rewards = [] # Record the rewards during evaluation
244 | total_steps = 0 # Record the total steps during the training
245 |
246 | while total_steps < max_train_steps:
247 | s = env.reset()
248 | episode_steps = 0
249 | done = False
250 | while not done:
251 | episode_steps += 1
252 | if total_steps < random_steps: # Take random actions at the beginning for better exploration
253 | a = env.action_space.sample()
254 | else:
255 | a = agent.choose_action(s)
256 | s_, r, done, _ = env.step(a)
257 | r = reward_adapter(r, env_index) # Adjust rewards for better performance
258 | # When the agent dies, wins, or reaches max_episode_steps, done will be True, and we need to distinguish these cases:
259 | # dw means dead or win, in which case there is no next state s';
260 | # but when max_episode_steps is reached, a next state s' actually exists.
261 | if done and episode_steps != max_episode_steps:
262 | dw = True
263 | else:
264 | dw = False
265 | replay_buffer.store(s, a, r, s_, dw) # Store the transition
266 | s = s_
267 |
268 | if total_steps >= random_steps:
269 | agent.learn(replay_buffer)
270 |
271 | # Evaluate the policy every 'evaluate_freq' steps
272 | if (total_steps + 1) % evaluate_freq == 0:
273 | evaluate_num += 1
274 | evaluate_reward = evaluate_policy(env_evaluate, agent)
275 | evaluate_rewards.append(evaluate_reward)
276 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward))
277 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
278 | # Save the rewards
279 | if evaluate_num % 10 == 0:
280 | np.save('./data_train/SAC_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards))
281 |
282 | total_steps += 1
283 |
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/PPO+RNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/PPO+RNN.png
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/PPO_discrete_rnn_main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch.utils.tensorboard import SummaryWriter
4 | import gym
5 | import argparse
6 | from normalization import Normalization, RewardScaling
7 | from replaybuffer import ReplayBuffer
8 | from ppo_discrete_rnn import PPO_discrete_RNN
9 |
10 |
11 | class Runner:
12 | def __init__(self, args, env_name, number, seed):
13 | self.args = args
14 | self.env_name = env_name
15 | self.number = number
16 | self.seed = seed
17 |
18 | # Create env
19 | self.env = gym.make(env_name)
20 | # Set random seed
21 | np.random.seed(self.seed)
22 | torch.manual_seed(self.seed)
23 | self.env.seed(seed)
24 | self.env.action_space.seed(seed)
25 |
26 | self.args.state_dim = self.env.observation_space.shape[0]
27 | self.args.action_dim = self.env.action_space.n
28 | self.args.episode_limit = self.env._max_episode_steps # Maximum number of steps per episode
29 | print("env={}".format(env_name))
30 | print("state_dim={}".format(args.state_dim))
31 | print("action_dim={}".format(args.action_dim))
32 | print("episode_limit={}".format(args.episode_limit))
33 |
34 | self.replay_buffer = ReplayBuffer(args)
35 | self.agent = PPO_discrete_RNN(args)
36 |
37 | # Create a tensorboard
38 | self.writer = SummaryWriter(log_dir='runs/PPO_discrete/env_{}_number_{}_seed_{}'.format(env_name, number, seed))
39 |
40 | self.evaluate_rewards = [] # Record the rewards during evaluation
41 | self.total_steps = 0
42 |
43 | if self.args.use_state_norm:
44 | print("------use state normalization------")
45 | self.state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization
46 | if self.args.use_reward_scaling:
47 | print("------use reward scaling------")
48 | self.reward_scaling = RewardScaling(shape=1, gamma=self.args.gamma)
49 |
50 | def run(self, ):
51 | evaluate_num = -1 # Record the number of evaluations
52 | while self.total_steps < self.args.max_train_steps:
53 | if self.total_steps // self.args.evaluate_freq > evaluate_num:
54 | self.evaluate_policy() # Evaluate the policy every 'evaluate_freq' steps
55 | evaluate_num += 1
56 |
57 | _, episode_steps = self.run_episode() # Run an episode
58 | self.total_steps += episode_steps
59 |
60 | if self.replay_buffer.episode_num == self.args.batch_size:
61 | self.agent.train(self.replay_buffer, self.total_steps) # Training
62 | self.replay_buffer.reset_buffer()
63 |
64 | self.evaluate_policy()
65 | self.env.close()
66 |
67 | def run_episode(self, ):
68 | episode_reward = 0
69 | s = self.env.reset()
70 | if self.args.use_reward_scaling:
71 | self.reward_scaling.reset()
72 | self.agent.reset_rnn_hidden()
73 | for episode_step in range(self.args.episode_limit):
74 | if self.args.use_state_norm:
75 | s = self.state_norm(s)
76 | a, a_logprob = self.agent.choose_action(s, evaluate=False)
77 | v = self.agent.get_value(s)
78 | s_, r, done, _ = self.env.step(a)
79 | episode_reward += r
80 |
81 | if done and episode_step + 1 != self.args.episode_limit:
82 | dw = True
83 | else:
84 | dw = False
85 | if self.args.use_reward_scaling:
86 | r = self.reward_scaling(r)
87 | # Store the transition
88 | self.replay_buffer.store_transition(episode_step, s, v, a, a_logprob, r, dw)
89 | s = s_
90 | if done:
91 | break
92 |
93 | # An episode is over, store v in the last step
94 | if self.args.use_state_norm:
95 | s = self.state_norm(s)
96 | v = self.agent.get_value(s)
97 | self.replay_buffer.store_last_value(episode_step + 1, v)
98 |
99 | return episode_reward, episode_step + 1
100 |
101 | def evaluate_policy(self, ):
102 | evaluate_reward = 0
103 | for _ in range(self.args.evaluate_times):
104 | episode_reward, done = 0, False
105 | s = self.env.reset()
106 | self.agent.reset_rnn_hidden()
107 | while not done:
108 | if self.args.use_state_norm:
109 | s = self.state_norm(s, update=False)
110 | a, a_logprob = self.agent.choose_action(s, evaluate=True)
111 | s_, r, done, _ = self.env.step(a)
112 | episode_reward += r
113 | s = s_
114 | evaluate_reward += episode_reward
115 |
116 | evaluate_reward = evaluate_reward / self.args.evaluate_times
117 | self.evaluate_rewards.append(evaluate_reward)
118 | print("total_steps:{} \t evaluate_reward:{}".format(self.total_steps, evaluate_reward))
119 | self.writer.add_scalar('evaluate_step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps)
120 | # Save the rewards and models
121 | np.save('./data_train/PPO_env_{}_number_{}_seed_{}.npy'.format(self.env_name, self.number, self.seed), np.array(self.evaluate_rewards))
122 |
123 |
124 | if __name__ == '__main__':
125 | parser = argparse.ArgumentParser("Hyperparameter Setting for PPO-discrete")
126 | parser.add_argument("--max_train_steps", type=int, default=int(2e5), help=" Maximum number of training steps")
127 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps")
128 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency")
129 | parser.add_argument("--evaluate_times", type=float, default=3, help="Evaluate times")
130 |
131 | parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
132 | parser.add_argument("--mini_batch_size", type=int, default=2, help="Minibatch size")
133 | parser.add_argument("--hidden_dim", type=int, default=64, help="The number of neurons in hidden layers of the neural network")
134 | parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate of actor")
135 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
136 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter")
137 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter")
138 | parser.add_argument("--K_epochs", type=int, default=15, help="PPO parameter")
139 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization")
140 | parser.add_argument("--use_state_norm", type=bool, default=False, help="Trick 2:state normalization")
141 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling")
142 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy")
143 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay")
144 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip")
145 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization")
146 | parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5")
147 | parser.add_argument("--use_tanh", type=float, default=False, help="Trick 10: tanh activation function")
148 | parser.add_argument("--use_gru", type=bool, default=True, help="Whether to use GRU")
149 |
150 | args = parser.parse_args()
151 |
152 | env_names = ['CartPole-v1', 'LunarLander-v2']
153 | env_index = 0
154 | for seed in [0, 10, 100]:
155 | runner = Runner(args, env_name=env_names[env_index], number=3, seed=seed)
156 | runner.run()
157 |
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/README.md:
--------------------------------------------------------------------------------
1 | # PPO-discrete + RNN
2 | This is a concise PyTorch implementation of PPO+RNN (GRU/LSTM) for discrete action spaces.
3 |
4 |
5 | ## How to use my code?
6 | You can directly run 'PPO_discrete_rnn_main.py' in your own IDE.
7 |
8 | ## Training environments
9 | You can set 'env_index' in the code to change the environment. Here, we train in 2 environments (see the snippet below).
10 | env_index=0 represents 'CartPole-v1'
11 | env_index=1 represents 'LunarLander-v2'
12 |
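For reference, the selection happens in the last few lines of 'PPO_discrete_rnn_main.py'; running that file trains on the selected environment for seeds 0, 10 and 100. The excerpt below shows those lines with env_index=1 as an example of switching to 'LunarLander-v2':

```python
env_names = ['CartPole-v1', 'LunarLander-v2']
env_index = 1  # 0: 'CartPole-v1', 1: 'LunarLander-v2'
for seed in [0, 10, 100]:
    runner = Runner(args, env_name=env_names[env_index], number=3, seed=seed)
    runner.run()
```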
13 | ## Training result
14 | 
15 |
16 |
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/__pycache__/normalization.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/normalization.cpython-37.pyc
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/__pycache__/ppo_discrete_rnn.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/ppo_discrete_rnn.cpython-37.pyc
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/__pycache__/replaybuffer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/replaybuffer.cpython-37.pyc
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_0.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_10.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_100.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_0.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_10.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_100.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_0.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_10.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_100.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_0.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_10.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_10.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_100.npy
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/normalization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class RunningMeanStd:
5 | # Dynamically calculate mean and std
6 | def __init__(self, shape): # shape:the dimension of input data
7 | self.n = 0
8 | self.mean = np.zeros(shape)
9 | self.S = np.zeros(shape)
10 | self.std = np.sqrt(self.S)
11 |
12 | def update(self, x):
13 | x = np.array(x)
14 | self.n += 1
15 | if self.n == 1:
16 | self.mean = x
17 | self.std = x
18 | else:
19 | old_mean = self.mean.copy()
20 | self.mean = old_mean + (x - old_mean) / self.n
21 | self.S = self.S + (x - old_mean) * (x - self.mean)
22 | self.std = np.sqrt(self.S / self.n)
23 |
24 |
25 | class Normalization:
26 | def __init__(self, shape):
27 | self.running_ms = RunningMeanStd(shape=shape)
28 |
29 | def __call__(self, x, update=True):
30 | # Whether to update the mean and std; during evaluation, update=False
31 | if update:
32 | self.running_ms.update(x)
33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8)
34 |
35 | return x
36 |
37 |
38 | class RewardScaling:
39 | def __init__(self, shape, gamma):
40 | self.shape = shape # reward shape=1
41 | self.gamma = gamma # discount factor
42 | self.running_ms = RunningMeanStd(shape=self.shape)
43 | self.R = np.zeros(self.shape)
44 |
45 | def __call__(self, x):
46 | self.R = self.gamma * self.R + x
47 | self.running_ms.update(self.R)
48 | x = x / (self.running_ms.std + 1e-8) # Only divide by the std (do not subtract the mean)
49 | return x
50 |
51 | def reset(self): # When an episode is done, we should reset 'self.R'
52 | self.R = np.zeros(self.shape)
53 |
54 |
55 |
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/ppo_discrete_rnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler, SequentialSampler
6 | from torch.distributions import Categorical
7 | import copy
8 |
9 |
10 | # Trick 8: orthogonal initialization
11 | def orthogonal_init(layer, gain=np.sqrt(2)):
12 | for name, param in layer.named_parameters():
13 | if 'bias' in name:
14 | nn.init.constant_(param, 0)
15 | elif 'weight' in name:
16 | nn.init.orthogonal_(param, gain=gain)
17 |
18 | return layer
19 |
20 |
21 | class Actor_Critic_RNN(nn.Module):
22 | def __init__(self, args):
23 | super(Actor_Critic_RNN, self).__init__()
24 | self.use_gru = args.use_gru
25 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh
26 |
27 | self.actor_rnn_hidden = None
28 | self.actor_fc1 = nn.Linear(args.state_dim, args.hidden_dim)
29 | if args.use_gru:
30 | print("------use GRU------")
31 | self.actor_rnn = nn.GRU(args.hidden_dim, args.hidden_dim, batch_first=True)
32 | else:
33 | print("------use LSTM------")
34 | self.actor_rnn = nn.LSTM(args.hidden_dim, args.hidden_dim, batch_first=True)
35 | self.actor_fc2 = nn.Linear(args.hidden_dim, args.action_dim)
36 |
37 | self.critic_rnn_hidden = None
38 | self.critic_fc1 = nn.Linear(args.state_dim, args.hidden_dim)
39 | if args.use_gru:
40 | self.critic_rnn = nn.GRU(args.hidden_dim, args.hidden_dim, batch_first=True)
41 | else:
42 | self.critic_rnn = nn.LSTM(args.hidden_dim, args.hidden_dim, batch_first=True)
43 | self.critic_fc2 = nn.Linear(args.hidden_dim, 1)
44 |
45 | if args.use_orthogonal_init:
46 | print("------use orthogonal init------")
47 | orthogonal_init(self.actor_fc1)
48 | orthogonal_init(self.actor_rnn)
49 | orthogonal_init(self.actor_fc2, gain=0.01)
50 | orthogonal_init(self.critic_fc1)
51 | orthogonal_init(self.critic_rnn)
52 | orthogonal_init(self.critic_fc2)
53 |
54 | def actor(self, s):
55 | s = self.activate_func(self.actor_fc1(s))
56 | output, self.actor_rnn_hidden = self.actor_rnn(s, self.actor_rnn_hidden)
57 | logit = self.actor_fc2(output)
58 | return logit
59 |
60 | def critic(self, s):
61 | s = self.activate_func(self.critic_fc1(s))
62 | output, self.critic_rnn_hidden = self.critic_rnn(s, self.critic_rnn_hidden)
63 | value = self.critic_fc2(output)
64 | return value
65 |
66 |
67 | class PPO_discrete_RNN:
68 | def __init__(self, args):
69 | self.batch_size = args.batch_size
70 | self.mini_batch_size = args.mini_batch_size
71 | self.max_train_steps = args.max_train_steps
72 | self.lr = args.lr # Learning rate of actor
73 | self.gamma = args.gamma # Discount factor
74 | self.lamda = args.lamda # GAE parameter
75 | self.epsilon = args.epsilon # PPO clip parameter
76 | self.K_epochs = args.K_epochs # PPO parameter
77 | self.entropy_coef = args.entropy_coef # Entropy coefficient
78 | self.set_adam_eps = args.set_adam_eps
79 | self.use_grad_clip = args.use_grad_clip
80 | self.use_lr_decay = args.use_lr_decay
81 | self.use_adv_norm = args.use_adv_norm
82 |
83 | self.ac = Actor_Critic_RNN(args)
84 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5
85 | self.optimizer = torch.optim.Adam(self.ac.parameters(), lr=self.lr, eps=1e-5)
86 | else:
87 | self.optimizer = torch.optim.Adam(self.ac.parameters(), lr=self.lr)
88 |
89 | def reset_rnn_hidden(self):
90 | self.ac.actor_rnn_hidden = None
91 | self.ac.critic_rnn_hidden = None
92 |
93 | def choose_action(self, s, evaluate=False):
94 | with torch.no_grad():
95 | s = torch.tensor(s, dtype=torch.float).unsqueeze(0)
96 | logit = self.ac.actor(s)
97 | if evaluate:
98 | a = torch.argmax(logit)
99 | return a.item(), None
100 | else:
101 | dist = Categorical(logits=logit)
102 | a = dist.sample()
103 | a_logprob = dist.log_prob(a)
104 | return a.item(), a_logprob.item()
105 |
106 | def get_value(self, s):
107 | with torch.no_grad():
108 | s = torch.tensor(s, dtype=torch.float).unsqueeze(0)
109 | value = self.ac.critic(s)
110 | return value.item()
111 |
112 | def train(self, replay_buffer, total_steps):
113 | batch = replay_buffer.get_training_data() # Get training data
114 |
115 | # Optimize policy for K epochs:
116 | for _ in range(self.K_epochs):
117 | for index in BatchSampler(SequentialSampler(range(self.batch_size)), self.mini_batch_size, False):
118 | # When using an RNN, we need to reset the rnn_hidden of the actor and critic before each mini-batch forward pass.
119 | self.reset_rnn_hidden()
120 | logits_now = self.ac.actor(batch['s'][index]) # logits_now.shape=(mini_batch_size, max_episode_len, action_dim)
121 | values_now = self.ac.critic(batch['s'][index]).squeeze(-1) # values_now.shape=(mini_batch_size, max_episode_len)
122 |
123 | dist_now = Categorical(logits=logits_now)
124 | dist_entropy = dist_now.entropy() # shape(mini_batch_size, max_episode_len)
125 | a_logprob_now = dist_now.log_prob(batch['a'][index]) # shape(mini_batch_size, max_episode_len)
126 | # a/b=exp(log(a)-log(b))
127 | ratios = torch.exp(a_logprob_now - batch['a_logprob'][index]) # shape(mini_batch_size, max_episode_len)
128 |
129 | # actor loss
130 | surr1 = ratios * batch['adv'][index]
131 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * batch['adv'][index]
132 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # shape(mini_batch_size, max_episode_len)
133 | actor_loss = (actor_loss * batch['active'][index]).sum() / batch['active'][index].sum()
134 |
135 | # critic_loss
136 | critic_loss = (values_now - batch['v_target'][index]) ** 2
137 | critic_loss = (critic_loss * batch['active'][index]).sum() / batch['active'][index].sum()
138 |
139 | # Update
140 | self.optimizer.zero_grad()
141 | loss = actor_loss + critic_loss * 0.5
142 | loss.backward()
143 | if self.use_grad_clip: # Trick 7: Gradient clip
144 | torch.nn.utils.clip_grad_norm_(self.ac.parameters(), 0.5)
145 | self.optimizer.step()
146 |
147 | if self.use_lr_decay: # Trick 6:learning rate Decay
148 | self.lr_decay(total_steps)
149 |
150 | def lr_decay(self, total_steps):
151 | lr_now = 0.9 * self.lr * (1 - total_steps / self.max_train_steps) + 0.1 * self.lr
152 | for p in self.optimizer.param_groups:
153 | p['lr'] = lr_now
154 |
155 | def save_model(self, env_name, number, seed, total_steps):
156 | torch.save(self.ac.state_dict(), "./model/PPO_actor_env_{}_number_{}_seed_{}_step_{}k.pth".format(env_name, number, seed, int(total_steps / 1000)))
157 |
158 | def load_model(self, env_name, number, seed, step):
159 | self.ac.load_state_dict(torch.load("./model/PPO_actor_env_{}_number_{}_seed_{}_step_{}k.pth".format(env_name, number, seed, step)))
160 |
161 |
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/replaybuffer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import copy
4 |
5 |
6 | class ReplayBuffer:
7 | def __init__(self, args):
8 | self.gamma = args.gamma
9 | self.lamda = args.lamda
10 | self.use_adv_norm = args.use_adv_norm
11 | self.state_dim = args.state_dim
12 | self.action_dim = args.action_dim
13 | self.episode_limit = args.episode_limit
14 | self.batch_size = args.batch_size
15 | self.episode_num = 0
16 | self.max_episode_len = 0
17 | self.buffer = None
18 | self.reset_buffer()
19 |
20 | def reset_buffer(self):
21 | self.buffer = {'s': np.zeros([self.batch_size, self.episode_limit, self.state_dim]),
22 | 'v': np.zeros([self.batch_size, self.episode_limit + 1]),
23 | 'a': np.zeros([self.batch_size, self.episode_limit]),
24 | 'a_logprob': np.zeros([self.batch_size, self.episode_limit]),
25 | 'r': np.zeros([self.batch_size, self.episode_limit]),
26 | 'dw': np.ones([self.batch_size, self.episode_limit]), # Note: We use 'np.ones' to initialize 'dw'
27 | 'active': np.zeros([self.batch_size, self.episode_limit])
28 | }
29 | self.episode_num = 0
30 | self.max_episode_len = 0
31 |
32 | def store_transition(self, episode_step, s, v, a, a_logprob, r, dw):
33 | self.buffer['s'][self.episode_num][episode_step] = s
34 | self.buffer['v'][self.episode_num][episode_step] = v
35 | self.buffer['a'][self.episode_num][episode_step] = a
36 | self.buffer['a_logprob'][self.episode_num][episode_step] = a_logprob
37 | self.buffer['r'][self.episode_num][episode_step] = r
38 | self.buffer['dw'][self.episode_num][episode_step] = dw
39 |
40 | self.buffer['active'][self.episode_num][episode_step] = 1.0
41 |
42 | def store_last_value(self, episode_step, v):
43 | self.buffer['v'][self.episode_num][episode_step] = v
44 | self.episode_num += 1
45 | # Record max_episode_len
46 | if episode_step > self.max_episode_len:
47 | self.max_episode_len = episode_step
48 |
49 | def get_adv(self):
50 | # Calculate the advantage using GAE
51 | v = self.buffer['v'][:, :self.max_episode_len]
52 | v_next = self.buffer['v'][:, 1:self.max_episode_len + 1]
53 | r = self.buffer['r'][:, :self.max_episode_len]
54 | dw = self.buffer['dw'][:, :self.max_episode_len]
55 | active = self.buffer['active'][:, :self.max_episode_len]
56 | adv = np.zeros_like(r) # adv.shape=(batch_size,max_episode_len)
57 | gae = 0
58 | with torch.no_grad(): # adv and v_target have no gradient
59 | # deltas.shape=(batch_size,max_episode_len)
60 | deltas = r + self.gamma * v_next * (1 - dw) - v
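# GAE: accumulate gae = delta_t + gamma * lambda * gae backwards through time, so adv[:, t] is the generalized advantage estimate at step t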
61 | for t in reversed(range(self.max_episode_len)):
62 | gae = deltas[:, t] + self.gamma * self.lamda * gae # gae.shape=(batch_size)
63 | adv[:, t] = gae
64 | v_target = adv + v # v_target.shape(batch_size,max_episode_len)
65 | if self.use_adv_norm: # Trick 1:advantage normalization
66 | adv_copy = copy.deepcopy(adv)
67 | adv_copy[active == 0] = np.nan # Ignore the adv entries where active=0
68 | adv = ((adv - np.nanmean(adv_copy)) / (np.nanstd(adv_copy) + 1e-5))
69 | return adv, v_target
70 |
71 | def get_training_data(self):
72 | adv, v_target = self.get_adv()
73 | batch = {'s': torch.tensor(self.buffer['s'][:, :self.max_episode_len], dtype=torch.float32),
74 | 'a': torch.tensor(self.buffer['a'][:, :self.max_episode_len], dtype=torch.long), # The dtype of action 'a' must be torch.long
75 | 'a_logprob': torch.tensor(self.buffer['a_logprob'][:, :self.max_episode_len], dtype=torch.float32),
76 | 'active': torch.tensor(self.buffer['active'][:, :self.max_episode_len], dtype=torch.float32),
77 | 'adv': torch.tensor(adv, dtype=torch.float32),
78 | 'v_target': torch.tensor(v_target, dtype=torch.float32)}
79 |
80 | return batch
81 |
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_0/events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_0/events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_10/events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_10/events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_100/events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_100/events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_0/events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_0/events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_10/events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_10/events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_100/events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_100/events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_0/events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_0/events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_10/events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_10/events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_100/events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_100/events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_0/events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_0/events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_10/events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_10/events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0
--------------------------------------------------------------------------------
/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_100/events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_100/events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Lizhi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DRL-code-pytorch
2 | Concise PyTorch implementations of DRL algorithms, including REINFORCE, A2C, Rainbow DQN, PPO (discrete and continuous), DDPG, TD3, SAC, and PPO-discrete-RNN (LSTM/GRU).
3 |
4 |
5 | # Dependencies
6 | python==3.7.9
7 | numpy==1.19.4
8 | pytorch==1.12.0
9 | tensorboard==0.6.0
10 | gym==0.21.0
11 |
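To quickly check that the installed versions match the ones pinned above, a small sanity-check snippet (it only covers the importable packages listed here):

```python
# Print the versions of the main dependencies; compare against the pinned versions above.
import sys
import numpy
import torch
import gym

print("python  ", sys.version.split()[0])
print("numpy   ", numpy.__version__)
print("pytorch ", torch.__version__)
print("gym     ", gym.__version__)
```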
--------------------------------------------------------------------------------