├── .idea
│   ├── RL.iml
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── Continuous_action
│   ├── DDPG.py
│   ├── PPO+GAE.py
│   ├── PPO.py
│   ├── PPO_continuous_BipedalWalker-v3.pth
│   ├── SAC.py
│   └── TD3.py
├── Discrete_action
│   ├── Actor_Critic.py
│   ├── CnnDQN.py
│   ├── D3QN.py
│   ├── D3QN2.py
│   ├── DDQN.py
│   ├── DQN.py
│   ├── Noise DQN.py
│   ├── __pycache__
│   │   ├── multiprocessing_env.cpython-37.pyc
│   │   └── replay_buffer.cpython-37.pyc
│   ├── multiprocessing_env.py
│   ├── replay_buffer.py
│   └── wrappers.py
└── README.md

/.idea/ (RL.iml, deployment.xml, inspectionProfiles/profiles_settings.xml, misc.xml, modules.xml, vcs.xml, workspace.xml): PyCharm project configuration files with no source content.
--------------------------------------------------------------------------------
/Continuous_action/DDPG.py:
--------------------------------------------------------------------------------
1 | """ 2 | Created on Feb 28 2021 3 | @author: wangmeng 4 | """ 5 | import random 6 | import gym 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import torch.nn.functional as F 12 | from torch.distributions import Normal 13 | import matplotlib.pyplot as plt 14 | 15 | use_cuda = torch.cuda.is_available() 16 | device = torch.device("cuda" if use_cuda else "cpu") 17 | 18 | class ValueNetwork(nn.Module): 19 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3): 20 | super(ValueNetwork, self).__init__() 21 | 22 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) 23 |
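# The critic scores (state, action) pairs: linear1 consumes the concatenated state-action vector (hence num_inputs + num_actions), and linear3 defined next maps the hidden features to a single Q-value.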
self.linear2 = nn.Linear(hidden_size, hidden_size) 24 | self.linear3 = nn.Linear(hidden_size, 1) 25 | 26 | self.linear3.weight.data.uniform_(-init_w,init_w) 27 | self.linear3.bias.data.uniform_(-init_w,init_w) 28 | 29 | def forward(self, state, action): 30 | x = torch.cat([state, action], 1) 31 | x = F.relu(self.linear1(x)) 32 | x = F.relu(self.linear2(x)) 33 | x = self.linear3(x) 34 | return x 35 | 36 | class PolicyNetwork(nn.Module): 37 | def __init__(self, num_inputs, num_actions, hidden_size, init_w = 3e-3): 38 | super(PolicyNetwork, self).__init__() 39 | 40 | self.linear1 = nn.Linear(num_inputs, hidden_size) 41 | self.linear2 = nn.Linear(hidden_size, hidden_size) 42 | self.linear3 = nn.Linear(hidden_size, num_actions) 43 | 44 | # uniform_将tensor用从均匀分布中抽样得到的值填充。参数初始化 45 | self.linear3.weight.data.uniform_(-init_w, init_w) 46 | #也用用normal_(0, 0.1) 来初始化的,高斯分布中抽样填充,这两种都是比较有效的初始化方式 47 | self.linear3.bias.data.uniform_(-init_w, init_w) 48 | #其意义在于我们尽可能保持 每个神经元的输入和输出的方差一致。 49 | #使用 RELU(without BN) 激活函数时,最好选用 He 初始化方法,将参数初始化为服从高斯分布或者均匀分布的较小随机数 50 | #使用 BN 时,减少了网络对参数初始值尺度的依赖,此时使用较小的标准差(eg:0.01)进行初始化即可 51 | 52 | #但是注意DRL中不建议使用BN 53 | 54 | def forward(self, x): 55 | x = F.relu(self.linear1(x)) 56 | x = F.relu(self.linear2(x)) 57 | x = F.tanh(self.linear3(x)) 58 | return x 59 | 60 | def get_action(self, state): 61 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 62 | action = self.forward(state) 63 | return action.detach().cpu().numpy()[0,0] 64 | 65 | class OUNoise(object): 66 | def __init__(self, action_space, mu=0.0, theta = 0.15, max_sigma = 0.3, min_sigma = 0.3, decay_period = 100000):#decay_period要根据迭代次数合理设置 67 | self.mu = mu 68 | self.theta = theta 69 | self.sigma = max_sigma 70 | self.max_sigma = max_sigma 71 | self.min_sigma = min_sigma 72 | self.decay_period = decay_period 73 | self.action_dim = action_space.shape[0] 74 | self.low = action_space.low 75 | self.high = action_space.high 76 | self.reset() 77 | 78 | def reset(self): 79 | self.state = np.ones(self.action_dim) *self.mu 80 | 81 | def evolve_state(self): 82 | x = self.state 83 | dx = self.theta* (self.mu - x) + self.sigma * np.random.randn(self.action_dim) 84 | self.state = x + dx 85 | return self.state 86 | 87 | def get_action(self, action, t=0): 88 | ou_state = self.evolve_state() 89 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) 90 | return np.clip(action + ou_state, self.low, self.high) 91 | 92 | 93 | class ReplayBuffer: 94 | def __init__(self, capacity): 95 | self.capacity = capacity 96 | self.buffer = [] 97 | self.position = 0 98 | 99 | def push(self, state, action, reward, next_state, done): 100 | if len(self.buffer) < self.capacity: 101 | self.buffer.append(None) 102 | self.buffer[self.position] = (state, action, reward, next_state, done) 103 | self.position = (self.position + 1) % self.capacity 104 | 105 | def sample(self, batch_size): 106 | batch = random.sample(self.buffer, batch_size) 107 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 108 | return state, action, reward, next_state, done 109 | 110 | def __len__(self): 111 | return len(self.buffer) 112 | 113 | 114 | class NormalizedActions(gym.ActionWrapper): 115 | 116 | def action(self, action): 117 | low_bound = self.action_space.low 118 | upper_bound = self.action_space.high 119 | 120 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) 121 | #将经过tanh输出的值重新映射回环境的真实值内 122 | action = np.clip(action, low_bound, upper_bound) 123 | 124 | return action 125 | 126 | def 
reverse_action(self, action): 127 | low_bound = self.action_space.low 128 | upper_bound = self.action_space.high 129 | 130 | #因为激活函数使用的是tanh,这里将环境输出的动作正则化到(-1,1) 131 | 132 | action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1 133 | action = np.clip(action, low_bound, upper_bound) 134 | 135 | return action 136 | 137 | class DDPG(object): 138 | def __init__(self, action_dim, state_dim, hidden_dim): 139 | super(DDPG,self).__init__() 140 | self.action_dim, self.state_dim, self.hidden_dim = action_dim, state_dim, hidden_dim 141 | self.batch_size = 128 142 | self.gamma = 0.99 143 | self.min_value = -np.inf 144 | self.max_value = np.inf 145 | self.soft_tau = 1e-2 146 | self.replay_buffer_size = 5000 147 | self.value_lr = 1e-3 148 | self.policy_lr = 1e-4 149 | 150 | self.value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device) 151 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device) 152 | 153 | self.target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device) 154 | self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device) 155 | 156 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 157 | target_param.data.copy_(param.data) 158 | 159 | for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()): 160 | target_param.data.copy_(param.data) 161 | 162 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr) 163 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr) 164 | 165 | self.value_criterion = nn.MSELoss() 166 | 167 | self.replay_buffer = ReplayBuffer(self.replay_buffer_size) 168 | 169 | def ddpg_update(self): 170 | state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size) 171 | 172 | state = torch.FloatTensor(state).to(device) 173 | next_state = torch.FloatTensor(next_state).to(device) 174 | action = torch.FloatTensor(action).to(device) 175 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device) 176 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) 177 | 178 | policy_loss = self.value_net(state, self.policy_net(state)) 179 | policy_loss = -policy_loss.mean() 180 | 181 | next_action = self.target_policy_net(next_state) 182 | target_value = self.target_value_net(next_state, next_action.detach()) 183 | expected_value = reward + (1.0 - done) * self.gamma * target_value 184 | expected_value = torch.clamp(expected_value, self.min_value, self.max_value) 185 | 186 | value = self.value_net(state, action) 187 | value_loss = self.value_criterion(value, expected_value.detach()) 188 | 189 | self.policy_optimizer.zero_grad() 190 | policy_loss.backward() 191 | self.policy_optimizer.step() 192 | 193 | self.value_optimizer.zero_grad() 194 | value_loss.backward() 195 | self.value_optimizer.step() 196 | 197 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 198 | target_param.data.copy_( 199 | target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau 200 | ) 201 | 202 | for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()): 203 | target_param.data.copy_( 204 | target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau 205 | ) 206 | 207 | def plot(frame_idx, rewards): 208 | plt.figure(figsize=(20,5)) 209 | plt.subplot(131) 210 | plt.title('frame %s. 
reward: %s' % (frame_idx, rewards[-1])) 211 | plt.plot(rewards) 212 | plt.show() 213 | 214 | 215 | def main(): 216 | env = gym.make("Pendulum-v0") 217 | env = NormalizedActions(env) 218 | 219 | ou_noise = OUNoise(env.action_space) 220 | 221 | state_dim = env.observation_space.shape[0] 222 | action_dim = env.action_space.shape[0] 223 | hidden_dim = 256 224 | 225 | ddpg = DDPG(action_dim, state_dim, hidden_dim) 226 | 227 | max_frames = 12000 228 | max_steps = 500 229 | frame_idx = 0 230 | rewards = [] 231 | batch_size = 128 232 | 233 | while frame_idx < max_frames: 234 | state = env.reset() 235 | ou_noise.reset() 236 | episode_reward = 0 237 | 238 | for step in range(max_steps): 239 | env.render() 240 | action = ddpg.policy_net.get_action(state) 241 | action = ou_noise.get_action(action, step) 242 | next_state, reward, done, _ = env.step(action) 243 | 244 | ddpg.replay_buffer.push(state, action, reward, next_state, done) 245 | if len(ddpg.replay_buffer) > batch_size: 246 | ddpg.ddpg_update() 247 | 248 | state = next_state 249 | episode_reward += reward 250 | frame_idx += 1 251 | 252 | if frame_idx % max(1000, max_steps + 1) == 0: 253 | plot(frame_idx, rewards) 254 | 255 | if done: 256 | break 257 | 258 | rewards.append(episode_reward) 259 | env.close() 260 | 261 | if __name__ == '__main__': 262 | main() -------------------------------------------------------------------------------- /Continuous_action/PPO+GAE.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | import gym 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from torch.distributions import Normal 12 | from multiprocessing_env import SubprocVecEnv 13 | import matplotlib.pyplot as plt 14 | 15 | use_cuda = torch.cuda.is_available() 16 | device = torch.device("cuda" if use_cuda else "cpu") 17 | 18 | num_envs = 16 19 | env_name = "Pendulum-v0" 20 | 21 | def make_env(): 22 | def _thunk(): 23 | env = gym.make(env_name) 24 | return env 25 | 26 | return _thunk 27 | 28 | envs = [make_env() for i in range(num_envs)] 29 | envs = SubprocVecEnv(envs) 30 | 31 | env = gym.make(env_name) 32 | 33 | 34 | def init_weights(m): 35 | if isinstance(m, nn.Linear): 36 | nn.init.normal_(m.weight, mean=0., std=0.1) 37 | nn.init.constant_(m.bias, 0.1) 38 | 39 | 40 | class ActorCritic(nn.Module): 41 | def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0): 42 | super(ActorCritic, self).__init__() 43 | 44 | self.critic = nn.Sequential( 45 | nn.Linear(num_inputs, hidden_size), 46 | nn.ReLU(), 47 | nn.Linear(hidden_size, 1) 48 | ) 49 | 50 | self.actor = nn.Sequential( 51 | nn.Linear(num_inputs, hidden_size), 52 | nn.ReLU(), 53 | nn.Linear(hidden_size, num_outputs), 54 | ) 55 | self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std) 56 | 57 | self.apply(init_weights) 58 | 59 | def forward(self, x): 60 | value = self.critic(x) 61 | mu = self.actor(x) 62 | std = self.log_std.exp().expand_as(mu) 63 | dist = Normal(mu, std) 64 | return dist, value 65 | 66 | def plot(frame_idx, rewards): 67 | plt.figure(figsize=(20,5)) 68 | plt.subplot(131) 69 | plt.title('frame %s. 
reward: %s' % (frame_idx, rewards[-1])) 70 | plt.plot(rewards) 71 | plt.show() 72 | 73 | 74 | def test_env(vis=False): 75 | state = env.reset() 76 | if vis: env.render() 77 | done = False 78 | total_reward = 0 79 | while not done: 80 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 81 | dist, _ = model(state) 82 | next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0]) 83 | state = next_state 84 | if vis: env.render() 85 | total_reward += reward 86 | return total_reward 87 | 88 | def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95): 89 | values = values + [next_value] 90 | gae = 0 91 | returns = [] 92 | for step in reversed(range(len(rewards))): 93 | delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step] 94 | gae = delta + gamma * tau * masks[step] * gae 95 | returns.insert(0, gae + values[step]) 96 | return returns 97 | 98 | 99 | def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage): 100 | batch_size = states.size(0) 101 | for _ in range(batch_size // mini_batch_size): 102 | rand_ids = np.random.randint(0, batch_size, mini_batch_size) 103 | yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[ 104 | rand_ids, :] 105 | 106 | 107 | def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2): 108 | for _ in range(ppo_epochs): 109 | for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, 110 | returns, advantages): 111 | dist, value = model(state) 112 | entropy = dist.entropy().mean() 113 | new_log_probs = dist.log_prob(action) 114 | 115 | ratio = (new_log_probs - old_log_probs).exp() 116 | surr1 = ratio * advantage 117 | surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage 118 | 119 | actor_loss = - torch.min(surr1, surr2).mean() 120 | critic_loss = (return_ - value).pow(2).mean() 121 | 122 | loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy 123 | 124 | optimizer.zero_grad() 125 | loss.backward() 126 | optimizer.step() 127 | 128 | num_inputs = envs.observation_space.shape[0] 129 | num_outputs = envs.action_space.shape[0] 130 | 131 | #Hyper params: 132 | hidden_size = 256 133 | lr = 3e-4 134 | num_steps = 20 135 | mini_batch_size = 5 136 | ppo_epochs = 4 137 | threshold_reward = -200 138 | 139 | model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device) 140 | optimizer = optim.Adam(model.parameters(), lr=lr) 141 | 142 | max_frames = 15000 143 | frame_idx = 0 144 | test_rewards = [] 145 | 146 | state = envs.reset() 147 | early_stop = False 148 | 149 | while frame_idx < max_frames and not early_stop: 150 | 151 | log_probs = [] 152 | values = [] 153 | states = [] 154 | actions = [] 155 | rewards = [] 156 | masks = [] 157 | entropy = 0 158 | 159 | for _ in range(num_steps): 160 | state = torch.FloatTensor(state).to(device) 161 | dist, value = model(state) 162 | 163 | action = dist.sample() 164 | next_state, reward, done, _ = envs.step(action.cpu().numpy()) 165 | 166 | log_prob = dist.log_prob(action) 167 | entropy += dist.entropy().mean() 168 | 169 | log_probs.append(log_prob) 170 | values.append(value) 171 | rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) 172 | masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) 173 | 174 | states.append(state) 175 | actions.append(action) 176 | 177 | state = next_state 178 | frame_idx += 1 179 | 180 | if frame_idx % 1000 == 0: 181 | test_reward = 
np.mean([test_env() for _ in range(10)]) 182 | test_rewards.append(test_reward) 183 | plot(frame_idx, test_rewards) 184 | if test_reward > threshold_reward: early_stop = True 185 | 186 | next_state = torch.FloatTensor(next_state).to(device) 187 | _, next_value = model(next_state) 188 | returns = compute_gae(next_value, rewards, masks, values) 189 | 190 | returns = torch.cat(returns).detach() 191 | log_probs = torch.cat(log_probs).detach() 192 | values = torch.cat(values).detach() 193 | states = torch.cat(states) 194 | actions = torch.cat(actions) 195 | advantage = returns - values 196 | 197 | ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage) -------------------------------------------------------------------------------- /Continuous_action/PPO.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mar 1 2021 3 | @author: wangmeng 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions import MultivariateNormal 8 | import gym 9 | import numpy as np 10 | 11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 12 | 13 | 14 | class Memory: 15 | def __init__(self): 16 | self.actions = [] 17 | self.states = [] 18 | self.logprobs = [] 19 | self.rewards = [] 20 | self.is_terminals = [] 21 | 22 | def clear_memory(self): 23 | # del语句作用在变量上,而不是数据对象上。删除的是变量,而不是数据。 24 | del self.actions[:] 25 | del self.states[:] 26 | del self.logprobs[:] 27 | del self.rewards[:] 28 | del self.is_terminals[:] 29 | 30 | 31 | class ActorCritic(nn.Module): 32 | def __init__(self, state_dim, action_dim, action_std): 33 | super(ActorCritic, self).__init__() 34 | # action mean range -1 to 1 35 | self.actor = nn.Sequential( 36 | nn.Linear(state_dim, 64), 37 | nn.Tanh(), 38 | nn.Linear(64, 32), 39 | nn.Tanh(), 40 | nn.Linear(32, action_dim), 41 | nn.Tanh() 42 | ) 43 | # critic 44 | self.critic = nn.Sequential( 45 | nn.Linear(state_dim, 64), 46 | nn.Tanh(), 47 | nn.Linear(64, 32), 48 | nn.Tanh(), 49 | nn.Linear(32, 1) 50 | ) 51 | # 方差 52 | self.action_var = torch.full((action_dim,), action_std * action_std).to(device) 53 | 54 | def forward(self): 55 | # 手动设置异常 56 | raise NotImplementedError 57 | 58 | def act(self, state, memory): 59 | action_mean = self.actor(state) 60 | cov_mat = torch.diag(self.action_var).to(device) 61 | 62 | dist = MultivariateNormal(action_mean, cov_mat) 63 | action = dist.sample() 64 | action_logprob = dist.log_prob(action) 65 | 66 | memory.states.append(state) 67 | memory.actions.append(action) 68 | memory.logprobs.append(action_logprob) 69 | 70 | return action.detach() 71 | 72 | def evaluate(self, state, action): 73 | action_mean = self.actor(state) 74 | 75 | action_var = self.action_var.expand_as(action_mean) 76 | # torch.diag_embed(input, offset=0, dim1=-2, dim2=-1) → Tensor 77 | # Creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) are filled by input 78 | cov_mat = torch.diag_embed(action_var).to(device) 79 | # 生成一个多元高斯分布矩阵 80 | dist = MultivariateNormal(action_mean, cov_mat) 81 | # 我们的目的是要用这个随机的去逼近真正的选择动作action的高斯分布 82 | action_logprobs = dist.log_prob(action) 83 | # log_prob 是action在前面那个正太分布的概率的log ,我们相信action是对的 , 84 | # 那么我们要求的正态分布曲线中点应该在action这里,所以最大化正太分布的概率的log, 改变mu,sigma得出一条中心点更加在a的正太分布。 85 | dist_entropy = dist.entropy() 86 | state_value = self.critic(state) 87 | 88 | return action_logprobs, torch.squeeze(state_value), dist_entropy 89 | 90 | 91 | class PPO: 92 | def __init__(self, state_dim, action_dim, action_std, lr, betas, 
gamma, K_epochs, eps_clip): 93 | self.lr = lr 94 | self.betas = betas 95 | self.gamma = gamma 96 | self.eps_clip = eps_clip 97 | self.K_epochs = K_epochs 98 | 99 | self.policy = ActorCritic(state_dim, action_dim, action_std).to(device) 100 | self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas) 101 | 102 | self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device) 103 | self.policy_old.load_state_dict(self.policy.state_dict()) 104 | 105 | self.MseLoss = nn.MSELoss() 106 | 107 | def select_action(self, state, memory): 108 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 109 | return self.policy_old.act(state, memory).cpu().data.numpy().flatten() 110 | 111 | def update(self, memory): 112 | # Monte Carlo estimate of rewards: 113 | rewards = [] 114 | discounted_reward = 0 115 | for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)): 116 | if is_terminal: 117 | discounted_reward = 0 118 | discounted_reward = reward + (self.gamma * discounted_reward) 119 | rewards.insert(0, discounted_reward) 120 | 121 | # Normalizing the rewards: 122 | rewards = torch.tensor(rewards, dtype=torch.float32).to(device) 123 | rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5) 124 | 125 | # convert list to tensor 126 | # 使用stack可以保留两个信息:[1. 序列] 和 [2. 张量矩阵] 信息,属于【扩张再拼接】的函数; 127 | old_states = torch.squeeze(torch.stack(memory.states).to(device), 1).detach() 128 | old_actions = torch.squeeze(torch.stack(memory.actions).to(device), 1).detach() 129 | old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(device).detach() 130 | 131 | # Optimize policy for K epochs: 132 | for _ in range(self.K_epochs): 133 | # Evaluating old actions and values : 134 | logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions) 135 | 136 | # Finding the ratio (pi_theta / pi_theta__old): 137 | ratios = torch.exp(logprobs - old_logprobs.detach()) 138 | 139 | # Finding Surrogate Loss: 140 | advantages = rewards - state_values.detach() 141 | surr1 = ratios * advantages 142 | surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages 143 | loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy 144 | 145 | # take gradient step 146 | self.optimizer.zero_grad() 147 | loss.mean().backward() 148 | self.optimizer.step() 149 | 150 | # Copy new weights into old policy: 151 | self.policy_old.load_state_dict(self.policy.state_dict()) 152 | 153 | 154 | def main(): 155 | ############## Hyperparameters ############## 156 | env_name = "BipedalWalker-v3" 157 | render = False 158 | solved_reward = 300 # stop training if avg_reward > solved_reward 159 | log_interval = 20 # print avg reward in the interval 160 | max_episodes = 10000 # max training episodes 161 | max_timesteps = 1500 # max timesteps in one episode 162 | 163 | update_timestep = 4000 # update policy every n timesteps 164 | action_std = 0.5 # constant std for action distribution (Multivariate Normal) 165 | K_epochs = 80 # update policy for K epochs 166 | eps_clip = 0.2 # clip parameter for PPO 167 | gamma = 0.99 # discount factor 168 | 169 | lr = 0.0003 # parameters for Adam optimizer 170 | betas = (0.9, 0.999) 171 | 172 | random_seed = None 173 | ############################################# 174 | 175 | # creating environment 176 | env = gym.make(env_name) 177 | state_dim = env.observation_space.shape[0] 178 | action_dim = env.action_space.shape[0] 179 | 180 | if random_seed: 181 | print("Random Seed: 
{}".format(random_seed)) 182 | torch.manual_seed(random_seed) 183 | env.seed(random_seed) 184 | np.random.seed(random_seed) 185 | 186 | memory = Memory() 187 | ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip) 188 | print(lr, betas) 189 | 190 | # logging variables 191 | running_reward = 0 192 | avg_length = 0 193 | time_step = 0 194 | 195 | # training loop 196 | for i_episode in range(1, max_episodes + 1): 197 | state = env.reset() 198 | for t in range(max_timesteps): 199 | time_step += 1 200 | # Running policy_old: 201 | action = ppo.select_action(state, memory) 202 | state, reward, done, _ = env.step(action) 203 | 204 | # Saving reward and is_terminals: 205 | memory.rewards.append(reward) 206 | memory.is_terminals.append(done) 207 | 208 | # update if its time 209 | if time_step % update_timestep == 0: 210 | ppo.update(memory) 211 | memory.clear_memory() 212 | time_step = 0 213 | running_reward += reward 214 | if render: 215 | env.render() 216 | if done: 217 | break 218 | 219 | avg_length += t 220 | 221 | # stop training if avg_reward > solved_reward 222 | if running_reward > (log_interval * solved_reward): 223 | print("########## Solved! ##########") 224 | torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(env_name)) 225 | break 226 | 227 | # save every 500 episodes 228 | if i_episode % 500 == 0: 229 | torch.save(ppo.policy.state_dict(), './PPO_continuous_{}.pth'.format(env_name)) 230 | 231 | # logging 232 | if i_episode % log_interval == 0: 233 | avg_length = int(avg_length / log_interval) 234 | running_reward = int((running_reward / log_interval)) 235 | 236 | print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward)) 237 | running_reward = 0 238 | avg_length = 0 239 | 240 | 241 | if __name__ == '__main__': 242 | main() 243 | 244 | -------------------------------------------------------------------------------- /Continuous_action/PPO_continuous_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mengwanglalala/RL-algorithms/97f5b3e3b570ecb3c88ecf5f1ade148552103071/Continuous_action/PPO_continuous_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /Continuous_action/SAC.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mar 4 2021 3 | @author: wangmeng 4 | """ 5 | import math 6 | import random 7 | import gym 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.nn.functional as F 13 | from torch.distributions import Normal 14 | import matplotlib.pyplot as plt 15 | 16 | use_cuda = torch.cuda.is_available() 17 | device = torch.device("cuda" if use_cuda else "cpu") 18 | 19 | 20 | class ValueNetwork(nn.Module): 21 | def __init__(self, state_dim, hidden_dim, init_w=3e-3): 22 | super(ValueNetwork, self).__init__() 23 | 24 | self.linear1 = nn.Linear(state_dim, hidden_dim) 25 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 26 | self.linear3 = nn.Linear(hidden_dim, 1) 27 | 28 | self.linear3.weight.data.uniform_(-init_w, init_w) 29 | self.linear3.bias.data.uniform_(-init_w, init_w) 30 | 31 | def forward(self, state): 32 | x = F.relu(self.linear1(state)) 33 | x = F.relu(self.linear2(x)) 34 | x = self.linear3(x) 35 | return x 36 | 37 | #网络结构与ValueNetwork相同 38 | class SoftQNetwork(nn.Module): 39 | def __init__(self, num_inputs, num_actions, 
hidden_size, init_w=3e-3): 40 | super(SoftQNetwork, self).__init__() 41 | 42 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) 43 | self.linear2 = nn.Linear(hidden_size, hidden_size) 44 | self.linear3 = nn.Linear(hidden_size, 1) 45 | 46 | self.linear3.weight.data.uniform_(-init_w, init_w) 47 | self.linear3.bias.data.uniform_(-init_w, init_w) 48 | 49 | def forward(self, state, action): 50 | x = torch.cat([state, action], 1) 51 | x = F.relu(self.linear1(x)) 52 | x = F.relu(self.linear2(x)) 53 | x = self.linear3(x) 54 | return x 55 | 56 | 57 | class PolicyNetwork(nn.Module): 58 | def __init__(self, num_inputs, num_actions, hidden_size, init_w = 3e-3, log_std_min=-20, log_std_max=2):#多了标准差计算 59 | super(PolicyNetwork, self).__init__() 60 | self.log_std_min = log_std_min 61 | self.log_std_max = log_std_max 62 | 63 | self.linear1 = nn.Linear(num_inputs, hidden_size) 64 | self.linear2 = nn.Linear(hidden_size, hidden_size) 65 | 66 | #算均值 67 | self.mean_linear = nn.Linear(hidden_size, num_actions) 68 | self.mean_linear.weight.data.uniform_(-init_w, init_w) 69 | self.mean_linear.bias.data.uniform_(-init_w, init_w) 70 | 71 | #算标准差 72 | self.log_std_linear = nn.Linear(hidden_size, num_actions) 73 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 74 | self.log_std_linear.bias.data.uniform_(-init_w, init_w) 75 | 76 | 77 | def forward(self, x): 78 | x = F.relu(self.linear1(x)) 79 | x = F.relu(self.linear2(x)) 80 | mean = self.mean_linear(x) 81 | log_std = self.log_std_linear(x) 82 | #clamp将输入张量每个元素的夹紧到区间内 83 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 84 | return mean, log_std 85 | 86 | def evaluate(self,state, epsilon = 1e-6): 87 | mean, log_std = self.forward(state) 88 | std = log_std.exp() 89 | 90 | #建立动作空间的各动作概率分布 91 | normal = Normal(mean, std) 92 | z = normal.sample() 93 | action = torch.tanh(z) 94 | 95 | log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon) 96 | log_prob = log_prob.sum(-1, keepdim=True) 97 | return action, log_prob, z, mean, log_std 98 | 99 | def get_action(self, state): 100 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 101 | mean, log_std = self.forward(state) 102 | std = log_std.exp() 103 | normal = Normal(mean, std) 104 | z = normal.sample() 105 | action = torch.tanh(z) 106 | action = action.detach().cpu().numpy() 107 | return action[0] 108 | 109 | class ReplayBuffer: 110 | def __init__(self, capacity): 111 | self.capacity = capacity 112 | self.buffer = [] 113 | self.position = 0 114 | 115 | def push(self, state, action, reward, next_state, done): 116 | if len(self.buffer) < self.capacity: 117 | self.buffer.append(None) 118 | self.buffer[self.position] = (state, action, reward, next_state, done) 119 | self.position = (self.position + 1) % self.capacity 120 | 121 | def sample(self, batch_size): 122 | batch = random.sample(self.buffer, batch_size) 123 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 124 | return state, action, reward, next_state, done 125 | 126 | def __len__(self): 127 | return len(self.buffer) 128 | 129 | 130 | class NormalizedActions(gym.ActionWrapper): 131 | 132 | def action(self, action): 133 | low_bound = self.action_space.low 134 | upper_bound = self.action_space.high 135 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) 136 | action = np.clip(action, low_bound, upper_bound) 137 | return action 138 | 139 | def reverse_action(self, action): 140 | low_bound = self.action_space.low 141 | upper_bound = self.action_space.high 142 | action = 
2 * (action - low_bound) / (upper_bound - low_bound) - 1 143 | action = np.clip(action, low_bound, upper_bound) 144 | 145 | return action 146 | 147 | class SAC(object): 148 | def __init__(self, action_dim, state_dim, hidden_dim): 149 | super(SAC, self).__init__() 150 | 151 | self.replay_buffer_size = 1000000 152 | self.replay_buffer = ReplayBuffer(self.replay_buffer_size) 153 | 154 | #与DDPG略显不同,少了target police,多了 soft q net 155 | self.value_net = ValueNetwork(state_dim, hidden_dim).to(device) 156 | self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(device) 157 | 158 | self.soft_q_net = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) 159 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device) 160 | 161 | #复制value network参数至target网络中 162 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 163 | target_param.data.copy_(param.data) 164 | 165 | self.value_criterion = nn.MSELoss() 166 | #多了soft q 的mseloss 167 | self.soft_q_criterion = nn.MSELoss() 168 | 169 | self.value_lr = 3e-4 170 | self.soft_q_lr = 3e-4 171 | self.policy_lr = 3e-4 172 | 173 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr) 174 | self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=self.soft_q_lr) 175 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr) 176 | 177 | def soft_q_update(self, batch_size, 178 | gamma=0.99, 179 | mean_lambda=1e-3, 180 | std_lambda=1e-3, 181 | z_lambda=0.0, 182 | soft_tau=1e-2, 183 | ): 184 | state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) 185 | 186 | state = torch.FloatTensor(state).to(device) 187 | next_state = torch.FloatTensor(next_state).to(device) 188 | action = torch.FloatTensor(action).to(device) 189 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device) 190 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) 191 | 192 | expected_q_value = self.soft_q_net(state, action) 193 | expected_value = self.value_net(state) 194 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) 195 | 196 | target_value = self.target_value_net(next_state) 197 | next_q_value = reward + (1 - done) * gamma * target_value 198 | q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach()) 199 | 200 | expected_new_q_value = self.soft_q_net(state, new_action) 201 | next_value = expected_new_q_value - log_prob 202 | value_loss = self.value_criterion(expected_value, next_value.detach()) 203 | 204 | log_prob_target = expected_new_q_value - expected_value 205 | policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean() 206 | 207 | mean_loss = mean_lambda * mean.pow(2).mean() 208 | std_loss = std_lambda * log_std.pow(2).mean() 209 | z_loss = z_lambda * z.pow(2).sum(1).mean() 210 | 211 | policy_loss += mean_loss + std_loss + z_loss 212 | 213 | self.soft_q_optimizer.zero_grad() 214 | q_value_loss.backward() 215 | self.soft_q_optimizer.step() 216 | 217 | self.value_optimizer.zero_grad() 218 | value_loss.backward() 219 | self.value_optimizer.step() 220 | 221 | self.policy_optimizer.zero_grad() 222 | policy_loss.backward() 223 | self.policy_optimizer.step() 224 | 225 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 226 | target_param.data.copy_( 227 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 228 | ) 229 | 230 | def plot(frame_idx, rewards): 231 | #plt.figure(figsize=(20,5)) 232 | 
#plt.subplot(131) 233 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) 234 | plt.plot(rewards) 235 | plt.show() 236 | 237 | def main(): 238 | env = gym.make("Pendulum-v0") 239 | env = NormalizedActions(env) 240 | 241 | action_dim = env.action_space.shape[0] 242 | state_dim = env.observation_space.shape[0] 243 | hidden_dim = 256 244 | 245 | model = SAC(action_dim, state_dim, hidden_dim) 246 | 247 | max_frames = 40000 248 | max_steps = 500 249 | frame_idx = 0 250 | rewards = [] 251 | batch_size = 128 252 | 253 | max_frames = 40000 254 | 255 | while frame_idx < max_frames: 256 | state = env.reset() 257 | episode_reward = 0 258 | 259 | for step in range(max_steps): 260 | action = model.policy_net.get_action(state) 261 | next_state, reward, done, _ = env.step(action) 262 | 263 | model.replay_buffer.push(state, action, reward, next_state, done) 264 | if len(model.replay_buffer) > batch_size: 265 | model.soft_q_update(batch_size) 266 | 267 | state = next_state 268 | episode_reward += reward 269 | frame_idx += 1 270 | 271 | if frame_idx % 1000 == 0: 272 | plot(frame_idx, rewards) 273 | 274 | if done: 275 | break 276 | 277 | rewards.append(episode_reward) 278 | 279 | 280 | if __name__ == '__main__': 281 | main() -------------------------------------------------------------------------------- /Continuous_action/TD3.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import random 4 | 5 | import gym 6 | import numpy as np 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import torch.nn.functional as F 12 | from torch.distributions import Normal 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | use_cuda = torch.cuda.is_available() 17 | device = torch.device("cuda" if use_cuda else "cpu") 18 | 19 | 20 | class ReplayBuffer: 21 | def __init__(self, capacity): 22 | self.capacity = capacity 23 | self.buffer = [] 24 | self.position = 0 25 | 26 | def push(self, state, action, reward, next_state, done): 27 | if len(self.buffer) < self.capacity: 28 | self.buffer.append(None) 29 | self.buffer[self.position] = (state, action, reward, next_state, done) 30 | self.position = (self.position + 1) % self.capacity 31 | 32 | def sample(self, batch_size): 33 | batch = random.sample(self.buffer, batch_size) 34 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 35 | return state, action, reward, next_state, done 36 | 37 | def __len__(self): 38 | return len(self.buffer) 39 | 40 | 41 | class NormalizedActions(gym.ActionWrapper): 42 | def action(self, action): 43 | low = self.action_space.low 44 | high = self.action_space.high 45 | 46 | action = low + (action + 1.0) * 0.5 * (high - low) 47 | action = np.clip(action, low, high) 48 | 49 | return action 50 | 51 | def reverse_action(self, action): 52 | low = self.action_space.low 53 | high = self.action_space.high 54 | 55 | action = 2 * (action - low) / (high - low) - 1 56 | action = np.clip(action, low, high) 57 | 58 | return action 59 | 60 | 61 | class GaussianExploration(object): 62 | def __init__(self, action_space, max_sigma=1.0, min_sigma=1.0, decay_period=1000000): 63 | self.low = action_space.low 64 | self.high = action_space.high 65 | self.max_sigma = max_sigma 66 | self.min_sigma = min_sigma 67 | self.decay_period = decay_period 68 | 69 | def get_action(self, action, t=0): 70 | sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) 71 | action = action + np.random.normal(size=len(action)) * sigma 72 | 
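# With the default max_sigma == min_sigma == 1.0 the noise scale never actually decays; the clip below keeps the perturbed action inside the environment's action bounds.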
return np.clip(action, self.low, self.high) 73 | 74 | 75 | def soft_update(net, target_net, soft_tau=1e-2): 76 | for target_param, param in zip(target_net.parameters(), net.parameters()): 77 | target_param.data.copy_( 78 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 79 | ) 80 | 81 | 82 | def plot(frame_idx, rewards): 83 | plt.figure(figsize=(20, 5)) 84 | plt.subplot(131) 85 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) 86 | plt.plot(rewards) 87 | plt.show() 88 | 89 | 90 | class ValueNetwork(nn.Module): 91 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3): 92 | super(ValueNetwork, self).__init__() 93 | 94 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) 95 | self.linear2 = nn.Linear(hidden_size, hidden_size) 96 | self.linear3 = nn.Linear(hidden_size, 1) 97 | 98 | self.linear3.weight.data.uniform_(-init_w, init_w) 99 | self.linear3.bias.data.uniform_(-init_w, init_w) 100 | 101 | def forward(self, state, action): 102 | x = torch.cat([state, action], 1) 103 | x = F.relu(self.linear1(x)) 104 | x = F.relu(self.linear2(x)) 105 | x = self.linear3(x) 106 | return x 107 | 108 | 109 | class PolicyNetwork(nn.Module): 110 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3): 111 | super(PolicyNetwork, self).__init__() 112 | 113 | self.linear1 = nn.Linear(num_inputs, hidden_size) 114 | self.linear2 = nn.Linear(hidden_size, hidden_size) 115 | self.linear3 = nn.Linear(hidden_size, num_actions) 116 | 117 | self.linear3.weight.data.uniform_(-init_w, init_w) 118 | self.linear3.bias.data.uniform_(-init_w, init_w) 119 | 120 | def forward(self, state): 121 | x = F.relu(self.linear1(state)) 122 | x = F.relu(self.linear2(x)) 123 | x = F.tanh(self.linear3(x)) 124 | return x 125 | 126 | def get_action(self, state): 127 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 128 | action = self.forward(state) 129 | return action.detach().cpu().numpy()[0] 130 | 131 | class TD(object): 132 | def __init__(self, action_dim, state_dim, hidden_dim): 133 | super(TD, self).__init__() 134 | self.action_dim, self.state_dim, self.hidden_dim = action_dim, state_dim, hidden_dim 135 | self.batch_size = 128 136 | self.gamma = 0.99 137 | self.soft_tau = 1e-2 138 | self.noise_std = 0.2 139 | self.noise_clip = 0.5 140 | self.policy_update = 2 141 | self.soft_tau = 1e-2 142 | self.replay_buffer_size = 1000000 143 | self.value_lr = 1e-3 144 | self.policy_lr = 1e-3 145 | 146 | self.value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device) 147 | self.value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device) 148 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device) 149 | 150 | self.target_value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device) 151 | self.target_value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device) 152 | self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device) 153 | 154 | soft_update(self.value_net1, self.target_value_net1, soft_tau=1.0) 155 | soft_update(self.value_net2, self.target_value_net2, soft_tau=1.0) 156 | soft_update(self.policy_net, self.target_policy_net, soft_tau=1.0) 157 | 158 | self.value_criterion = nn.MSELoss() 159 | 160 | self.value_optimizer1 = optim.Adam(self.value_net1.parameters(), lr=self.value_lr) 161 | self.value_optimizer2 = optim.Adam(self.value_net2.parameters(), lr=self.value_lr) 162 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr) 163 | 164 | 
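# One shared replay buffer serves both critics and the actor; the delayed policy update (every self.policy_update steps) reuses the same sampled batch.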
self.replay_buffer = ReplayBuffer(self.replay_buffer_size) 165 | 166 | 167 | def td3_update(self, step, batch_size): 168 | 169 | state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) 170 | 171 | state = torch.FloatTensor(state).to(device) 172 | next_state = torch.FloatTensor(next_state).to(device) 173 | action = torch.FloatTensor(action).to(device) 174 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device) 175 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) 176 | 177 | next_action = self.target_policy_net(next_state) 178 | noise = torch.normal(torch.zeros(next_action.size()), self.noise_std).to(device) 179 | noise = torch.clamp(noise, -self.noise_clip, self.noise_clip) 180 | next_action += noise 181 | 182 | target_q_value1 = self.target_value_net1(next_state, next_action) 183 | target_q_value2 = self.target_value_net2(next_state, next_action) 184 | target_q_value = torch.min(target_q_value1, target_q_value2) 185 | expected_q_value = reward + (1.0 - done) * self.gamma * target_q_value 186 | 187 | q_value1 = self.value_net1(state, action) 188 | q_value2 = self.value_net2(state, action) 189 | 190 | value_loss1 = self.value_criterion(q_value1, expected_q_value.detach()) 191 | value_loss2 = self.value_criterion(q_value2, expected_q_value.detach()) 192 | 193 | self.value_optimizer1.zero_grad() 194 | value_loss1.backward() 195 | self.value_optimizer1.step() 196 | 197 | self.value_optimizer2.zero_grad() 198 | value_loss2.backward() 199 | self.value_optimizer2.step() 200 | 201 | if step % self.policy_update == 0: 202 | policy_loss = self.value_net1(state, self.policy_net(state)) 203 | policy_loss = -policy_loss.mean() 204 | 205 | self.policy_optimizer.zero_grad() 206 | policy_loss.backward() 207 | self.policy_optimizer.step() 208 | 209 | soft_update(self.value_net1, self.target_value_net1, soft_tau=self.soft_tau) 210 | soft_update(self.value_net2, self.target_value_net2, soft_tau=self.soft_tau) 211 | soft_update(self.policy_net, self.target_policy_net, soft_tau=self.soft_tau) 212 | 213 | def main(): 214 | env = NormalizedActions(gym.make('Pendulum-v0')) 215 | noise = GaussianExploration(env.action_space) 216 | 217 | state_dim = env.observation_space.shape[0] 218 | action_dim = env.action_space.shape[0] 219 | hidden_dim = 256 220 | 221 | TD3 = TD(action_dim, state_dim, hidden_dim) 222 | 223 | 224 | max_frames = 10000 225 | max_steps = 500 226 | frame_idx = 0 227 | rewards = [] 228 | batch_size = 128 229 | 230 | while frame_idx < max_frames: 231 | state = env.reset() 232 | episode_reward = 0 233 | 234 | for step in range(max_steps): 235 | action = TD3.policy_net.get_action(state) 236 | action = noise.get_action(action, step) 237 | next_state, reward, done, _ = env.step(action) 238 | 239 | TD3.replay_buffer.push(state, action, reward, next_state, done) 240 | if len(TD3.replay_buffer) > batch_size: 241 | TD3.td3_update(step, batch_size) 242 | 243 | state = next_state 244 | episode_reward += reward 245 | frame_idx += 1 246 | 247 | if frame_idx % 1000 == 0: 248 | plot(frame_idx, rewards) 249 | 250 | if done: 251 | break 252 | 253 | rewards.append(episode_reward) 254 | 255 | if __name__ == '__main__': 256 | main() 257 | 258 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /Discrete_action/Actor_Critic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mar 3 2021 3 | @author: wangmeng 4 | """ 5 | import math 6 | import random 7 | 8 | import gym 9 
| import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | import torch.nn.functional as F 15 | from torch.distributions import Categorical 16 | import matplotlib.pyplot as plt 17 | 18 | from multiprocessing_env import SubprocVecEnv 19 | 20 | use_cuda = torch.cuda.is_available() 21 | device = torch.device("cuda" if use_cuda else "cpu") 22 | 23 | class ActorCritic(nn.Module): 24 | def __init__(self, inputs, outputs, hidden_size, std=0.0): 25 | super(ActorCritic, self).__init__() 26 | self.Actor = nn.Sequential( 27 | nn.Linear(inputs, hidden_size), 28 | nn.ReLU(), 29 | nn.Linear(hidden_size, outputs), 30 | #使得在softmax操作之后在dim这个维度相加等于1 31 | #注意,默认的方法已经弃用,最好在使用的时候声明dim 32 | nn.Softmax(dim=1) 33 | 34 | ) 35 | self.Critic = nn.Sequential( 36 | nn.Linear(inputs, hidden_size), 37 | nn.ReLU(), 38 | nn.Linear(hidden_size,1) 39 | ) 40 | def forward(self,x): 41 | value = self.Critic(x) 42 | probs = self.Actor(x) 43 | #分类,对actor输出的动作概率进行分类统计 44 | dist = Categorical(probs) 45 | return dist, value 46 | 47 | def make_env(): 48 | def _thunk(): 49 | env = gym.make("CartPole-v0") 50 | return env 51 | return _thunk 52 | 53 | #通过N步采样,以加速收敛,这里是计算优势函数 54 | def compute_returns(next_value, rewards, masks, gamma=0.99): 55 | R = next_value 56 | returns = [] 57 | for step in reversed(range(len(rewards))): 58 | R = rewards[step] + gamma * R * masks[step] 59 | #list.insert(index, obj),index -- 对象 obj 需要插入的索引位置。 60 | returns.insert(0, R) 61 | return returns 62 | 63 | def test_env(model, env,vis=False): 64 | state = env.reset() 65 | if vis: env.render() 66 | done = False 67 | total_reward = 0 68 | while not done: 69 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 70 | dist, _ = model(state) 71 | next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0]) 72 | state = next_state 73 | if vis: env.render() 74 | total_reward += reward 75 | return total_reward 76 | 77 | def plot(frame_idx, rewards): 78 | plt.figure(figsize=(20,5)) 79 | plt.subplot(131) 80 | plt.title('frame %s. 
reward: %s' % (frame_idx, rewards[-1])) 81 | plt.plot(rewards) 82 | plt.show() 83 | 84 | def main(): 85 | num_envs = 16 86 | envs = [make_env() for i in range(num_envs)] 87 | envs = SubprocVecEnv(envs) 88 | env = gym.make("CartPole-v0") 89 | 90 | num_inputs = envs.observation_space.shape[0] 91 | num_outputs = envs.action_space.n 92 | # Hyper params: 93 | hidden_size = 256 94 | lr = 3e-4 95 | num_steps = 5 96 | 97 | model = ActorCritic(num_inputs,num_outputs,hidden_size).to(device) 98 | 99 | optimizer = optim.Adam(model.parameters()) 100 | 101 | max_frames = 20000 102 | frame_idx = 0 103 | test_rewards = [] 104 | state = envs.reset() 105 | 106 | while frame_idx < max_frames: 107 | 108 | log_probs = [] 109 | values = [] 110 | rewards = [] 111 | masks = [] 112 | entropy = 0 113 | 114 | #每个子网络运行num_steps个steps,实现n步采样 115 | for _ in range(num_steps): 116 | state = torch.FloatTensor(state).to(device) 117 | dist, value = model(state) 118 | action = dist.sample() 119 | next_state, reward, done, _ = envs.step(action.cpu().numpy()) 120 | log_prob = dist.log_prob(action) 121 | entropy += dist.entropy().mean() 122 | 123 | #记录下这num_steps步的各子网络相关参数 124 | log_probs.append(log_prob) 125 | values.append(value) 126 | rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) 127 | masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) 128 | 129 | state = next_state 130 | frame_idx += 1 131 | 132 | if frame_idx % 100 == 0: 133 | test_rewards.append(np.mean([test_env(model, env) for _ in range(10)])) 134 | plot(frame_idx, test_rewards) 135 | 136 | #将子网络的参数传给主网络,并进行参数更新 137 | next_state = torch.FloatTensor(next_state).to(device) 138 | _, next_value = model(next_state) 139 | returns = compute_returns(next_value, rewards, masks) 140 | 141 | #将5个step的值串起来 142 | log_probs = torch.cat(log_probs) 143 | returns = torch.cat(returns).detach() 144 | values = torch.cat(values) 145 | 146 | advantage = returns - values 147 | #计算loss均值 148 | actor_loss = -(log_probs * advantage.detach()).mean() 149 | critic_loss = advantage.pow(2).mean() 150 | 151 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy 152 | 153 | optimizer.zero_grad() 154 | loss.backward() 155 | optimizer.step() 156 | 157 | if __name__ == '__main__': 158 | main() -------------------------------------------------------------------------------- /Discrete_action/CnnDQN.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Feb 27 2021 3 | @author: wangmeng 4 | """ 5 | import math, random 6 | import gym 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import torch.autograd as autograd 12 | import torch.nn.functional as F 13 | from collections import Counter 14 | from collections import deque 15 | import matplotlib.pyplot as plt 16 | from wrappers import make_atari, wrap_deepmind, wrap_pytorch 17 | 18 | USE_CUDA = torch.cuda.is_available() 19 | #将变量放到cuda上 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class CnnDQN(nn.Module): 23 | def __init__(self, observation_space, action_sapce): 24 | super(CnnDQN, self).__init__() 25 | 26 | self.observation_space = observation_space 27 | self.action_sapce = action_sapce 28 | 29 | self.features = nn.Sequential( 30 | nn.Conv2d(self.observation_space[0], 32, kernel_size=8, stride=4), 31 | nn.ReLU(), 32 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 33 | nn.ReLU(), 34 | nn.Conv2d(64, 64, kernel_size=3, 
stride=1), 35 | nn.ReLU() 36 | ) 37 | 38 | self.fc = nn.Sequential( 39 | nn.Linear(7 * 7 * 64, 512), 40 | nn.ReLU(), 41 | nn.Linear(512,self.action_sapce) 42 | ) 43 | 44 | def forward(self,x): 45 | x = self.features(x) 46 | x = x.view(x.size(0), -1)#将多维度的Tensor展平成一维 47 | # x.size(0)指batchsize的值,x = x.view(x.size(0), -1)简化x = x.view(batchsize, -1),view()函数的功能根reshape类似,用来转换size大小。 48 | # x = x.view(batchsize, -1)中batchsize指转换后有几行,而-1指在不告诉函数有多少列的情况下,根据原tensor数据和batchsize自动分配列数。 49 | x = self.fc(x) 50 | return x 51 | 52 | # def feature_size(self): 53 | # #这里就很粗暴,先建立一个大小和预期输入的全0tensor,送入features中运行,最后得到输出,展平,读取长度。这里是7 * 7 * 64 54 | # return self.features(autograd.Variable(torch.zeros(1, *self.observation_space))).view(1, -1).size(1) 55 | 56 | def act(self, state, epsilon): 57 | if random.random() > epsilon: 58 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)#(1,1,84,84) 59 | q_value = self.forward(state) 60 | action = q_value.max(1)[1].data[0] 61 | action = action.cpu().numpy() # 从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 62 | action = int(action) 63 | 64 | else: 65 | action = random.randrange(self.action_sapce) 66 | return action 67 | 68 | class ReplayBuffer(object): 69 | def __init__(self, capacity): 70 | #deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 71 | #可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 72 | self.buffer = deque (maxlen = capacity) 73 | 74 | def push (self, state ,aciton, reward, next_state, done): 75 | state = np.expand_dims(state,0) 76 | #这里增加维度的操作是为了便于之后使用concatenate进行拼接 77 | next_state = np.expand_dims(next_state,0) 78 | self.buffer.append((state, aciton, reward, next_state, done)) 79 | 80 | def sample(self, batch_size): 81 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 82 | state , action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 83 | #最后使用concatenate对数组进行拼接,相当于少了一个维度 84 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 85 | 86 | 87 | def compute_td_loss(model,optimizer, replay_buffer, gamma, batch_size): 88 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 89 | #通通丢到GPU上去 90 | state = Variable(torch.FloatTensor(np.float32(state))) 91 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True) 92 | action = Variable(torch.LongTensor(action)) 93 | reward = Variable(torch.FloatTensor(reward)) 94 | done = Variable(torch.FloatTensor(done)) 95 | 96 | q_values = model(state) 97 | next_q_values = model(next_state) 98 | 99 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 100 | #gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 101 | #dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 102 | #这里增加维度主要是为了方便gather操作,之后再删除该维度 103 | next_q_value = next_q_values.max(1)[0] 104 | 105 | expected_q_value = reward + gamma * next_q_value * (1 - done) 106 | 107 | loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() 108 | 109 | optimizer.zero_grad() 110 | loss.backward() 111 | optimizer.step() 112 | 113 | return loss 114 | 115 | 116 | def main(): 117 | env_id = "PongNoFrameskip-v4" 118 | env = make_atari(env_id) 119 | env = wrap_deepmind(env) 120 | env = wrap_pytorch(env) 121 | 122 | observation_space = env.observation_space.shape 123 | action_sapce = env.action_space.n 124 | 125 | model = CnnDQN(observation_space, action_sapce) 126 | 127 | if USE_CUDA: 128 | model = model.cuda() 129 | 130 | optimizer = optim.Adam(model.parameters()) 131 | 132 | replay_buffer = ReplayBuffer(1000) 
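# Note: a 1000-transition replay buffer is very small for Atari; the original DQN setup kept on the order of one million transitions, so expect limited performance here.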
133 | 134 | batch_size = 32 135 | gamma = 0.99 136 | replay_initial = 100 137 | num_frames = 14000 138 | 139 | losses = [] 140 | all_rewards = [] 141 | x_axis1 = [] 142 | x_axis2= [] 143 | episode_reward = 0 144 | 145 | epsilon_start = 1.0 146 | epsilon_final = 0.01 147 | epsilon_decay = 30000 148 | 149 | # 要求探索率随着迭代次数增加而减小 150 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) 151 | 152 | state = env.reset() 153 | 154 | for frame_idx in range(1, num_frames + 1): 155 | #显示动画 156 | env.render() 157 | epsilon = epsilon_by_frame(frame_idx) 158 | action = model.act(state, epsilon) 159 | next_state, reward, done, _ = env.step(action) 160 | replay_buffer.push(state, action, reward, next_state, done) 161 | state = next_state 162 | episode_reward += reward 163 | 164 | if done: 165 | state = env.reset() 166 | x_axis1.append(frame_idx) 167 | all_rewards.append(episode_reward) 168 | episode_reward = 0 169 | 170 | if frame_idx+1 > replay_initial: 171 | loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size) 172 | x_axis2.append(frame_idx) 173 | losses.append(np.array(loss.data.cpu())) 174 | 175 | 176 | 177 | if frame_idx % 100 == 0: 178 | plt.figure(1) 179 | plt.subplot(121) 180 | plt.plot(x_axis1, all_rewards) 181 | plt.subplot(122) 182 | plt.plot(x_axis2, losses) 183 | plt.show() 184 | 185 | env.close() 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | if __name__ == '__main__': 195 | main() -------------------------------------------------------------------------------- /Discrete_action/D3QN.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mae 1 2021 3 | @author: wangmeng 4 | """ 5 | 6 | import math, random 7 | import gym 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.autograd as autograd 13 | import torch.nn.functional as F 14 | from collections import Counter 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | 18 | USE_CUDA = torch.cuda.is_available() 19 | #将变量放到cuda上 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class DuelingDQN(nn.Module): 23 | def __init__(self, observation_space, action_sapce): 24 | super(DuelingDQN, self).__init__() 25 | 26 | self.observation_space = observation_space 27 | self.action_sapce = action_sapce 28 | 29 | #######################改动部分############################# 30 | self.feature = nn.Sequential( 31 | nn.Linear(observation_space,128), 32 | nn.ReLU() 33 | ) 34 | self.advantage = nn.Sequential( 35 | nn.Linear(128, 128), 36 | nn.ReLU(), 37 | nn.Linear(128, action_sapce), 38 | ) 39 | self.value = nn.Sequential( 40 | nn.Linear(128,128), 41 | nn.ReLU(), 42 | nn.Linear(128,1), 43 | 44 | ) 45 | 46 | 47 | def forward(self, x): 48 | x = self.feature(x) 49 | advantage = self.advantage(x) 50 | value = self.value(x) 51 | #这里不减去advantage均值的话会导致训练不稳定,因为value的作用可能有的时候被忽略掉了,有的时候又突然非常大。 52 | return value + advantage - advantage.mean() 53 | 54 | def act(self, state, epsilon): 55 | if random.random() > epsilon: 56 | #如果使用的是GPU,这里需要把数据丢到GPU上 57 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)#volatile的作用是作为指令关键字,确保本条指令不会因编译器的优化而省略,且要求每次直接读值。 58 | #.squeeze() 把数据条目中维度为1 的删除掉 59 | q_value = self.forward(state) 60 | action = q_value.max(1)[1].data[0] 61 | #max(1)返回每一行中最大值的那个元素,且返回其索引,max(0)是列 62 | 
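# i.e. q_value.max(1) returns (max values, argmax indices) along the action dimension; [1] selects the greedy action index.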
#max()[1]只返回最大值的每个索引,max()[0], 只返回最大值的每个数 63 | 64 | action = action.cpu().numpy()#从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 65 | action =int(action) 66 | else: 67 | action = random.randrange(self.action_sapce)#返回指定递增基数集合中的一个随机数,基数默认值为1。 68 | return action 69 | 70 | class ReplayBuffer(object): 71 | def __init__(self, capacity): 72 | #deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 73 | #可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 74 | self.buffer = deque(maxlen = capacity) 75 | 76 | def push (self, state, aciton, reward, next_state, done): 77 | state = np.expand_dims(state,0) 78 | #这里增加维度的操作是为了便于之后使用concatenate进行拼接 79 | next_state = np.expand_dims(next_state,0) 80 | self.buffer.append((state, aciton, reward, next_state, done)) 81 | 82 | def sample(self, batch_size): 83 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 84 | state , action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 85 | #最后使用concatenate对数组进行拼接,相当于少了一个维度 86 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 87 | 88 | 89 | def compute_td_loss(current_model, target_model,optimizer, replay_buffer, gamma, batch_size): 90 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 91 | #通通丢到GPU上去 92 | state = Variable(torch.FloatTensor(np.float32(state))) 93 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True) 94 | action = Variable(torch.LongTensor(action)) 95 | reward = Variable(torch.FloatTensor(reward)) 96 | done = Variable(torch.FloatTensor(done)) 97 | 98 | q_values = current_model(state) 99 | next_q_values = target_model(next_state) 100 | 101 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 102 | #gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 103 | #dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 104 | #这里增加维度主要是为了方便gather操作,之后再删除该维度 105 | next_q_value = next_q_values.max(1)[0] 106 | 107 | expected_q_value = reward + gamma * next_q_value * (1 - done) 108 | 109 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() 110 | loss = (q_value - expected_q_value.detach()).pow(2).mean() 111 | 112 | optimizer.zero_grad() 113 | loss.backward() 114 | optimizer.step() 115 | 116 | return loss 117 | 118 | def update_target(current_model, target_model): 119 | target_model.load_state_dict(current_model.state_dict()) 120 | 121 | 122 | def main(): 123 | env_id = "CartPole-v0" 124 | env = gym.make(env_id) 125 | 126 | observation_space = env.observation_space.shape[0] 127 | action_sapce = env.action_space.n 128 | 129 | current_model = DuelingDQN(observation_space, action_sapce) 130 | target_model = DuelingDQN(observation_space, action_sapce) 131 | 132 | if USE_CUDA: 133 | current_model = current_model.cuda() 134 | target_model = target_model.cuda() 135 | 136 | optimizer = optim.Adam(current_model.parameters()) 137 | 138 | replay_buffer = ReplayBuffer(1000) 139 | 140 | update_target(current_model, target_model) 141 | 142 | batch_size = 32 143 | gamma = 0.99 144 | 145 | num_frames = 10000 146 | 147 | losses = [] 148 | all_rewards = [] 149 | x_axis1 = [] 150 | x_axis2 = [] 151 | episode_reward = 0 152 | 153 | epsilon_start = 1.0 154 | epsilon_final = 0.01 155 | epsilon_decay = 500 156 | 157 | #要求探索率随着迭代次数增加而减小 158 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. 
* frame_idx / epsilon_decay) 159 | 160 | state = env.reset() 161 | for frame_idx in range(1, num_frames + 1): 162 | #显示动画 163 | #env.render() 164 | epsilon = epsilon_by_frame(frame_idx) 165 | action = current_model.act(state, epsilon) 166 | next_state, reward, done, _ = env.step(action) 167 | replay_buffer.push(state, action, reward, next_state, done) 168 | state = next_state 169 | episode_reward += reward 170 | 171 | if done: 172 | state = env.reset() 173 | x_axis1.append(frame_idx) 174 | all_rewards.append(episode_reward) 175 | episode_reward = 0 176 | 177 | if frame_idx+1 > batch_size: 178 | x_axis2.append(frame_idx) 179 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size) 180 | losses.append(np.array(loss.data.cpu())) 181 | 182 | if frame_idx % 100 == 0: 183 | update_target(current_model, target_model) 184 | 185 | 186 | if frame_idx % 200 == 0: 187 | plt.figure(1) 188 | plt.subplot(121) 189 | plt.plot(x_axis1, all_rewards) 190 | plt.subplot(122) 191 | plt.plot(x_axis2, losses) 192 | plt.show() 193 | 194 | 195 | if __name__ == '__main__': 196 | main() -------------------------------------------------------------------------------- /Discrete_action/D3QN2.py: -------------------------------------------------------------------------------- 1 | #https://github.com/awill139/d3qn_pytorch 2 | import torch 3 | import numpy as np 4 | from torch import optim 5 | import torch.nn as nn 6 | 7 | import gym 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 11 | 12 | 13 | class ReplayBuffer(): 14 | def __init__(self, max_size, input_shape): 15 | self.mem_size = max_size 16 | self.mem_cntr = 0 17 | 18 | self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32) 19 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32) 20 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 21 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 22 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 23 | 24 | def store_transition(self, state, action, reward, state_, done): 25 | idx = self.mem_cntr % self.mem_size 26 | 27 | self.state_memory[idx] = state 28 | self.new_state_memory[idx] = state_ 29 | self.action_memory[idx] = action 30 | self.reward_memory[idx] = reward 31 | self.terminal_memory[idx] = done 32 | 33 | self.mem_cntr += 1 34 | 35 | def sample(self, batch_size): 36 | max_mem = min(self.mem_cntr, self.mem_size) 37 | batch = np.random.choice(max_mem, batch_size, replace=False) 38 | 39 | states = self.state_memory[batch] 40 | states_ = self.new_state_memory[batch] 41 | actions = self.action_memory[batch] 42 | rewards = self.reward_memory[batch] 43 | dones = self.terminal_memory[batch] 44 | 45 | return states, actions, rewards, states_, dones 46 | 47 | 48 | class DuelingDeepQNet(nn.Module): 49 | def __init__(self, n_actions, input_dim, fc1_dims, fc2_dims, lr=0.0003): 50 | super(DuelingDeepQNet, self).__init__() 51 | 52 | self.fc1 = nn.Linear(*input_dim, fc1_dims) 53 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 54 | self.V = nn.Linear(fc2_dims, 1) 55 | self.A = nn.Linear(fc2_dims, n_actions) 56 | 57 | self.relu1 = nn.ReLU() 58 | self.relu2 = nn.ReLU() 59 | 60 | self.optim = optim.Adam(self.parameters(), lr=lr) 61 | self.crit = nn.MSELoss() 62 | 63 | def forward(self, state): 64 | x = self.relu1(self.fc1(state)) 65 | x = self.relu2(self.fc2(x)) 66 | 67 | V = self.V(x) 68 | A = self.A(x) 69 | 70 | Q = V + (A 
- torch.mean(A, dim=1, keepdim=True)) 71 | 72 | return Q 73 | 74 | def advantage(self, state): 75 | x = self.relu1(self.fc1(state)) 76 | x = self.relu2(self.fc2(x)) 77 | 78 | return self.A(x) 79 | 80 | 81 | class Agent: 82 | def __init__(self, gamma, n_actions, epsilon, batch_size, 83 | input_dims, epsilon_decay=1e-8, eps_min=0.01, 84 | mem_size=1000000, fc1_dims=128, fc2_dims=128, replace=100): 85 | self.action_space = [i for i in range(n_actions)] 86 | self.gamma = gamma 87 | self.epsilon = epsilon 88 | self.epsilon_decay = epsilon_decay 89 | self.eps_min = eps_min 90 | self.replace = replace 91 | self.batch_size = batch_size 92 | 93 | self.learn_step_counter = 0 94 | self.memory = ReplayBuffer(max_size=mem_size, input_shape=input_dims) 95 | self.q_eval = DuelingDeepQNet(n_actions=n_actions, input_dim=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims) 96 | self.q_next = DuelingDeepQNet(n_actions=n_actions, input_dim=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims) 97 | 98 | self.q_eval.to(device) 99 | self.q_next.to(device) 100 | 101 | def store_transition(self, state, action, reward, new_state, done): 102 | self.memory.store_transition(state, action, reward, new_state, done) 103 | 104 | def choose_action(self, observation): 105 | if np.random.random() < self.epsilon: 106 | state = torch.Tensor([observation]).to(device) 107 | advantage = self.q_eval.advantage(state) 108 | action = torch.argmax(advantage).item() 109 | else: 110 | action = np.random.choice(self.action_space) 111 | 112 | return action 113 | 114 | def learn(self): 115 | if self.memory.mem_cntr < self.batch_size: 116 | return 117 | if self.learn_step_counter % self.replace == 0: 118 | self.q_next.load_state_dict(self.q_eval.state_dict()) 119 | 120 | states, actions, rewards, states_, dones = self.memory.sample(self.batch_size) 121 | 122 | states = torch.tensor(states).to(device) 123 | rewards = torch.tensor(rewards).to(device) 124 | dones = torch.tensor(dones).to(device) 125 | actions = torch.tensor(actions).to(device) 126 | states_ = torch.tensor(states_).to(device) 127 | 128 | indices = np.arange(self.batch_size) 129 | 130 | q_pred = self.q_eval(states)[indices, actions] 131 | q_next = self.q_next(states_) 132 | 133 | max_actions = torch.argmax(self.q_eval(states_), dim=1) 134 | # q_eval = self.q_eval(torch.Tensor(states_).to(device))[indices, actions] 135 | q_target = rewards + self.gamma * q_next[indices, max_actions] 136 | 137 | q_next[dones] = 0.0 138 | self.q_eval.optim.zero_grad() 139 | 140 | loss = self.q_eval.crit(q_target, q_pred) 141 | loss.backward() 142 | 143 | self.q_eval.optim.step() 144 | 145 | self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.eps_min else self.eps_min 146 | self.learn_step_counter += 1 147 | 148 | 149 | def main(): 150 | env = gym.make('LunarLander-v2') 151 | # env_id = "CartPole-v0" 152 | # env = gym.make(env_id) 153 | n_actions = env.action_space.n 154 | obs_shape = list(env.observation_space.shape) 155 | agent = Agent(gamma=0.99, n_actions=n_actions, epsilon=1.0, batch_size=64, input_dims=obs_shape) 156 | 157 | n_games = 1000 158 | scores = [] 159 | eps_history = [] 160 | 161 | 162 | 163 | for i in range(n_games): 164 | done = False 165 | score = 0 166 | obs = env.reset() 167 | 168 | 169 | while not done: 170 | env.render() 171 | action = agent.choose_action(obs) 172 | obs_, reward, done, info = env.step(action) 173 | agent.store_transition(obs, action, reward, obs_, int(done)) 174 | score += reward 175 | obs = obs_ 176 | agent.learn() 177 | 178 | 
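        # episode finished: record the current exploration rate and the episode return below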
eps_history.append(agent.epsilon) 179 | 180 | scores.append(score) 181 | 182 | 183 | avg_score = np.mean(scores[-100:]) 184 | 185 | print('episode: {}\t curr_score: {}\t avg score: {}'.format(i, score, avg_score)) 186 | 187 | 188 | env.close() 189 | 190 | if __name__ == '__main__': 191 | main() -------------------------------------------------------------------------------- /Discrete_action/DDQN.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Feb 26 2021 3 | @author: wangmeng 4 | """ 5 | 6 | import math, random 7 | import gym 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.autograd as autograd 13 | import torch.nn.functional as F 14 | from collections import Counter 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | 18 | USE_CUDA = torch.cuda.is_available() 19 | #将变量放到cuda上 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class DQN(nn.Module): 23 | def __init__(self, observation_space, action_sapce): 24 | super(DQN, self).__init__() 25 | 26 | self.observation_space = observation_space 27 | self.action_sapce = action_sapce 28 | 29 | self.layers = nn.Sequential( 30 | nn.Linear(observation_space,128), 31 | nn.ReLU(), 32 | nn.Linear(128,128), 33 | nn.ReLU(), 34 | nn.Linear(128, action_sapce) 35 | ) 36 | 37 | 38 | def forward(self, x): 39 | return self.layers(x) 40 | 41 | def act(self, state, epsilon): 42 | if random.random() > epsilon: 43 | #如果使用的是GPU,这里需要把数据丢到GPU上 44 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)#volatile的作用是作为指令关键字,确保本条指令不会因编译器的优化而省略,且要求每次直接读值。 45 | #.squeeze() 把数据条目中维度为1 的删除掉 46 | q_value = self.forward(state) 47 | action = q_value.max(1)[1].data[0] 48 | #max(1)返回每一行中最大值的那个元素,且返回其索引,max(0)是列 49 | #max()[1]只返回最大值的每个索引,max()[0], 只返回最大值的每个数 50 | 51 | action = action.cpu().numpy()#从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 52 | action =int(action) 53 | else: 54 | action = random.randrange(self.action_sapce)#返回指定递增基数集合中的一个随机数,基数默认值为1。 55 | return action 56 | 57 | class ReplayBuffer(object): 58 | def __init__(self, capacity): 59 | #deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 60 | #可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 61 | self.buffer = deque(maxlen = capacity) 62 | 63 | def push (self, state, aciton, reward, next_state, done): 64 | state = np.expand_dims(state,0) 65 | #这里增加维度的操作是为了便于之后使用concatenate进行拼接 66 | next_state = np.expand_dims(next_state,0) 67 | self.buffer.append((state, aciton, reward, next_state, done)) 68 | 69 | def sample(self, batch_size): 70 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 71 | state , action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 72 | #最后使用concatenate对数组进行拼接,相当于少了一个维度 73 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 74 | 75 | 76 | def compute_td_loss(model,optimizer, replay_buffer, gamma, batch_size): 77 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 78 | #通通丢到GPU上去 79 | state = Variable(torch.FloatTensor(np.float32(state))) 80 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True) 81 | action = Variable(torch.LongTensor(action)) 82 | reward = Variable(torch.FloatTensor(reward)) 83 | done = Variable(torch.FloatTensor(done)) 84 | 85 | q_values = model(state) 86 | next_q_values = model(next_state) 87 | 88 | q_value = 
q_values.gather(1, action.unsqueeze(1)).squeeze(1) 89 | #gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 90 | #dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 91 | #这里增加维度主要是为了方便gather操作,之后再删除该维度 92 | next_q_value = next_q_values.max(1)[0] 93 | 94 | expected_q_value = reward + gamma * next_q_value * (1 - done) 95 | 96 | loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() 97 | 98 | optimizer.zero_grad() 99 | loss.backward() 100 | optimizer.step() 101 | 102 | return loss 103 | 104 | 105 | def update_target(current_model, target_model): 106 | target_model.load_state_dict(current_model.state_dict()) 107 | 108 | 109 | def main(): 110 | env_id = "CartPole-v0" 111 | env = gym.make(env_id) 112 | 113 | observation_space = env.observation_space.shape[0] 114 | action_sapce = env.action_space.n 115 | 116 | #######################改动之处############################### 117 | current_model = DQN (observation_space, action_sapce) 118 | target_model = DQN(observation_space, action_sapce) 119 | 120 | if USE_CUDA: 121 | current_model = current_model.cuda() 122 | target_model = target_model.cuda() 123 | 124 | update_target(current_model, target_model) 125 | 126 | optimizer = optim.Adam(current_model.parameters()) 127 | ############################################################# 128 | replay_buffer = ReplayBuffer(1000) 129 | 130 | batch_size = 32 131 | gamma = 0.99 132 | 133 | num_frames = 10000 134 | 135 | losses = [] 136 | all_rewards = [] 137 | x_axis1 = [] 138 | x_axis2 = [] 139 | episode_reward = 0 140 | 141 | epsilon_start = 1.0 142 | epsilon_final = 0.01 143 | epsilon_decay = 500 144 | 145 | #要求探索率随着迭代次数增加而减小 146 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. * frame_idx / epsilon_decay) 147 | 148 | state = env.reset() 149 | for frame_idx in range(1, num_frames + 1): 150 | #显示动画 151 | #env.render() 152 | epsilon = epsilon_by_frame(frame_idx) 153 | action = current_model.act(state, epsilon) 154 | next_state, reward, done, _ = env.step(action) 155 | replay_buffer.push(state, action, reward, next_state, done) 156 | state = next_state 157 | episode_reward += reward 158 | 159 | if done: 160 | state = env.reset() 161 | x_axis1.append(frame_idx) 162 | all_rewards.append(episode_reward) 163 | episode_reward = 0 164 | 165 | if frame_idx+1 > batch_size: 166 | x_axis2.append(frame_idx) 167 | loss = compute_td_loss(current_model, optimizer, replay_buffer, gamma, batch_size) 168 | losses.append(np.array(loss.data.cpu())) 169 | 170 | #########################改动之处############################# 171 | if frame_idx % 100 == 0: 172 | update_target(current_model, target_model) 173 | 174 | 175 | 176 | if frame_idx % 200 == 0: 177 | plt.figure(1) 178 | plt.subplot(121) 179 | plt.plot(x_axis1, all_rewards) 180 | plt.subplot(122) 181 | plt.plot(x_axis2, losses) 182 | plt.show() 183 | 184 | 185 | if __name__ == '__main__': 186 | main() -------------------------------------------------------------------------------- /Discrete_action/DQN.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Feb 26 2021 3 | @author: wangmeng 4 | """ 5 | 6 | import math, random 7 | import gym 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.autograd as autograd 13 | import torch.nn.functional as F 14 | from collections import Counter 15 | from collections import deque 16 | import matplotlib.pyplot as plt 17 | 18 | USE_CUDA = 
torch.cuda.is_available() 19 | #将变量放到cuda上 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class DQN(nn.Module): 23 | def __init__(self, observation_space, action_sapce): 24 | super(DQN, self).__init__() 25 | 26 | self.observation_space = observation_space 27 | self.action_sapce = action_sapce 28 | 29 | self.layers = nn.Sequential( 30 | nn.Linear(observation_space,128), 31 | nn.ReLU(), 32 | nn.Linear(128,128), 33 | nn.ReLU(), 34 | nn.Linear(128, action_sapce) 35 | ) 36 | 37 | 38 | def forward(self, x): 39 | return self.layers(x) 40 | 41 | def act(self, state, epsilon): 42 | if random.random() > epsilon: 43 | #如果使用的是GPU,这里需要把数据丢到GPU上 44 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)#volatile的作用是作为指令关键字,确保本条指令不会因编译器的优化而省略,且要求每次直接读值。 45 | #.squeeze() 把数据条目中维度为1 的删除掉 46 | q_value = self.forward(state) 47 | action = q_value.max(1)[1].data[0] 48 | #max(1)返回每一行中最大值的那个元素,且返回其索引,max(0)是列 49 | #max()[1]只返回最大值的每个索引,max()[0], 只返回最大值的每个数 50 | 51 | action = action.cpu().numpy()#从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 52 | action =int(action) 53 | else: 54 | action = random.randrange(self.action_sapce)#返回指定递增基数集合中的一个随机数,基数默认值为1。 55 | return action 56 | 57 | class ReplayBuffer(object): 58 | def __init__(self, capacity): 59 | #deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 60 | #可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 61 | self.buffer = deque(maxlen = capacity) 62 | 63 | def push (self, state, aciton, reward, next_state, done): 64 | state = np.expand_dims(state,0) 65 | #这里增加维度的操作是为了便于之后使用concatenate进行拼接 66 | next_state = np.expand_dims(next_state,0) 67 | self.buffer.append((state, aciton, reward, next_state, done)) 68 | 69 | def sample(self, batch_size): 70 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 71 | state , action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 72 | #最后使用concatenate对数组进行拼接,相当于少了一个维度 73 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 74 | 75 | 76 | def compute_td_loss(model,optimizer, replay_buffer, gamma, batch_size): 77 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 78 | #通通丢到GPU上去 79 | state = Variable(torch.FloatTensor(np.float32(state))) 80 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True) 81 | action = Variable(torch.LongTensor(action)) 82 | reward = Variable(torch.FloatTensor(reward)) 83 | done = Variable(torch.FloatTensor(done)) 84 | 85 | q_values = model(state) 86 | next_q_values = model(next_state) 87 | 88 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 89 | #gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 90 | #dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 91 | #这里增加维度主要是为了方便gather操作,之后再删除该维度 92 | next_q_value = next_q_values.max(1)[0] 93 | 94 | expected_q_value = reward + gamma * next_q_value * (1 - done) 95 | 96 | loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() 97 | 98 | optimizer.zero_grad() 99 | loss.backward() 100 | optimizer.step() 101 | 102 | return loss 103 | 104 | 105 | 106 | 107 | def main(): 108 | env_id = "CartPole-v0" 109 | env = gym.make(env_id) 110 | 111 | observation_space = env.observation_space.shape[0] 112 | action_sapce = env.action_space.n 113 | 114 | model = DQN (observation_space, action_sapce) 115 | 116 | if USE_CUDA: 117 | model = model.cuda() 118 | 119 | optimizer = optim.Adam(model.parameters()) 120 | 121 | 
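    # Adam is created with its default learning rate (1e-3) here; it can be set explicitly,
    # e.g. optim.Adam(model.parameters(), lr=1e-4), which is a common choice for DQN-style training.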
replay_buffer = ReplayBuffer(1000) 122 | 123 | batch_size = 32 124 | gamma = 0.99 125 | 126 | num_frames = 10000 127 | 128 | losses = [] 129 | all_rewards = [] 130 | x_axis1 = [] 131 | x_axis2 = [] 132 | episode_reward = 0 133 | 134 | epsilon_start = 1.0 135 | epsilon_final = 0.01 136 | epsilon_decay = 500 137 | 138 | #要求探索率随着迭代次数增加而减小 139 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. * frame_idx / epsilon_decay) 140 | 141 | state = env.reset() 142 | for frame_idx in range(1, num_frames + 1): 143 | #显示动画 144 | #env.render() 145 | epsilon = epsilon_by_frame(frame_idx) 146 | action = model.act(state, epsilon) 147 | next_state, reward, done, _ = env.step(action) 148 | replay_buffer.push(state, action, reward, next_state, done) 149 | state = next_state 150 | episode_reward += reward 151 | 152 | if done: 153 | state = env.reset() 154 | x_axis1.append(frame_idx) 155 | all_rewards.append(episode_reward) 156 | episode_reward = 0 157 | 158 | if frame_idx+1 > batch_size: 159 | x_axis2.append(frame_idx) 160 | loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size) 161 | losses.append(np.array(loss.data.cpu())) 162 | 163 | 164 | 165 | if frame_idx % 200 == 0: 166 | plt.figure(1) 167 | plt.subplot(121) 168 | plt.plot(x_axis1, all_rewards) 169 | plt.subplot(122) 170 | plt.plot(x_axis2, losses) 171 | plt.show() 172 | 173 | 174 | if __name__ == '__main__': 175 | main() -------------------------------------------------------------------------------- /Discrete_action/Noise DQN.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mae 5 2021 3 | @author: wangmeng 4 | 5 | NoisyNet 一种将参数化的噪音加入到神经网络权重上去的方法来增加强化学习中的探索,称为 NoisyNet 6 | 噪音的参数可以通过梯度来进行学习,非常容易就能实现,而且只增加了一点计算量,在 A3C ,DQN 算法上效果不错。 7 | NoisyNet 的思想很简单,就是在神经网络的权重上增加一些噪音来起到探索的目的。 8 | """ 9 | 10 | import math, random 11 | import gym 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | import torch.optim as optim 16 | import torch.autograd as autograd 17 | import torch.nn.functional as F 18 | from collections import Counter 19 | from collections import deque 20 | import matplotlib.pyplot as plt 21 | from replay_buffer import * 22 | 23 | USE_CUDA = torch.cuda.is_available() 24 | #将变量放到cuda上 25 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 26 | 27 | #定义一个添加噪声的网络层 28 | class NoisyLinear(nn.Module): 29 | def __init__(self, in_features, out_features, std_init=0.4): 30 | super(NoisyLinear, self).__init__() 31 | 32 | self.in_features = in_features 33 | self.out_features = out_features 34 | self.std_init = std_init 35 | 36 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 37 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 38 | #向模块添加持久缓冲区,这通常用于注册不应被视为模型参数的缓冲区。例如,BatchNorm的running_mean不是一个参数,而是持久状态的一部分。 39 | #缓冲区可以使用给定的名称作为属性访问。 40 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 41 | 42 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 43 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features)) 44 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 45 | 46 | self.reset_parameters() 47 | self.reset_noise() 48 | 49 | def forward(self,x): 50 | if self.training: 51 | weight = self.weight_mu + self.weight_sigma.mul(Variable(self.weight_epsilon)) 52 | bias = self.bias_mu + 
self.bias_sigma.mul(Variable(self.bias_epsilon)) 53 | else: 54 | weight = self.weight_mu 55 | bias = self.bias_mu 56 | return F.linear(x, weight, bias) 57 | 58 | def reset_parameters(self): 59 | mu_range = 1 / math.sqrt(self.weight_mu.size(1)) 60 | 61 | self.weight_mu.data.uniform_(-mu_range, mu_range) 62 | self.weight_sigma.data.uniform_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 63 | 64 | self.bias_mu.data.uniform_(-mu_range, mu_range) 65 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 66 | 67 | def reset_noise(self): 68 | epsilon_in = self._scale_noise(self.in_features) 69 | epsilon_out = self._scale_noise(self.out_features) 70 | 71 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 72 | self.bias_epsilon.copy_(self._scale_noise(self.out_features)) 73 | 74 | def _scale_noise(self, size): 75 | x = torch.randn(size) 76 | x = x.sign().mul(x.abs().sqrt()) 77 | return x 78 | 79 | class NoisyDQN(nn.Module): 80 | def __init__(self, observation_space, action_sapce): 81 | super(NoisyDQN, self).__init__() 82 | 83 | self.linear = nn.Linear(observation_space, 128) 84 | self.noisy1 = NoisyLinear(128, 128) 85 | self.noisy2 = NoisyLinear(128, action_sapce) 86 | def forward(self, x): 87 | x = F.relu(self.linear(x)) 88 | x = F.relu(self.noisy1(x)) 89 | x = self.noisy2(x) 90 | return x 91 | def act(self, state): 92 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile = True) 93 | q_value = self.forward(state) 94 | action = q_value.max(1)[1].data[0] 95 | action = action.cpu().numpy() # 从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 96 | action = int(action) 97 | return action 98 | 99 | def reset_noise(self): 100 | self.noisy1.reset_noise() 101 | self.noisy2.reset_noise() 102 | 103 | 104 | class ReplayBuffer(object): 105 | def __init__(self, capacity): 106 | #deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 107 | #可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 108 | self.buffer = deque(maxlen = capacity) 109 | 110 | def push (self, state, aciton, reward, next_state, done): 111 | state = np.expand_dims(state,0) 112 | #这里增加维度的操作是为了便于之后使用concatenate进行拼接 113 | next_state = np.expand_dims(next_state,0) 114 | self.buffer.append((state, aciton, reward, next_state, done)) 115 | 116 | def sample(self, batch_size): 117 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 118 | state , action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 119 | #最后使用concatenate对数组进行拼接,相当于少了一个维度 120 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 121 | 122 | 123 | def compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta): 124 | state, action, reward, next_state, done, weights, indices = replay_buffer.sample(batch_size, beta) 125 | 126 | state = Variable(torch.FloatTensor(np.float32(state))) 127 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) 128 | action = Variable(torch.LongTensor(action)) 129 | reward = Variable(torch.FloatTensor(reward)) 130 | done = Variable(torch.FloatTensor(np.float32(done))) 131 | weights = Variable(torch.FloatTensor(weights)) 132 | 133 | q_values = current_model(state) 134 | next_q_values = target_model(next_state) 135 | 136 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 137 | #gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 138 | #dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 139 | #这里增加维度主要是为了方便gather操作,之后再删除该维度 140 | next_q_value = next_q_values.max(1)[0] 141 | 142 | 
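    # TD target computed below: r + gamma * max_a' Q_target(s', a'), with the bootstrap term masked out at terminal states.
    # (This is the standard DQN target; a Double-DQN variant would select a' with current_model and evaluate it with target_model.)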
expected_q_value = reward + gamma * next_q_value * (1 - done) 143 | 144 | loss = (q_value - expected_q_value.detach()).pow(2) * weights 145 | prios = loss + 1e-5 146 | loss = loss.mean() 147 | 148 | optimizer.zero_grad() 149 | loss.backward() 150 | optimizer.step() 151 | 152 | replay_buffer.update_priorities(indices, prios.data.cpu().numpy()) 153 | current_model.reset_noise() 154 | target_model.reset_noise() 155 | 156 | return loss 157 | 158 | def update_target(current_model, target_model): 159 | target_model.load_state_dict(current_model.state_dict())#加载模型 160 | 161 | def plot(frame_idx, rewards, losses): 162 | plt.figure(figsize=(20,5)) 163 | plt.subplot(131) 164 | plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:]))) 165 | plt.plot(rewards) 166 | plt.subplot(132) 167 | plt.title('loss') 168 | plt.plot(losses) 169 | plt.show() 170 | 171 | def main(): 172 | env_id = "CartPole-v0" 173 | env = gym.make(env_id) 174 | 175 | observation_space = env.observation_space.shape[0] 176 | action_sapce = env.action_space.n 177 | 178 | current_model = NoisyDQN(observation_space, action_sapce) 179 | target_model = NoisyDQN(observation_space, action_sapce) 180 | 181 | if USE_CUDA: 182 | current_model = current_model.cuda() 183 | target_model = target_model.cuda() 184 | 185 | optimizer = optim.Adam(current_model.parameters()) 186 | 187 | beta_start = 0.4 188 | beta_frames = 1000 189 | beta_by_frame = lambda frame_idx: min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames) 190 | 191 | 192 | replay_buffer = PrioritizedReplayBuffer(10000, alpha=0.6) 193 | 194 | update_target(current_model, target_model) 195 | 196 | num_frames = 10000 197 | batch_size = 32 198 | gamma = 0.99 199 | 200 | losses = [] 201 | all_rewards = [] 202 | episode_reward = 0 203 | 204 | state = env.reset() 205 | for frame_idx in range(1, num_frames + 1): 206 | #显示动画 207 | #env.render() 208 | action = current_model.act(state) 209 | 210 | next_state, reward, done, _ = env.step(action) 211 | replay_buffer.push(state, action, reward, next_state, done) 212 | 213 | state = next_state 214 | episode_reward += reward 215 | 216 | if done: 217 | state = env.reset() 218 | all_rewards.append(episode_reward) 219 | episode_reward = 0 220 | 221 | if len(replay_buffer) > batch_size: 222 | beta = beta_by_frame(frame_idx) 223 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta) 224 | losses.append(np.array(loss.data.cpu())) 225 | 226 | if frame_idx % 200 == 0: 227 | plot(frame_idx, all_rewards, losses) 228 | 229 | if frame_idx % 1000 == 0: 230 | update_target(current_model, target_model) 231 | 232 | 233 | if __name__ == '__main__': 234 | main() -------------------------------------------------------------------------------- /Discrete_action/__pycache__/multiprocessing_env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mengwanglalala/RL-algorithms/97f5b3e3b570ecb3c88ecf5f1ade148552103071/Discrete_action/__pycache__/multiprocessing_env.cpython-37.pyc -------------------------------------------------------------------------------- /Discrete_action/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mengwanglalala/RL-algorithms/97f5b3e3b570ecb3c88ecf5f1ade148552103071/Discrete_action/__pycache__/replay_buffer.cpython-37.pyc 
-------------------------------------------------------------------------------- /Discrete_action/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | # This code is from openai baseline 2 | # https://github.com/openai/baselines/tree/master/baselines/common/vec_env 3 | 4 | import numpy as np 5 | from multiprocessing import Process, Pipe 6 | 7 | 8 | def worker(remote, parent_remote, env_fn_wrapper): 9 | parent_remote.close() 10 | env = env_fn_wrapper.x() 11 | while True: 12 | cmd, data = remote.recv() 13 | if cmd == 'step': 14 | ob, reward, done, info = env.step(data) 15 | if done: 16 | ob = env.reset() 17 | remote.send((ob, reward, done, info)) 18 | elif cmd == 'reset': 19 | ob = env.reset() 20 | remote.send(ob) 21 | elif cmd == 'reset_task': 22 | ob = env.reset_task() 23 | remote.send(ob) 24 | elif cmd == 'close': 25 | remote.close() 26 | break 27 | elif cmd == 'get_spaces': 28 | remote.send((env.observation_space, env.action_space)) 29 | else: 30 | raise NotImplementedError 31 | 32 | 33 | class VecEnv(object): 34 | """ 35 | An abstract asynchronous, vectorized environment. 36 | """ 37 | 38 | def __init__(self, num_envs, observation_space, action_space): 39 | self.num_envs = num_envs 40 | self.observation_space = observation_space 41 | self.action_space = action_space 42 | 43 | def reset(self): 44 | """ 45 | Reset all the environments and return an array of 46 | observations, or a tuple of observation arrays. 47 | If step_async is still doing work, that work will 48 | be cancelled and step_wait() should not be called 49 | until step_async() is invoked again. 50 | """ 51 | pass 52 | 53 | def step_async(self, actions): 54 | """ 55 | Tell all the environments to start taking a step 56 | with the given actions. 57 | Call step_wait() to get the results of the step. 58 | You should not call this if a step_async run is 59 | already pending. 60 | """ 61 | pass 62 | 63 | def step_wait(self): 64 | """ 65 | Wait for the step taken with step_async(). 66 | Returns (obs, rews, dones, infos): 67 | - obs: an array of observations, or a tuple of 68 | arrays of observations. 69 | - rews: an array of rewards 70 | - dones: an array of "episode done" booleans 71 | - infos: a sequence of info objects 72 | """ 73 | pass 74 | 75 | def close(self): 76 | """ 77 | Clean up the environments' resources. 
78 | """ 79 | pass 80 | 81 | def step(self, actions): 82 | self.step_async(actions) 83 | return self.step_wait() 84 | 85 | 86 | class CloudpickleWrapper(object): 87 | """ 88 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 89 | """ 90 | 91 | def __init__(self, x): 92 | self.x = x 93 | 94 | def __getstate__(self): 95 | import cloudpickle 96 | return cloudpickle.dumps(self.x) 97 | 98 | def __setstate__(self, ob): 99 | import pickle 100 | self.x = pickle.loads(ob) 101 | 102 | 103 | class SubprocVecEnv(VecEnv): 104 | def __init__(self, env_fns, spaces=None): 105 | """ 106 | envs: list of gym environments to run in subprocesses 107 | """ 108 | self.waiting = False 109 | self.closed = False 110 | nenvs = len(env_fns) 111 | self.nenvs = nenvs 112 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 113 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 114 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 115 | for p in self.ps: 116 | p.daemon = True # if the main process crashes, we should not cause things to hang 117 | p.start() 118 | for remote in self.work_remotes: 119 | remote.close() 120 | 121 | self.remotes[0].send(('get_spaces', None)) 122 | observation_space, action_space = self.remotes[0].recv() 123 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 124 | 125 | def step_async(self, actions): 126 | for remote, action in zip(self.remotes, actions): 127 | remote.send(('step', action)) 128 | self.waiting = True 129 | 130 | def step_wait(self): 131 | results = [remote.recv() for remote in self.remotes] 132 | self.waiting = False 133 | obs, rews, dones, infos = zip(*results) 134 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 135 | 136 | def reset(self): 137 | for remote in self.remotes: 138 | remote.send(('reset', None)) 139 | return np.stack([remote.recv() for remote in self.remotes]) 140 | 141 | def reset_task(self): 142 | for remote in self.remotes: 143 | remote.send(('reset_task', None)) 144 | return np.stack([remote.recv() for remote in self.remotes]) 145 | 146 | def close(self): 147 | if self.closed: 148 | return 149 | if self.waiting: 150 | for remote in self.remotes: 151 | remote.recv() 152 | for remote in self.remotes: 153 | remote.send(('close', None)) 154 | for p in self.ps: 155 | p.join() 156 | self.closed = True 157 | 158 | def __len__(self): 159 | return self.nenvs -------------------------------------------------------------------------------- /Discrete_action/replay_buffer.py: -------------------------------------------------------------------------------- 1 | #code from openai 2 | #https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 3 | 4 | import numpy as np 5 | import random 6 | 7 | import operator 8 | 9 | 10 | class SegmentTree(object): 11 | def __init__(self, capacity, operation, neutral_element): 12 | """Build a Segment Tree data structure. 13 | https://en.wikipedia.org/wiki/Segment_tree 14 | Can be used as regular array, but with two 15 | important differences: 16 | a) setting item's value is slightly slower. 17 | It is O(lg capacity) instead of O(1). 18 | b) user has access to an efficient `reduce` 19 | operation which reduces `operation` over 20 | a contiguous subsequence of items in the 21 | array. 22 | Paramters 23 | --------- 24 | capacity: int 25 | Total size of the array - must be a power of two. 
26 | operation: lambda obj, obj -> obj 27 | and operation for combining elements (eg. sum, max) 28 | must for a mathematical group together with the set of 29 | possible values for array elements. 30 | neutral_element: obj 31 | neutral element for the operation above. eg. float('-inf') 32 | for max and 0 for sum. 33 | """ 34 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 35 | self._capacity = capacity 36 | self._value = [neutral_element for _ in range(2 * capacity)] 37 | self._operation = operation 38 | 39 | def _reduce_helper(self, start, end, node, node_start, node_end): 40 | if start == node_start and end == node_end: 41 | return self._value[node] 42 | mid = (node_start + node_end) // 2 43 | if end <= mid: 44 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 45 | else: 46 | if mid + 1 <= start: 47 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 48 | else: 49 | return self._operation( 50 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 51 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 52 | ) 53 | 54 | def reduce(self, start=0, end=None): 55 | """Returns result of applying `self.operation` 56 | to a contiguous subsequence of the array. 57 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 108 | if array values are probabilities, this function 109 | allows to sample indexes according to the discrete 110 | probability efficiently. 
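        Example: for leaf values [0.1, 0.3, 0.6, 0.0] and prefixsum=0.35 this returns index 1.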
111 | Parameters 112 | ---------- 113 | perfixsum: float 114 | upperbound on the sum of array prefix 115 | Returns 116 | ------- 117 | idx: int 118 | highest index satisfying the prefixsum constraint 119 | """ 120 | assert 0 <= prefixsum <= self.sum() + 1e-5 121 | idx = 1 122 | while idx < self._capacity: # while non-leaf 123 | if self._value[2 * idx] > prefixsum: 124 | idx = 2 * idx 125 | else: 126 | prefixsum -= self._value[2 * idx] 127 | idx = 2 * idx + 1 128 | return idx - self._capacity 129 | 130 | 131 | class MinSegmentTree(SegmentTree): 132 | def __init__(self, capacity): 133 | super(MinSegmentTree, self).__init__( 134 | capacity=capacity, 135 | operation=min, 136 | neutral_element=float('inf') 137 | ) 138 | 139 | def min(self, start=0, end=None): 140 | """Returns min(arr[start], ..., arr[end])""" 141 | 142 | return super(MinSegmentTree, self).reduce(start, end) 143 | 144 | 145 | class ReplayBuffer(object): 146 | def __init__(self, size): 147 | """Create Replay buffer. 148 | Parameters 149 | ---------- 150 | size: int 151 | Max number of transitions to store in the buffer. When the buffer 152 | overflows the old memories are dropped. 153 | """ 154 | self._storage = [] 155 | self._maxsize = size 156 | self._next_idx = 0 157 | 158 | def __len__(self): 159 | return len(self._storage) 160 | 161 | def push(self, state, action, reward, next_state, done): 162 | data = (state, action, reward, next_state, done) 163 | 164 | if self._next_idx >= len(self._storage): 165 | self._storage.append(data) 166 | else: 167 | self._storage[self._next_idx] = data 168 | self._next_idx = (self._next_idx + 1) % self._maxsize 169 | 170 | def _encode_sample(self, idxes): 171 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 172 | for i in idxes: 173 | data = self._storage[i] 174 | obs_t, action, reward, obs_tp1, done = data 175 | obses_t.append(np.array(obs_t, copy=False)) 176 | actions.append(np.array(action, copy=False)) 177 | rewards.append(reward) 178 | obses_tp1.append(np.array(obs_tp1, copy=False)) 179 | dones.append(done) 180 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 181 | 182 | def sample(self, batch_size): 183 | """Sample a batch of experiences. 184 | Parameters 185 | ---------- 186 | batch_size: int 187 | How many transitions to sample. 188 | Returns 189 | ------- 190 | obs_batch: np.array 191 | batch of observations 192 | act_batch: np.array 193 | batch of actions executed given obs_batch 194 | rew_batch: np.array 195 | rewards received as results of executing act_batch 196 | next_obs_batch: np.array 197 | next set of observations seen after executing act_batch 198 | done_mask: np.array 199 | done_mask[i] = 1 if executing act_batch[i] resulted in 200 | the end of an episode and 0 otherwise. 201 | """ 202 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 203 | return self._encode_sample(idxes) 204 | 205 | 206 | class PrioritizedReplayBuffer(ReplayBuffer): 207 | def __init__(self, size, alpha): 208 | """Create Prioritized Replay buffer. 209 | Parameters 210 | ---------- 211 | size: int 212 | Max number of transitions to store in the buffer. When the buffer 213 | overflows the old memories are dropped. 
214 | alpha: float 215 | how much prioritization is used 216 | (0 - no prioritization, 1 - full prioritization) 217 | See Also 218 | -------- 219 | ReplayBuffer.__init__ 220 | """ 221 | super(PrioritizedReplayBuffer, self).__init__(size) 222 | assert alpha > 0 223 | self._alpha = alpha 224 | 225 | it_capacity = 1 226 | while it_capacity < size: 227 | it_capacity *= 2 228 | 229 | self._it_sum = SumSegmentTree(it_capacity) 230 | self._it_min = MinSegmentTree(it_capacity) 231 | self._max_priority = 1.0 232 | 233 | def push(self, *args, **kwargs): 234 | """See ReplayBuffer.store_effect""" 235 | idx = self._next_idx 236 | super(PrioritizedReplayBuffer, self).push(*args, **kwargs) 237 | self._it_sum[idx] = self._max_priority ** self._alpha 238 | self._it_min[idx] = self._max_priority ** self._alpha 239 | 240 | def _sample_proportional(self, batch_size): 241 | res = [] 242 | for _ in range(batch_size): 243 | # TODO(szymon): should we ensure no repeats? 244 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 245 | idx = self._it_sum.find_prefixsum_idx(mass) 246 | res.append(idx) 247 | return res 248 | 249 | def sample(self, batch_size, beta): 250 | """Sample a batch of experiences. 251 | compared to ReplayBuffer.sample 252 | it also returns importance weights and idxes 253 | of sampled experiences. 254 | Parameters 255 | ---------- 256 | batch_size: int 257 | How many transitions to sample. 258 | beta: float 259 | To what degree to use importance weights 260 | (0 - no corrections, 1 - full correction) 261 | Returns 262 | ------- 263 | obs_batch: np.array 264 | batch of observations 265 | act_batch: np.array 266 | batch of actions executed given obs_batch 267 | rew_batch: np.array 268 | rewards received as results of executing act_batch 269 | next_obs_batch: np.array 270 | next set of observations seen after executing act_batch 271 | done_mask: np.array 272 | done_mask[i] = 1 if executing act_batch[i] resulted in 273 | the end of an episode and 0 otherwise. 274 | weights: np.array 275 | Array of shape (batch_size,) and dtype np.float32 276 | denoting importance weight of each sampled transition 277 | idxes: np.array 278 | Array of shape (batch_size,) and dtype np.int32 279 | idexes in buffer of sampled experiences 280 | """ 281 | assert beta > 0 282 | 283 | idxes = self._sample_proportional(batch_size) 284 | 285 | weights = [] 286 | p_min = self._it_min.min() / self._it_sum.sum() 287 | max_weight = (p_min * len(self._storage)) ** (-beta) 288 | 289 | for idx in idxes: 290 | p_sample = self._it_sum[idx] / self._it_sum.sum() 291 | weight = (p_sample * len(self._storage)) ** (-beta) 292 | weights.append(weight / max_weight) 293 | weights = np.array(weights) 294 | encoded_sample = self._encode_sample(idxes) 295 | return tuple(list(encoded_sample) + [weights, idxes]) 296 | 297 | def update_priorities(self, idxes, priorities): 298 | """Update priorities of sampled transitions. 299 | sets priority of transition at index idxes[i] in buffer 300 | to priorities[i]. 301 | Parameters 302 | ---------- 303 | idxes: [int] 304 | List of idxes of sampled transitions 305 | priorities: [float] 306 | List of updated priorities corresponding to 307 | transitions at the sampled idxes denoted by 308 | variable `idxes`. 
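        The running maximum priority (used to initialize newly pushed transitions) is updated as well.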
309 | """ 310 | assert len(idxes) == len(priorities) 311 | for idx, priority in zip(idxes, priorities): 312 | assert priority > 0 313 | assert 0 <= idx < len(self._storage) 314 | self._it_sum[idx] = priority ** self._alpha 315 | self._it_min[idx] = priority ** self._alpha 316 | 317 | self._max_priority = max(self._max_priority, priority) -------------------------------------------------------------------------------- /Discrete_action/wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | from gym import spaces 5 | import cv2 6 | cv2.ocl.setUseOpenCL(False) 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | gym.Wrapper.__init__(self, env) 14 | self.noop_max = noop_max 15 | self.override_num_noops = None 16 | self.noop_action = 0 17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 18 | 19 | def reset(self, **kwargs): 20 | """ Do no-op action for a number of steps in [1, noop_max].""" 21 | self.env.reset(**kwargs) 22 | if self.override_num_noops is not None: 23 | noops = self.override_num_noops 24 | else: 25 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 26 | assert noops > 0 27 | obs = None 28 | for _ in range(noops): 29 | obs, _, done, _ = self.env.step(self.noop_action) 30 | if done: 31 | obs = self.env.reset(**kwargs) 32 | return obs 33 | 34 | def step(self, ac): 35 | return self.env.step(ac) 36 | 37 | class FireResetEnv(gym.Wrapper): 38 | def __init__(self, env): 39 | """Take action on reset for environments that are fixed until firing.""" 40 | gym.Wrapper.__init__(self, env) 41 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 42 | assert len(env.unwrapped.get_action_meanings()) >= 3 43 | 44 | def reset(self, **kwargs): 45 | self.env.reset(**kwargs) 46 | obs, _, done, _ = self.env.step(1) 47 | if done: 48 | self.env.reset(**kwargs) 49 | obs, _, done, _ = self.env.step(2) 50 | if done: 51 | self.env.reset(**kwargs) 52 | return obs 53 | 54 | def step(self, ac): 55 | return self.env.step(ac) 56 | 57 | class EpisodicLifeEnv(gym.Wrapper): 58 | def __init__(self, env): 59 | """Make end-of-life == end-of-episode, but only reset on true game over. 60 | Done by DeepMind for the Discrete_action and co. since it helps value estimation. 61 | """ 62 | gym.Wrapper.__init__(self, env) 63 | self.lives = 0 64 | self.was_real_done = True 65 | 66 | def step(self, action): 67 | obs, reward, done, info = self.env.step(action) 68 | self.was_real_done = done 69 | # check current lives, make loss of life terminal, 70 | # then update lives to handle bonus lives 71 | lives = self.env.unwrapped.ale.lives() 72 | if lives < self.lives and lives > 0: 73 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 74 | # so its important to keep lives > 0, so that we only reset once 75 | # the environment advertises done. 76 | done = True 77 | self.lives = lives 78 | return obs, reward, done, info 79 | 80 | def reset(self, **kwargs): 81 | """Reset only when lives are exhausted. 82 | This way all states are still reachable even though lives are episodic, 83 | and the learner need not know about any of this behind-the-scenes. 
84 | """ 85 | if self.was_real_done: 86 | obs = self.env.reset(**kwargs) 87 | else: 88 | # no-op step to advance from terminal/lost life state 89 | obs, _, _, _ = self.env.step(0) 90 | self.lives = self.env.unwrapped.ale.lives() 91 | return obs 92 | 93 | class MaxAndSkipEnv(gym.Wrapper): 94 | def __init__(self, env, skip=4): 95 | """Return only every `skip`-th frame""" 96 | gym.Wrapper.__init__(self, env) 97 | # most recent raw observations (for max pooling across time steps) 98 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 99 | self._skip = skip 100 | 101 | def reset(self): 102 | return self.env.reset() 103 | 104 | def step(self, action): 105 | """Repeat action, sum reward, and max over last observations.""" 106 | total_reward = 0.0 107 | done = None 108 | for i in range(self._skip): 109 | obs, reward, done, info = self.env.step(action) 110 | if i == self._skip - 2: self._obs_buffer[0] = obs 111 | if i == self._skip - 1: self._obs_buffer[1] = obs 112 | total_reward += reward 113 | if done: 114 | break 115 | # Note that the observation on the done=True frame 116 | # doesn't matter 117 | max_frame = self._obs_buffer.max(axis=0) 118 | 119 | return max_frame, total_reward, done, info 120 | 121 | def reset(self, **kwargs): 122 | return self.env.reset(**kwargs) 123 | 124 | class ClipRewardEnv(gym.RewardWrapper): 125 | def __init__(self, env): 126 | gym.RewardWrapper.__init__(self, env) 127 | 128 | def reward(self, reward): 129 | """Bin reward to {+1, 0, -1} by its sign.""" 130 | return np.sign(reward) 131 | 132 | class WarpFrame(gym.ObservationWrapper): 133 | def __init__(self, env): 134 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 135 | gym.ObservationWrapper.__init__(self, env) 136 | self.width = 84 137 | self.height = 84 138 | self.observation_space = spaces.Box(low=0, high=255, 139 | shape=(self.height, self.width, 1), dtype=np.uint8) 140 | 141 | def observation(self, frame): 142 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 143 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 144 | return frame[:, :, None] 145 | 146 | class FrameStack(gym.Wrapper): 147 | def __init__(self, env, k): 148 | """Stack k last frames. 149 | Returns lazy array, which is much more memory efficient. 150 | See Also 151 | -------- 152 | baselines.common.atari_wrappers.LazyFrames 153 | """ 154 | gym.Wrapper.__init__(self, env) 155 | self.k = k 156 | self.frames = deque([], maxlen=k) 157 | shp = env.observation_space.shape 158 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 159 | 160 | def reset(self): 161 | ob = self.env.reset() 162 | for _ in range(self.k): 163 | self.frames.append(ob) 164 | return self._get_ob() 165 | 166 | def step(self, action): 167 | ob, reward, done, info = self.env.step(action) 168 | self.frames.append(ob) 169 | return self._get_ob(), reward, done, info 170 | 171 | def _get_ob(self): 172 | assert len(self.frames) == self.k 173 | return LazyFrames(list(self.frames)) 174 | 175 | class ScaledFloatFrame(gym.ObservationWrapper): 176 | def __init__(self, env): 177 | gym.ObservationWrapper.__init__(self, env) 178 | 179 | def observation(self, observation): 180 | # careful! This undoes the memory optimization, use 181 | # with smaller replay buffers only. 
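        # (the division below converts uint8 pixels in [0, 255] to float32 in [0.0, 1.0], so each stored frame takes 4x the memory)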
182 | return np.array(observation).astype(np.float32) / 255.0 183 | 184 | class LazyFrames(object): 185 | def __init__(self, frames): 186 | """This object ensures that common frames between the observations are only stored once. 187 | It exists purely to optimize memory usage which can be huge for Discrete_action's 1M frames replay 188 | buffers. 189 | This object should only be converted to numpy array before being passed to the model. 190 | You'd not believe how complex the previous solution was.""" 191 | self._frames = frames 192 | self._out = None 193 | 194 | def _force(self): 195 | if self._out is None: 196 | self._out = np.concatenate(self._frames, axis=2) 197 | self._frames = None 198 | return self._out 199 | 200 | def __array__(self, dtype=None): 201 | out = self._force() 202 | if dtype is not None: 203 | out = out.astype(dtype) 204 | return out 205 | 206 | def __len__(self): 207 | return len(self._force()) 208 | 209 | def __getitem__(self, i): 210 | return self._force()[i] 211 | 212 | def make_atari(env_id): 213 | env = gym.make(env_id) 214 | assert 'NoFrameskip' in env.spec.id 215 | env = NoopResetEnv(env, noop_max=30) 216 | env = MaxAndSkipEnv(env, skip=4) 217 | return env 218 | 219 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 220 | """Configure environment for DeepMind-style Atari. 221 | """ 222 | if episode_life: 223 | env = EpisodicLifeEnv(env) 224 | if 'FIRE' in env.unwrapped.get_action_meanings(): 225 | env = FireResetEnv(env) 226 | env = WarpFrame(env) 227 | if scale: 228 | env = ScaledFloatFrame(env) 229 | if clip_rewards: 230 | env = ClipRewardEnv(env) 231 | if frame_stack: 232 | env = FrameStack(env, 4) 233 | return env 234 | 235 | 236 | 237 | class ImageToPyTorch(gym.ObservationWrapper): 238 | """ 239 | Image shape to num_channels x weight x height 240 | """ 241 | def __init__(self, env): 242 | super(ImageToPyTorch, self).__init__(env) 243 | old_shape = self.observation_space.shape 244 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.uint8) 245 | 246 | def observation(self, observation): 247 | return np.swapaxes(observation, 2, 0) 248 | 249 | 250 | def wrap_pytorch(env): 251 | return ImageToPyTorch(env) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL-algorithms 2 | 更新一些基础的RL代码 3 | - [离散的动作空间](#离散的动作空间) 4 | - [DQN](#DQN) 5 | - [DDQN](#DDQN) 6 | - [Dueling-DQN](#Dueling-DQN) 7 | - [D3QN](#D3QN) 8 | - [Noisy-DQN](#Noisy-DQN) 9 | 10 | - [连续的动作空间](#连续的动作空间) 11 | - [DDPG](#DDPG) 12 | - [A3C](#A3C) 13 | - [PPO](#PPO) 14 | - [PPO+GAE](#PPO+GAE) 15 | - [SAC](#SAC) 16 | - [TD3](#TD3) 17 | 18 | 19 | 20 | ## 离散的动作空间 21 | ### DQN 22 | 可用于入门深度强化学习,使用一个Q Network来估计Q值,从而替换了 Q-table,完成从离散状态空间到连续状态空间的跨越。Q Network 会对每一个离散动作的Q值进行估计,执行的时候选择Q值最高的动作(greedy 策略)。并使用 epslion-greedy 策略进行探索(探索的时候,有很小的概率随机执行动作),来获得各种动作的训练数据 23 | 24 | ### DDQN 25 | (Double DQN)更加稳定,因为最优化操作会传播高估误差,所以她同时训练两个Q network并选择较小的Q值用于计算TD-error,降低高估误差。 26 | 27 | ### Dueling-DQN 28 | 使用了优势函数 advantage function(A3C也用了):它只估计state的Q值,不考虑动作,好的策略能将state 导向一个更有优势的局面。然而不是任何时刻 action 都会影响 state的转移,因此Dueling DQN 结合了 优势函数估计的Q值 与 原本DQN对不同动作估计的Q值。DQN算法学习 state 与每个离散动作一一对应的Q值后才能知道学到 state 的Q值,而Dueling DQN 能通过优势函数直接学到state的价值,这使得Dueling DQN在一些action不影响环境的情况下能学比DQN更快 29 | 30 | ### D3QN 31 | Dueling DQN 与Double 
DQN相互兼容,一起用效果很好。简单,泛用,没有使用禁忌。任何一个刚入门的人都能独立地在前两种算法的基础上改出D3QN。在论文中使用了D3QN应该引用DuelingDQN 与 DoubleDQN的文章 32 | 33 | ### Noisy-DQN 34 | 探索能力稍强。Noisy DQN 把噪声添加到网络的输出层之前值。原本Q值较大的动作在添加噪声后Q值变大的概率也比较大。这种探索比epslion-greedy随机选一个动作去执行更好,至少这种针对性的探索既保证了探索动作多样,也提高了探索效率。 35 | 36 | 37 | ## 连续的动作空间 38 | ### DDPG 39 | DDPG(Deep DPG ),可用于入门连续动作空间的DRL算法。DPG 确定策略梯度算法,直接让策略网络输出action,成功在连续动作空间任务上训练出能用的策略,但是它使用 OU-noise 这种有很多超参数的方法去探索环境,训练慢,且不稳定。 40 | 41 | ### PPO 42 | (Proximal PO 近端策略搜索)训练稳定,调参简单,robust(稳健、耐操)。PPO对TRPO的信任域计算过程进行简化,论文中用的词是 surrogate objective。PPO动作的噪声方差是一个可训练的矢量(与动作矢量相同形状),而不由网络输出,这样做增强了PPO的稳健性 robustness。 43 | 44 | ### A3C 45 | (Asynchronous Advantage Actor-Critic)Asynchronous 指开启多个actor 在环境中探索,并异步更新。原本DDPG的Critic 是 Q(s, a),根据state-action pair 估计Q值,优势函数只使用 state 去估计Q值,这是很好的创新:降低了随机策略梯度算法估计Q值的难度。然而优势函数有明显缺陷:不是任何时刻 action 都会影响 state的转移(详见 Dueling DQN),因此这个算法只适合入门学习「优势函数 advantage function」。如果你看到新论文还在使用A3C,那么你要怀疑其作者RL的水平。此外,A3C算法有离散动作版本,也有连续动作版本。A2C 指的是没有Asynchronous 的版本。 46 | 47 | ### SAC 48 | (Soft Actor-Critic with maximum entropy 最大熵)训练很快,探索能力好,但是很依赖Reward Function,不像PPO那样随便整一个Reward function 也能训练。PPO算法会计算新旧策略的差异(计算两个分布之间的距离),并让这个差异保持在信任域内,且不至于太小。SAC算法不是on-policy算法,不容易计算新旧策略的差异,所以它在优化时最大化策略的熵(动作的方差越大,策略的熵越高) 49 | SAC也可以离散化到离散空间。对于SAC-discrete, 其更适合有很多不确定性的环境,对于一些确定状态的环境表现不如rainbow DQN 50 | 51 | ### TD3 52 | TD3(TDDD,Twin Delay DDPG),擅长调参的人才建议用,因为它影响训练的敏感超参数很多。它从Double DQN那里继承了Twin Critic,用来降低高估误差;它用来和随机策略梯度很像的方法:计算用于更新TD-error的Q值时,给action加上了噪声,用于让Critic拟合更平滑的Q值估计函数。TD3建议 延迟更新目标网络,即多更新几次网络后,再使用 soft update 将网络更新到target network上,我认为这没有多大用,后来的其他算法也不用这个技巧。TD3还建议在计算Q值时,为动作添加一个噪声,用于平滑Critic函数,在确定策略中,TD3这么用很像“随机策略” 53 | 54 | ### PPO+GAE 55 | (Generalized Advantage Estimation)训练最稳定,调参最简单,适合高维状态 High-dimensional state,但是环境不能有太多随机因数。GAE会根据经验轨迹 trajectory 生成优势函数估计值,然后让Critic去拟合这个值。在这样的调整下,在随机因素小的环境中,不需要太多 trajectory 即可描述当前的策略。尽管GAE可以用于多种RL算法,但是它与PPO这种On-policy 的相性最好。 56 | 57 | ### PPG 58 | PPG(Proximal Policy Gradient),A3C、PPO 都是同策略 On-policy,它要求:在环境中探索并产生训练数据的策略 与 被更新的策略网络 一定得是同一个策略。她们需要删掉已旧策略的数据,然后使用新策略在环境中重新收集。为了让PPO也能用 off-policy 的数据来训练,PPG诞生了,思路挺简单的,原本的On-policy PPO部分该干啥干啥,额外引入一个使用off-policy数据进行训练的Critic,让它与PPO的Critic共享参数,也就是Auxiliary Task,这种算法并不是在任何情况下都能比PPO好,因为PPG涉及到Auxiliary task,这要求她尽可能收集更多的训练数据,并在大batch size 下面才能表现得更好。 --------------------------------------------------------------------------------
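The PPO+GAE section above describes how GAE turns a recorded trajectory into advantage estimates for the Critic to fit. A minimal NumPy sketch of that recursion is shown below; it is not code from this repository, and the names `rewards`, `values`, `dones`, `gamma`, `lam` are illustrative assumptions.

```python
import numpy as np

def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation over a single trajectory.

    rewards, dones: length-T sequences; values: length-(T+1) sequence that
    includes the bootstrap value of the final state. Returns per-step
    advantages and the value targets (advantages + values).
    """
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        not_done = 1.0 - float(dones[t])
        # one-step TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
        # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values[:T], dtype=np.float32)
    return advantages, returns
```

In practice the resulting advantages are usually normalized per batch before being plugged into the PPO clipped surrogate loss.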