├── .idea
│   ├── RL.iml
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── Continuous_action
│   ├── DDPG.py
│   ├── PPO+GAE.py
│   ├── PPO.py
│   ├── PPO_continuous_BipedalWalker-v3.pth
│   ├── SAC.py
│   └── TD3.py
├── Discrete_action
│   ├── Actor_Critic.py
│   ├── CnnDQN.py
│   ├── D3QN.py
│   ├── D3QN2.py
│   ├── DDQN.py
│   ├── DQN.py
│   ├── Noise DQN.py
│   ├── __pycache__
│   │   ├── multiprocessing_env.cpython-37.pyc
│   │   └── replay_buffer.cpython-37.pyc
│   ├── multiprocessing_env.py
│   ├── replay_buffer.py
│   └── wrappers.py
└── README.md
/Continuous_action/DDPG.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Feb 28 2021
3 | @author: wangmeng
4 | """
5 | import random
6 | import gym
7 | import numpy as np
8 | import torch
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.nn.functional as F
12 | from torch.distributions import Normal
13 | import matplotlib.pyplot as plt
14 |
15 | use_cuda = torch.cuda.is_available()
16 | device = torch.device("cuda" if use_cuda else "cpu")
17 |
18 | class ValueNetwork(nn.Module):
19 | def __init__(self, num_inputs, num_actions, hidden_size ,init_w = 3e-3):
20 | super(ValueNetwork, self).__init__()
21 |
22 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
23 | self.linear2 = nn.Linear(hidden_size, hidden_size)
24 | self.linear3 = nn.Linear(hidden_size, 1)
25 |
26 | self.linear3.weight.data.uniform_(-init_w,init_w)
27 | self.linear3.bias.data.uniform_(-init_w,init_w)
28 |
29 | def forward(self, state, action):
30 | x = torch.cat([state, action], 1)
31 | x = F.relu(self.linear1(x))
32 | x = F.relu(self.linear2(x))
33 | x = self.linear3(x)
34 | return x
35 |
36 | class PolicyNetwork(nn.Module):
37 | def __init__(self, num_inputs, num_actions, hidden_size, init_w = 3e-3):
38 | super(PolicyNetwork, self).__init__()
39 |
40 | self.linear1 = nn.Linear(num_inputs, hidden_size)
41 | self.linear2 = nn.Linear(hidden_size, hidden_size)
42 | self.linear3 = nn.Linear(hidden_size, num_actions)
43 |
44 | # uniform_ fills the tensor with values sampled from a uniform distribution (parameter initialization).
45 | self.linear3.weight.data.uniform_(-init_w, init_w)
46 | # Initializing with normal_(0, 0.1) (sampling from a Gaussian) also works; both are effective schemes for the output layer.
47 | self.linear3.bias.data.uniform_(-init_w, init_w)
48 | # The idea is to keep the variance of each neuron's input and output roughly equal.
49 | # With ReLU (and no BatchNorm), He initialization is preferred: small random values drawn from a Gaussian or uniform distribution.
50 | # With BatchNorm, the network depends far less on the scale of the initial parameters, so a small std (e.g. 0.01) is enough.
51 |
52 | # Note, however, that BatchNorm is generally not recommended in deep RL.
53 |
54 | def forward(self, x):
55 | x = F.relu(self.linear1(x))
56 | x = F.relu(self.linear2(x))
57 | x = torch.tanh(self.linear3(x))
58 | return x
59 |
60 | def get_action(self, state):
61 | state = torch.FloatTensor(state).unsqueeze(0).to(device)
62 | action = self.forward(state)
63 | return action.detach().cpu().numpy()[0,0]
64 |
65 | class OUNoise(object):
66 | def __init__(self, action_space, mu=0.0, theta = 0.15, max_sigma = 0.3, min_sigma = 0.3, decay_period = 100000):  # set decay_period sensibly for the number of training iterations
67 | self.mu = mu
68 | self.theta = theta
69 | self.sigma = max_sigma
70 | self.max_sigma = max_sigma
71 | self.min_sigma = min_sigma
72 | self.decay_period = decay_period
73 | self.action_dim = action_space.shape[0]
74 | self.low = action_space.low
75 | self.high = action_space.high
76 | self.reset()
77 |
78 | def reset(self):
79 | self.state = np.ones(self.action_dim) *self.mu
80 |
81 | def evolve_state(self):
82 | x = self.state
83 | dx = self.theta* (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
84 | self.state = x + dx
85 | return self.state
86 |
87 | def get_action(self, action, t=0):
88 | ou_state = self.evolve_state()
89 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
90 | return np.clip(action + ou_state, self.low, self.high)
91 |
92 |
93 | class ReplayBuffer:
94 | def __init__(self, capacity):
95 | self.capacity = capacity
96 | self.buffer = []
97 | self.position = 0
98 |
99 | def push(self, state, action, reward, next_state, done):
100 | if len(self.buffer) < self.capacity:
101 | self.buffer.append(None)
102 | self.buffer[self.position] = (state, action, reward, next_state, done)
103 | self.position = (self.position + 1) % self.capacity
104 |
105 | def sample(self, batch_size):
106 | batch = random.sample(self.buffer, batch_size)
107 | state, action, reward, next_state, done = map(np.stack, zip(*batch))
108 | return state, action, reward, next_state, done
109 |
110 | def __len__(self):
111 | return len(self.buffer)
112 |
113 |
114 | class NormalizedActions(gym.ActionWrapper):
115 |
116 | def action(self, action):
117 | low_bound = self.action_space.low
118 | upper_bound = self.action_space.high
119 |
120 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
121 | # Map the tanh output in (-1, 1) back to the environment's real action range.
122 | action = np.clip(action, low_bound, upper_bound)
123 |
124 | return action
125 |
126 | def reverse_action(self, action):
127 | low_bound = self.action_space.low
128 | upper_bound = self.action_space.high
129 |
130 | # Because the policy's activation is tanh, normalize the environment's action range back to (-1, 1) here.
131 |
132 | action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
133 | action = np.clip(action, low_bound, upper_bound)
134 |
135 | return action
136 |
137 | class DDPG(object):
138 | def __init__(self, action_dim, state_dim, hidden_dim):
139 | super(DDPG,self).__init__()
140 | self.action_dim, self.state_dim, self.hidden_dim = action_dim, state_dim, hidden_dim
141 | self.batch_size = 128
142 | self.gamma = 0.99
143 | self.min_value = -np.inf
144 | self.max_value = np.inf
145 | self.soft_tau = 1e-2
146 | self.replay_buffer_size = 5000
147 | self.value_lr = 1e-3
148 | self.policy_lr = 1e-4
149 |
150 | self.value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
151 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
152 |
153 | self.target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
154 | self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
155 |
156 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
157 | target_param.data.copy_(param.data)
158 |
159 | for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
160 | target_param.data.copy_(param.data)
161 |
162 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
163 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
164 |
165 | self.value_criterion = nn.MSELoss()
166 |
167 | self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
168 |
169 | def ddpg_update(self):
170 | state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)
171 |
172 | state = torch.FloatTensor(state).to(device)
173 | next_state = torch.FloatTensor(next_state).to(device)
174 | action = torch.FloatTensor(action).to(device)
175 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
176 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)
177 |
178 | policy_loss = self.value_net(state, self.policy_net(state))
179 | policy_loss = -policy_loss.mean()
180 |
181 | next_action = self.target_policy_net(next_state)
182 | target_value = self.target_value_net(next_state, next_action.detach())
183 | expected_value = reward + (1.0 - done) * self.gamma * target_value
184 | expected_value = torch.clamp(expected_value, self.min_value, self.max_value)
185 |
186 | value = self.value_net(state, action)
187 | value_loss = self.value_criterion(value, expected_value.detach())
188 |
189 | self.policy_optimizer.zero_grad()
190 | policy_loss.backward()
191 | self.policy_optimizer.step()
192 |
193 | self.value_optimizer.zero_grad()
194 | value_loss.backward()
195 | self.value_optimizer.step()
196 |
197 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
198 | target_param.data.copy_(
199 | target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau
200 | )
201 |
202 | for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
203 | target_param.data.copy_(
204 | target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau
205 | )
206 |
207 | def plot(frame_idx, rewards):
208 | plt.figure(figsize=(20,5))
209 | plt.subplot(131)
210 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
211 | plt.plot(rewards)
212 | plt.show()
213 |
214 |
215 | def main():
216 | env = gym.make("Pendulum-v0")
217 | env = NormalizedActions(env)
218 |
219 | ou_noise = OUNoise(env.action_space)
220 |
221 | state_dim = env.observation_space.shape[0]
222 | action_dim = env.action_space.shape[0]
223 | hidden_dim = 256
224 |
225 | ddpg = DDPG(action_dim, state_dim, hidden_dim)
226 |
227 | max_frames = 12000
228 | max_steps = 500
229 | frame_idx = 0
230 | rewards = []
231 | batch_size = 128
232 |
233 | while frame_idx < max_frames:
234 | state = env.reset()
235 | ou_noise.reset()
236 | episode_reward = 0
237 |
238 | for step in range(max_steps):
239 | env.render()
240 | action = ddpg.policy_net.get_action(state)
241 | action = ou_noise.get_action(action, step)
242 | next_state, reward, done, _ = env.step(action)
243 |
244 | ddpg.replay_buffer.push(state, action, reward, next_state, done)
245 | if len(ddpg.replay_buffer) > batch_size:
246 | ddpg.ddpg_update()
247 |
248 | state = next_state
249 | episode_reward += reward
250 | frame_idx += 1
251 |
252 | if frame_idx % max(1000, max_steps + 1) == 0:
253 | plot(frame_idx, rewards)
254 |
255 | if done:
256 | break
257 |
258 | rewards.append(episode_reward)
259 | env.close()
260 |
261 | if __name__ == '__main__':
262 | main()
--------------------------------------------------------------------------------
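
Note on ddpg_update() above: both target networks are nudged toward the online networks by a soft (Polyak) update. A minimal standalone sketch of just that rule (the layer sizes are toy values and tau=1e-2 simply mirrors the soft_tau used above):

import torch.nn as nn

def soft_update(net: nn.Module, target: nn.Module, tau: float = 1e-2) -> None:
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    for p_t, p in zip(target.parameters(), net.parameters()):
        p_t.data.copy_(p_t.data * (1.0 - tau) + p.data * tau)

net, target = nn.Linear(3, 1), nn.Linear(3, 1)   # stand-ins for the value/policy nets
target.load_state_dict(net.state_dict())         # hard copy once at initialization
soft_update(net, target, tau=1e-2)               # blended copy after every update step

Setting tau=1.0 reproduces the hard copy done in the DDPG constructor.
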
/Continuous_action/PPO+GAE.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 |
4 | import gym
5 | import numpy as np
6 |
7 | import torch
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | import torch.nn.functional as F
11 | from torch.distributions import Normal
12 | from multiprocessing_env import SubprocVecEnv
13 | import matplotlib.pyplot as plt
14 |
15 | use_cuda = torch.cuda.is_available()
16 | device = torch.device("cuda" if use_cuda else "cpu")
17 |
18 | num_envs = 16
19 | env_name = "Pendulum-v0"
20 |
21 | def make_env():
22 | def _thunk():
23 | env = gym.make(env_name)
24 | return env
25 |
26 | return _thunk
27 |
28 | envs = [make_env() for i in range(num_envs)]
29 | envs = SubprocVecEnv(envs)
30 |
31 | env = gym.make(env_name)
32 |
33 |
34 | def init_weights(m):
35 | if isinstance(m, nn.Linear):
36 | nn.init.normal_(m.weight, mean=0., std=0.1)
37 | nn.init.constant_(m.bias, 0.1)
38 |
39 |
40 | class ActorCritic(nn.Module):
41 | def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
42 | super(ActorCritic, self).__init__()
43 |
44 | self.critic = nn.Sequential(
45 | nn.Linear(num_inputs, hidden_size),
46 | nn.ReLU(),
47 | nn.Linear(hidden_size, 1)
48 | )
49 |
50 | self.actor = nn.Sequential(
51 | nn.Linear(num_inputs, hidden_size),
52 | nn.ReLU(),
53 | nn.Linear(hidden_size, num_outputs),
54 | )
55 | self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
56 |
57 | self.apply(init_weights)
58 |
59 | def forward(self, x):
60 | value = self.critic(x)
61 | mu = self.actor(x)
62 | std = self.log_std.exp().expand_as(mu)
63 | dist = Normal(mu, std)
64 | return dist, value
65 |
66 | def plot(frame_idx, rewards):
67 | plt.figure(figsize=(20,5))
68 | plt.subplot(131)
69 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
70 | plt.plot(rewards)
71 | plt.show()
72 |
73 |
74 | def test_env(vis=False):
75 | state = env.reset()
76 | if vis: env.render()
77 | done = False
78 | total_reward = 0
79 | while not done:
80 | state = torch.FloatTensor(state).unsqueeze(0).to(device)
81 | dist, _ = model(state)
82 | next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
83 | state = next_state
84 | if vis: env.render()
85 | total_reward += reward
86 | return total_reward
87 |
88 | def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
89 | values = values + [next_value]
90 | gae = 0
91 | returns = []
92 | for step in reversed(range(len(rewards))):
93 | delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
94 | gae = delta + gamma * tau * masks[step] * gae
95 | returns.insert(0, gae + values[step])
96 | return returns
97 |
98 |
99 | def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
100 | batch_size = states.size(0)
101 | for _ in range(batch_size // mini_batch_size):
102 | rand_ids = np.random.randint(0, batch_size, mini_batch_size)
103 | yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[
104 | rand_ids, :]
105 |
106 |
107 | def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
108 | for _ in range(ppo_epochs):
109 | for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs,
110 | returns, advantages):
111 | dist, value = model(state)
112 | entropy = dist.entropy().mean()
113 | new_log_probs = dist.log_prob(action)
114 |
115 | ratio = (new_log_probs - old_log_probs).exp()
116 | surr1 = ratio * advantage
117 | surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
118 |
119 | actor_loss = - torch.min(surr1, surr2).mean()
120 | critic_loss = (return_ - value).pow(2).mean()
121 |
122 | loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
123 |
124 | optimizer.zero_grad()
125 | loss.backward()
126 | optimizer.step()
127 |
128 | num_inputs = envs.observation_space.shape[0]
129 | num_outputs = envs.action_space.shape[0]
130 |
131 | #Hyper params:
132 | hidden_size = 256
133 | lr = 3e-4
134 | num_steps = 20
135 | mini_batch_size = 5
136 | ppo_epochs = 4
137 | threshold_reward = -200
138 |
139 | model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
140 | optimizer = optim.Adam(model.parameters(), lr=lr)
141 |
142 | max_frames = 15000
143 | frame_idx = 0
144 | test_rewards = []
145 |
146 | state = envs.reset()
147 | early_stop = False
148 |
149 | while frame_idx < max_frames and not early_stop:
150 |
151 | log_probs = []
152 | values = []
153 | states = []
154 | actions = []
155 | rewards = []
156 | masks = []
157 | entropy = 0
158 |
159 | for _ in range(num_steps):
160 | state = torch.FloatTensor(state).to(device)
161 | dist, value = model(state)
162 |
163 | action = dist.sample()
164 | next_state, reward, done, _ = envs.step(action.cpu().numpy())
165 |
166 | log_prob = dist.log_prob(action)
167 | entropy += dist.entropy().mean()
168 |
169 | log_probs.append(log_prob)
170 | values.append(value)
171 | rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
172 | masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
173 |
174 | states.append(state)
175 | actions.append(action)
176 |
177 | state = next_state
178 | frame_idx += 1
179 |
180 | if frame_idx % 1000 == 0:
181 | test_reward = np.mean([test_env() for _ in range(10)])
182 | test_rewards.append(test_reward)
183 | plot(frame_idx, test_rewards)
184 | if test_reward > threshold_reward: early_stop = True
185 |
186 | next_state = torch.FloatTensor(next_state).to(device)
187 | _, next_value = model(next_state)
188 | returns = compute_gae(next_value, rewards, masks, values)
189 |
190 | returns = torch.cat(returns).detach()
191 | log_probs = torch.cat(log_probs).detach()
192 | values = torch.cat(values).detach()
193 | states = torch.cat(states)
194 | actions = torch.cat(actions)
195 | advantage = returns - values
196 |
197 | ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
--------------------------------------------------------------------------------
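
Note on compute_gae() above: it implements the GAE recursion delta_t = r_t + gamma*V(s_{t+1})*m_t - V(s_t), A_t = delta_t + gamma*lambda*m_t*A_{t+1}, and stores A_t + V(s_t) as the return target. A small self-contained re-run on made-up numbers (the toy rewards/values below are assumptions, not repository data):

# Toy re-run of the GAE recursion used in compute_gae().
def gae_returns(next_value, rewards, masks, values, gamma=0.99, lam=0.95):
    values = values + [next_value]
    adv, out = 0.0, []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        adv = delta + gamma * lam * masks[t] * adv
        out.insert(0, adv + values[t])   # return target = advantage + value baseline
    return out

print(gae_returns(0.0, [1.0, 1.0, 1.0], [1, 1, 0], [0.5, 0.5, 0.5]))
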
/Continuous_action/PPO.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Mar 1 2021
3 | @author: wangmeng
4 | """
5 | import torch
6 | import torch.nn as nn
7 | from torch.distributions import MultivariateNormal
8 | import gym
9 | import numpy as np
10 |
11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
12 |
13 |
14 | class Memory:
15 | def __init__(self):
16 | self.actions = []
17 | self.states = []
18 | self.logprobs = []
19 | self.rewards = []
20 | self.is_terminals = []
21 |
22 | def clear_memory(self):
23 | # The del statement operates on the name/slice, not on the data objects themselves; del buffer[:] empties each list in place.
24 | del self.actions[:]
25 | del self.states[:]
26 | del self.logprobs[:]
27 | del self.rewards[:]
28 | del self.is_terminals[:]
29 |
30 |
31 | class ActorCritic(nn.Module):
32 | def __init__(self, state_dim, action_dim, action_std):
33 | super(ActorCritic, self).__init__()
34 | # action mean range -1 to 1
35 | self.actor = nn.Sequential(
36 | nn.Linear(state_dim, 64),
37 | nn.Tanh(),
38 | nn.Linear(64, 32),
39 | nn.Tanh(),
40 | nn.Linear(32, action_dim),
41 | nn.Tanh()
42 | )
43 | # critic
44 | self.critic = nn.Sequential(
45 | nn.Linear(state_dim, 64),
46 | nn.Tanh(),
47 | nn.Linear(64, 32),
48 | nn.Tanh(),
49 | nn.Linear(32, 1)
50 | )
51 | # variance of the action distribution
52 | self.action_var = torch.full((action_dim,), action_std * action_std).to(device)
53 |
54 | def forward(self):
55 | # Raise explicitly: the shared forward() is not meant to be called directly.
56 | raise NotImplementedError
57 |
58 | def act(self, state, memory):
59 | action_mean = self.actor(state)
60 | cov_mat = torch.diag(self.action_var).to(device)
61 |
62 | dist = MultivariateNormal(action_mean, cov_mat)
63 | action = dist.sample()
64 | action_logprob = dist.log_prob(action)
65 |
66 | memory.states.append(state)
67 | memory.actions.append(action)
68 | memory.logprobs.append(action_logprob)
69 |
70 | return action.detach()
71 |
72 | def evaluate(self, state, action):
73 | action_mean = self.actor(state)
74 |
75 | action_var = self.action_var.expand_as(action_mean)
76 | # torch.diag_embed(input, offset=0, dim1=-2, dim2=-1) → Tensor
77 | # Creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) are filled by input
78 | cov_mat = torch.diag_embed(action_var).to(device)
79 | # Build a multivariate Gaussian from the mean vector and the diagonal covariance matrix.
80 | dist = MultivariateNormal(action_mean, cov_mat)
81 | # The aim is for this parameterized Gaussian to approximate the distribution that actually produced the chosen action.
82 | action_logprobs = dist.log_prob(action)
83 | # log_prob is the log-probability of the action under that Gaussian. Treating the stored action as "correct",
84 | # maximizing this log-probability shifts mu and sigma so that the Gaussian is centred closer to the action.
85 | dist_entropy = dist.entropy()
86 | state_value = self.critic(state)
87 |
88 | return action_logprobs, torch.squeeze(state_value), dist_entropy
89 |
90 |
91 | class PPO:
92 | def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip):
93 | self.lr = lr
94 | self.betas = betas
95 | self.gamma = gamma
96 | self.eps_clip = eps_clip
97 | self.K_epochs = K_epochs
98 |
99 | self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
100 | self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
101 |
102 | self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
103 | self.policy_old.load_state_dict(self.policy.state_dict())
104 |
105 | self.MseLoss = nn.MSELoss()
106 |
107 | def select_action(self, state, memory):
108 | state = torch.FloatTensor(state.reshape(1, -1)).to(device)
109 | return self.policy_old.act(state, memory).cpu().data.numpy().flatten()
110 |
111 | def update(self, memory):
112 | # Monte Carlo estimate of rewards:
113 | rewards = []
114 | discounted_reward = 0
115 | for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
116 | if is_terminal:
117 | discounted_reward = 0
118 | discounted_reward = reward + (self.gamma * discounted_reward)
119 | rewards.insert(0, discounted_reward)
120 |
121 | # Normalizing the rewards:
122 | rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
123 | rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
124 |
125 | # convert list to tensor
126 | # torch.stack preserves both the sequence structure and the tensor contents: it adds a new dimension and then concatenates ("expand, then join").
127 | old_states = torch.squeeze(torch.stack(memory.states).to(device), 1).detach()
128 | old_actions = torch.squeeze(torch.stack(memory.actions).to(device), 1).detach()
129 | old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(device).detach()
130 |
131 | # Optimize policy for K epochs:
132 | for _ in range(self.K_epochs):
133 | # Evaluating old actions and values :
134 | logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
135 |
136 | # Finding the ratio (pi_theta / pi_theta__old):
137 | ratios = torch.exp(logprobs - old_logprobs.detach())
138 |
139 | # Finding Surrogate Loss:
140 | advantages = rewards - state_values.detach()
141 | surr1 = ratios * advantages
142 | surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
143 | loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy
144 |
145 | # take gradient step
146 | self.optimizer.zero_grad()
147 | loss.mean().backward()
148 | self.optimizer.step()
149 |
150 | # Copy new weights into old policy:
151 | self.policy_old.load_state_dict(self.policy.state_dict())
152 |
153 |
154 | def main():
155 | ############## Hyperparameters ##############
156 | env_name = "BipedalWalker-v3"
157 | render = False
158 | solved_reward = 300 # stop training if avg_reward > solved_reward
159 | log_interval = 20 # print avg reward in the interval
160 | max_episodes = 10000 # max training episodes
161 | max_timesteps = 1500 # max timesteps in one episode
162 |
163 | update_timestep = 4000 # update policy every n timesteps
164 | action_std = 0.5 # constant std for action distribution (Multivariate Normal)
165 | K_epochs = 80 # update policy for K epochs
166 | eps_clip = 0.2 # clip parameter for PPO
167 | gamma = 0.99 # discount factor
168 |
169 | lr = 0.0003 # parameters for Adam optimizer
170 | betas = (0.9, 0.999)
171 |
172 | random_seed = None
173 | #############################################
174 |
175 | # creating environment
176 | env = gym.make(env_name)
177 | state_dim = env.observation_space.shape[0]
178 | action_dim = env.action_space.shape[0]
179 |
180 | if random_seed:
181 | print("Random Seed: {}".format(random_seed))
182 | torch.manual_seed(random_seed)
183 | env.seed(random_seed)
184 | np.random.seed(random_seed)
185 |
186 | memory = Memory()
187 | ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
188 | print(lr, betas)
189 |
190 | # logging variables
191 | running_reward = 0
192 | avg_length = 0
193 | time_step = 0
194 |
195 | # training loop
196 | for i_episode in range(1, max_episodes + 1):
197 | state = env.reset()
198 | for t in range(max_timesteps):
199 | time_step += 1
200 | # Running policy_old:
201 | action = ppo.select_action(state, memory)
202 | state, reward, done, _ = env.step(action)
203 |
204 | # Saving reward and is_terminals:
205 | memory.rewards.append(reward)
206 | memory.is_terminals.append(done)
207 |
208 | # update if its time
209 | if time_step % update_timestep == 0:
210 | ppo.update(memory)
211 | memory.clear_memory()
212 | time_step = 0
213 | running_reward += reward
214 | if render:
215 | env.render()
216 | if done:
217 | break
218 |
219 | avg_length += t
220 |
221 | # stop training if avg_reward > solved_reward
222 | if running_reward > (log_interval * solved_reward):
223 | print("########## Solved! ##########")
224 | torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(env_name))
225 | break
226 |
227 | # save every 500 episodes
228 | if i_episode % 500 == 0:
229 | torch.save(ppo.policy.state_dict(), './PPO_continuous_{}.pth'.format(env_name))
230 |
231 | # logging
232 | if i_episode % log_interval == 0:
233 | avg_length = int(avg_length / log_interval)
234 | running_reward = int((running_reward / log_interval))
235 |
236 | print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward))
237 | running_reward = 0
238 | avg_length = 0
239 |
240 |
241 | if __name__ == '__main__':
242 | main()
243 |
244 |
--------------------------------------------------------------------------------
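
Note on PPO.update() above: the policy term is the clipped surrogate objective. A tiny sketch that isolates just that term on made-up ratios and advantages (eps_clip mirrors the 0.2 used above; the tensor values are assumptions):

import torch

eps_clip = 0.2
ratios = torch.tensor([0.7, 1.0, 1.6])          # pi_new(a|s) / pi_old(a|s)
advantages = torch.tensor([1.0, -1.0, 2.0])
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
print((-torch.min(surr1, surr2).mean()).item())  # pessimistic, clipped policy loss
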
/Continuous_action/PPO_continuous_BipedalWalker-v3.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mengwanglalala/RL-algorithms/97f5b3e3b570ecb3c88ecf5f1ade148552103071/Continuous_action/PPO_continuous_BipedalWalker-v3.pth
--------------------------------------------------------------------------------
/Continuous_action/SAC.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Mar 4 2021
3 | @author: wangmeng
4 | """
5 | import math
6 | import random
7 | import gym
8 | import numpy as np
9 | import torch
10 | import torch.nn as nn
11 | import torch.optim as optim
12 | import torch.nn.functional as F
13 | from torch.distributions import Normal
14 | import matplotlib.pyplot as plt
15 |
16 | use_cuda = torch.cuda.is_available()
17 | device = torch.device("cuda" if use_cuda else "cpu")
18 |
19 |
20 | class ValueNetwork(nn.Module):
21 | def __init__(self, state_dim, hidden_dim, init_w=3e-3):
22 | super(ValueNetwork, self).__init__()
23 |
24 | self.linear1 = nn.Linear(state_dim, hidden_dim)
25 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
26 | self.linear3 = nn.Linear(hidden_dim, 1)
27 |
28 | self.linear3.weight.data.uniform_(-init_w, init_w)
29 | self.linear3.bias.data.uniform_(-init_w, init_w)
30 |
31 | def forward(self, state):
32 | x = F.relu(self.linear1(state))
33 | x = F.relu(self.linear2(x))
34 | x = self.linear3(x)
35 | return x
36 |
37 | # Same layer structure as ValueNetwork, but conditioned on (state, action).
38 | class SoftQNetwork(nn.Module):
39 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
40 | super(SoftQNetwork, self).__init__()
41 |
42 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
43 | self.linear2 = nn.Linear(hidden_size, hidden_size)
44 | self.linear3 = nn.Linear(hidden_size, 1)
45 |
46 | self.linear3.weight.data.uniform_(-init_w, init_w)
47 | self.linear3.bias.data.uniform_(-init_w, init_w)
48 |
49 | def forward(self, state, action):
50 | x = torch.cat([state, action], 1)
51 | x = F.relu(self.linear1(x))
52 | x = F.relu(self.linear2(x))
53 | x = self.linear3(x)
54 | return x
55 |
56 |
57 | class PolicyNetwork(nn.Module):
58 | def __init__(self, num_inputs, num_actions, hidden_size, init_w = 3e-3, log_std_min=-20, log_std_max=2):  # additionally predicts a (log) standard deviation
59 | super(PolicyNetwork, self).__init__()
60 | self.log_std_min = log_std_min
61 | self.log_std_max = log_std_max
62 |
63 | self.linear1 = nn.Linear(num_inputs, hidden_size)
64 | self.linear2 = nn.Linear(hidden_size, hidden_size)
65 |
66 | # head that outputs the mean
67 | self.mean_linear = nn.Linear(hidden_size, num_actions)
68 | self.mean_linear.weight.data.uniform_(-init_w, init_w)
69 | self.mean_linear.bias.data.uniform_(-init_w, init_w)
70 |
71 | # head that outputs the log standard deviation
72 | self.log_std_linear = nn.Linear(hidden_size, num_actions)
73 | self.log_std_linear.weight.data.uniform_(-init_w, init_w)
74 | self.log_std_linear.bias.data.uniform_(-init_w, init_w)
75 |
76 |
77 | def forward(self, x):
78 | x = F.relu(self.linear1(x))
79 | x = F.relu(self.linear2(x))
80 | mean = self.mean_linear(x)
81 | log_std = self.log_std_linear(x)
82 | # clamp squeezes every element of the tensor into the given interval.
83 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
84 | return mean, log_std
85 |
86 | def evaluate(self,state, epsilon = 1e-6):
87 | mean, log_std = self.forward(state)
88 | std = log_std.exp()
89 |
90 | # Build the Gaussian action distribution over the action space.
91 | normal = Normal(mean, std)
92 | z = normal.sample()
93 | action = torch.tanh(z)
94 |
95 | log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
96 | log_prob = log_prob.sum(-1, keepdim=True)
97 | return action, log_prob, z, mean, log_std
98 |
99 | def get_action(self, state):
100 | state = torch.FloatTensor(state).unsqueeze(0).to(device)
101 | mean, log_std = self.forward(state)
102 | std = log_std.exp()
103 | normal = Normal(mean, std)
104 | z = normal.sample()
105 | action = torch.tanh(z)
106 | action = action.detach().cpu().numpy()
107 | return action[0]
108 |
109 | class ReplayBuffer:
110 | def __init__(self, capacity):
111 | self.capacity = capacity
112 | self.buffer = []
113 | self.position = 0
114 |
115 | def push(self, state, action, reward, next_state, done):
116 | if len(self.buffer) < self.capacity:
117 | self.buffer.append(None)
118 | self.buffer[self.position] = (state, action, reward, next_state, done)
119 | self.position = (self.position + 1) % self.capacity
120 |
121 | def sample(self, batch_size):
122 | batch = random.sample(self.buffer, batch_size)
123 | state, action, reward, next_state, done = map(np.stack, zip(*batch))
124 | return state, action, reward, next_state, done
125 |
126 | def __len__(self):
127 | return len(self.buffer)
128 |
129 |
130 | class NormalizedActions(gym.ActionWrapper):
131 |
132 | def action(self, action):
133 | low_bound = self.action_space.low
134 | upper_bound = self.action_space.high
135 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
136 | action = np.clip(action, low_bound, upper_bound)
137 | return action
138 |
139 | def reverse_action(self, action):
140 | low_bound = self.action_space.low
141 | upper_bound = self.action_space.high
142 | action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
143 | action = np.clip(action, low_bound, upper_bound)
144 |
145 | return action
146 |
147 | class SAC(object):
148 | def __init__(self, action_dim, state_dim, hidden_dim):
149 | super(SAC, self).__init__()
150 |
151 | self.replay_buffer_size = 1000000
152 | self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
153 |
154 | # Slightly different from DDPG: there is no target policy network, but there is an extra soft Q network.
155 | self.value_net = ValueNetwork(state_dim, hidden_dim).to(device)
156 | self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(device)
157 |
158 | self.soft_q_net = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device)
159 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
160 |
161 | # Copy the value-network parameters into the target value network.
162 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
163 | target_param.data.copy_(param.data)
164 |
165 | self.value_criterion = nn.MSELoss()
166 | # plus an MSE loss for the soft Q network
167 | self.soft_q_criterion = nn.MSELoss()
168 |
169 | self.value_lr = 3e-4
170 | self.soft_q_lr = 3e-4
171 | self.policy_lr = 3e-4
172 |
173 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
174 | self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=self.soft_q_lr)
175 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
176 |
177 | def soft_q_update(self, batch_size,
178 | gamma=0.99,
179 | mean_lambda=1e-3,
180 | std_lambda=1e-3,
181 | z_lambda=0.0,
182 | soft_tau=1e-2,
183 | ):
184 | state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
185 |
186 | state = torch.FloatTensor(state).to(device)
187 | next_state = torch.FloatTensor(next_state).to(device)
188 | action = torch.FloatTensor(action).to(device)
189 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
190 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)
191 |
192 | expected_q_value = self.soft_q_net(state, action)
193 | expected_value = self.value_net(state)
194 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
195 |
196 | target_value = self.target_value_net(next_state)
197 | next_q_value = reward + (1 - done) * gamma * target_value
198 | q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
199 |
200 | expected_new_q_value = self.soft_q_net(state, new_action)
201 | next_value = expected_new_q_value - log_prob
202 | value_loss = self.value_criterion(expected_value, next_value.detach())
203 |
204 | log_prob_target = expected_new_q_value - expected_value
205 | policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
206 |
207 | mean_loss = mean_lambda * mean.pow(2).mean()
208 | std_loss = std_lambda * log_std.pow(2).mean()
209 | z_loss = z_lambda * z.pow(2).sum(1).mean()
210 |
211 | policy_loss += mean_loss + std_loss + z_loss
212 |
213 | self.soft_q_optimizer.zero_grad()
214 | q_value_loss.backward()
215 | self.soft_q_optimizer.step()
216 |
217 | self.value_optimizer.zero_grad()
218 | value_loss.backward()
219 | self.value_optimizer.step()
220 |
221 | self.policy_optimizer.zero_grad()
222 | policy_loss.backward()
223 | self.policy_optimizer.step()
224 |
225 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
226 | target_param.data.copy_(
227 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau
228 | )
229 |
230 | def plot(frame_idx, rewards):
231 | #plt.figure(figsize=(20,5))
232 | #plt.subplot(131)
233 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
234 | plt.plot(rewards)
235 | plt.show()
236 |
237 | def main():
238 | env = gym.make("Pendulum-v0")
239 | env = NormalizedActions(env)
240 |
241 | action_dim = env.action_space.shape[0]
242 | state_dim = env.observation_space.shape[0]
243 | hidden_dim = 256
244 |
245 | model = SAC(action_dim, state_dim, hidden_dim)
246 |
247 | max_frames = 40000
248 | max_steps = 500
249 | frame_idx = 0
250 | rewards = []
251 | batch_size = 128
252 |
253 | max_frames = 40000
254 |
255 | while frame_idx < max_frames:
256 | state = env.reset()
257 | episode_reward = 0
258 |
259 | for step in range(max_steps):
260 | action = model.policy_net.get_action(state)
261 | next_state, reward, done, _ = env.step(action)
262 |
263 | model.replay_buffer.push(state, action, reward, next_state, done)
264 | if len(model.replay_buffer) > batch_size:
265 | model.soft_q_update(batch_size)
266 |
267 | state = next_state
268 | episode_reward += reward
269 | frame_idx += 1
270 |
271 | if frame_idx % 1000 == 0:
272 | plot(frame_idx, rewards)
273 |
274 | if done:
275 | break
276 |
277 | rewards.append(episode_reward)
278 |
279 |
280 | if __name__ == '__main__':
281 | main()
--------------------------------------------------------------------------------
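
Note on PolicyNetwork.evaluate() above: the action is a tanh-squashed Gaussian sample, and its log-probability is corrected for the squashing: log pi(a|s) = log N(z; mu, sigma) - log(1 - tanh(z)^2 + eps), summed over action dimensions. A minimal sketch of that correction with a toy 2-D action (the mean/std values are assumptions; eps matches the 1e-6 used above):

import torch
from torch.distributions import Normal

mean, std, eps = torch.zeros(1, 2), torch.ones(1, 2), 1e-6
normal = Normal(mean, std)
z = normal.sample()
action = torch.tanh(z)                                       # squashed into (-1, 1)
log_prob = (normal.log_prob(z) - torch.log(1 - action.pow(2) + eps)).sum(-1, keepdim=True)
print(action, log_prob)
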
/Continuous_action/TD3.py:
--------------------------------------------------------------------------------
1 |
2 | import math
3 | import random
4 |
5 | import gym
6 | import numpy as np
7 |
8 | import torch
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.nn.functional as F
12 | from torch.distributions import Normal
13 | import matplotlib.pyplot as plt
14 |
15 |
16 | use_cuda = torch.cuda.is_available()
17 | device = torch.device("cuda" if use_cuda else "cpu")
18 |
19 |
20 | class ReplayBuffer:
21 | def __init__(self, capacity):
22 | self.capacity = capacity
23 | self.buffer = []
24 | self.position = 0
25 |
26 | def push(self, state, action, reward, next_state, done):
27 | if len(self.buffer) < self.capacity:
28 | self.buffer.append(None)
29 | self.buffer[self.position] = (state, action, reward, next_state, done)
30 | self.position = (self.position + 1) % self.capacity
31 |
32 | def sample(self, batch_size):
33 | batch = random.sample(self.buffer, batch_size)
34 | state, action, reward, next_state, done = map(np.stack, zip(*batch))
35 | return state, action, reward, next_state, done
36 |
37 | def __len__(self):
38 | return len(self.buffer)
39 |
40 |
41 | class NormalizedActions(gym.ActionWrapper):
42 | def action(self, action):
43 | low = self.action_space.low
44 | high = self.action_space.high
45 |
46 | action = low + (action + 1.0) * 0.5 * (high - low)
47 | action = np.clip(action, low, high)
48 |
49 | return action
50 |
51 | def reverse_action(self, action):
52 | low = self.action_space.low
53 | high = self.action_space.high
54 |
55 | action = 2 * (action - low) / (high - low) - 1
56 | action = np.clip(action, low, high)
57 |
58 | return action
59 |
60 |
61 | class GaussianExploration(object):
62 | def __init__(self, action_space, max_sigma=1.0, min_sigma=1.0, decay_period=1000000):
63 | self.low = action_space.low
64 | self.high = action_space.high
65 | self.max_sigma = max_sigma
66 | self.min_sigma = min_sigma
67 | self.decay_period = decay_period
68 |
69 | def get_action(self, action, t=0):
70 | sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
71 | action = action + np.random.normal(size=len(action)) * sigma
72 | return np.clip(action, self.low, self.high)
73 |
74 |
75 | def soft_update(net, target_net, soft_tau=1e-2):
76 | for target_param, param in zip(target_net.parameters(), net.parameters()):
77 | target_param.data.copy_(
78 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau
79 | )
80 |
81 |
82 | def plot(frame_idx, rewards):
83 | plt.figure(figsize=(20, 5))
84 | plt.subplot(131)
85 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
86 | plt.plot(rewards)
87 | plt.show()
88 |
89 |
90 | class ValueNetwork(nn.Module):
91 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
92 | super(ValueNetwork, self).__init__()
93 |
94 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
95 | self.linear2 = nn.Linear(hidden_size, hidden_size)
96 | self.linear3 = nn.Linear(hidden_size, 1)
97 |
98 | self.linear3.weight.data.uniform_(-init_w, init_w)
99 | self.linear3.bias.data.uniform_(-init_w, init_w)
100 |
101 | def forward(self, state, action):
102 | x = torch.cat([state, action], 1)
103 | x = F.relu(self.linear1(x))
104 | x = F.relu(self.linear2(x))
105 | x = self.linear3(x)
106 | return x
107 |
108 |
109 | class PolicyNetwork(nn.Module):
110 | def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
111 | super(PolicyNetwork, self).__init__()
112 |
113 | self.linear1 = nn.Linear(num_inputs, hidden_size)
114 | self.linear2 = nn.Linear(hidden_size, hidden_size)
115 | self.linear3 = nn.Linear(hidden_size, num_actions)
116 |
117 | self.linear3.weight.data.uniform_(-init_w, init_w)
118 | self.linear3.bias.data.uniform_(-init_w, init_w)
119 |
120 | def forward(self, state):
121 | x = F.relu(self.linear1(state))
122 | x = F.relu(self.linear2(x))
123 | x = torch.tanh(self.linear3(x))
124 | return x
125 |
126 | def get_action(self, state):
127 | state = torch.FloatTensor(state).unsqueeze(0).to(device)
128 | action = self.forward(state)
129 | return action.detach().cpu().numpy()[0]
130 |
131 | class TD(object):
132 | def __init__(self, action_dim, state_dim, hidden_dim):
133 | super(TD, self).__init__()
134 | self.action_dim, self.state_dim, self.hidden_dim = action_dim, state_dim, hidden_dim
135 | self.batch_size = 128
136 | self.gamma = 0.99
137 | self.soft_tau = 1e-2
138 | self.noise_std = 0.2
139 | self.noise_clip = 0.5
140 | self.policy_update = 2
141 | self.soft_tau = 1e-2
142 | self.replay_buffer_size = 1000000
143 | self.value_lr = 1e-3
144 | self.policy_lr = 1e-3
145 |
146 | self.value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
147 | self.value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
148 | self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
149 |
150 | self.target_value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
151 | self.target_value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
152 | self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
153 |
154 | soft_update(self.value_net1, self.target_value_net1, soft_tau=1.0)
155 | soft_update(self.value_net2, self.target_value_net2, soft_tau=1.0)
156 | soft_update(self.policy_net, self.target_policy_net, soft_tau=1.0)
157 |
158 | self.value_criterion = nn.MSELoss()
159 |
160 | self.value_optimizer1 = optim.Adam(self.value_net1.parameters(), lr=self.value_lr)
161 | self.value_optimizer2 = optim.Adam(self.value_net2.parameters(), lr=self.value_lr)
162 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
163 |
164 | self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
165 |
166 |
167 | def td3_update(self, step, batch_size):
168 |
169 | state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
170 |
171 | state = torch.FloatTensor(state).to(device)
172 | next_state = torch.FloatTensor(next_state).to(device)
173 | action = torch.FloatTensor(action).to(device)
174 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
175 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)
176 |
177 | next_action = self.target_policy_net(next_state)
178 | noise = torch.normal(torch.zeros(next_action.size()), self.noise_std).to(device)
179 | noise = torch.clamp(noise, -self.noise_clip, self.noise_clip)
180 | next_action += noise
181 |
182 | target_q_value1 = self.target_value_net1(next_state, next_action)
183 | target_q_value2 = self.target_value_net2(next_state, next_action)
184 | target_q_value = torch.min(target_q_value1, target_q_value2)
185 | expected_q_value = reward + (1.0 - done) * self.gamma * target_q_value
186 |
187 | q_value1 = self.value_net1(state, action)
188 | q_value2 = self.value_net2(state, action)
189 |
190 | value_loss1 = self.value_criterion(q_value1, expected_q_value.detach())
191 | value_loss2 = self.value_criterion(q_value2, expected_q_value.detach())
192 |
193 | self.value_optimizer1.zero_grad()
194 | value_loss1.backward()
195 | self.value_optimizer1.step()
196 |
197 | self.value_optimizer2.zero_grad()
198 | value_loss2.backward()
199 | self.value_optimizer2.step()
200 |
201 | if step % self.policy_update == 0:
202 | policy_loss = self.value_net1(state, self.policy_net(state))
203 | policy_loss = -policy_loss.mean()
204 |
205 | self.policy_optimizer.zero_grad()
206 | policy_loss.backward()
207 | self.policy_optimizer.step()
208 |
209 | soft_update(self.value_net1, self.target_value_net1, soft_tau=self.soft_tau)
210 | soft_update(self.value_net2, self.target_value_net2, soft_tau=self.soft_tau)
211 | soft_update(self.policy_net, self.target_policy_net, soft_tau=self.soft_tau)
212 |
213 | def main():
214 | env = NormalizedActions(gym.make('Pendulum-v0'))
215 | noise = GaussianExploration(env.action_space)
216 |
217 | state_dim = env.observation_space.shape[0]
218 | action_dim = env.action_space.shape[0]
219 | hidden_dim = 256
220 |
221 | TD3 = TD(action_dim, state_dim, hidden_dim)
222 |
223 |
224 | max_frames = 10000
225 | max_steps = 500
226 | frame_idx = 0
227 | rewards = []
228 | batch_size = 128
229 |
230 | while frame_idx < max_frames:
231 | state = env.reset()
232 | episode_reward = 0
233 |
234 | for step in range(max_steps):
235 | action = TD3.policy_net.get_action(state)
236 | action = noise.get_action(action, step)
237 | next_state, reward, done, _ = env.step(action)
238 |
239 | TD3.replay_buffer.push(state, action, reward, next_state, done)
240 | if len(TD3.replay_buffer) > batch_size:
241 | TD3.td3_update(step, batch_size)
242 |
243 | state = next_state
244 | episode_reward += reward
245 | frame_idx += 1
246 |
247 | if frame_idx % 1000 == 0:
248 | plot(frame_idx, rewards)
249 |
250 | if done:
251 | break
252 |
253 | rewards.append(episode_reward)
254 |
255 | if __name__ == '__main__':
256 | main()
257 |
258 |
259 |
260 |
261 |
--------------------------------------------------------------------------------
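
Note on td3_update() above: the target side uses clipped Gaussian noise on the target action (target policy smoothing) and the minimum of the two target critics (clipped double Q-learning). A toy sketch of just the target computation follows; the tensor values are made up, and the extra clamp of the noisy action back into the tanh range (-1, 1) follows the TD3 paper, whereas the update above adds noise without re-clamping:

import torch

noise_std, noise_clip, gamma = 0.2, 0.5, 0.99
next_action = torch.tensor([[0.3], [-0.8]])
noise = torch.clamp(torch.randn_like(next_action) * noise_std, -noise_clip, noise_clip)
next_action = torch.clamp(next_action + noise, -1.0, 1.0)    # smoothed target action
q1, q2 = torch.tensor([[1.2], [0.4]]), torch.tensor([[0.9], [0.7]])
reward, done = torch.tensor([[1.0], [0.0]]), torch.tensor([[0.0], [1.0]])
target_q = reward + (1.0 - done) * gamma * torch.min(q1, q2)  # pessimistic target
print(next_action, target_q)
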
/Discrete_action/Actor_Critic.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Mar 3 2021
3 | @author: wangmeng
4 | """
5 | import math
6 | import random
7 |
8 | import gym
9 | import numpy as np
10 |
11 | import torch
12 | import torch.nn as nn
13 | import torch.optim as optim
14 | import torch.nn.functional as F
15 | from torch.distributions import Categorical
16 | import matplotlib.pyplot as plt
17 |
18 | from multiprocessing_env import SubprocVecEnv
19 |
20 | use_cuda = torch.cuda.is_available()
21 | device = torch.device("cuda" if use_cuda else "cpu")
22 |
23 | class ActorCritic(nn.Module):
24 | def __init__(self, inputs, outputs, hidden_size, std=0.0):
25 | super(ActorCritic, self).__init__()
26 | self.Actor = nn.Sequential(
27 | nn.Linear(inputs, hidden_size),
28 | nn.ReLU(),
29 | nn.Linear(hidden_size, outputs),
30 | # Softmax over dim so that the probabilities along that dimension sum to 1.
31 | # Note: relying on the implicit default dim is deprecated, so declare dim explicitly.
32 | nn.Softmax(dim=1)
33 |
34 | )
35 | self.Critic = nn.Sequential(
36 | nn.Linear(inputs, hidden_size),
37 | nn.ReLU(),
38 | nn.Linear(hidden_size,1)
39 | )
40 | def forward(self,x):
41 | value = self.Critic(x)
42 | probs = self.Actor(x)
43 | # Wrap the actor's action probabilities in a Categorical distribution.
44 | dist = Categorical(probs)
45 | return dist, value
46 |
47 | def make_env():
48 | def _thunk():
49 | env = gym.make("CartPole-v0")
50 | return env
51 | return _thunk
52 |
53 | # N-step sampling speeds up convergence; this computes the bootstrapped N-step returns used later for the advantage.
54 | def compute_returns(next_value, rewards, masks, gamma=0.99):
55 | R = next_value
56 | returns = []
57 | for step in reversed(range(len(rewards))):
58 | R = rewards[step] + gamma * R * masks[step]
59 | # list.insert(index, obj): index is the position at which obj is inserted.
60 | returns.insert(0, R)
61 | return returns
62 |
63 | def test_env(model, env,vis=False):
64 | state = env.reset()
65 | if vis: env.render()
66 | done = False
67 | total_reward = 0
68 | while not done:
69 | state = torch.FloatTensor(state).unsqueeze(0).to(device)
70 | dist, _ = model(state)
71 | next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
72 | state = next_state
73 | if vis: env.render()
74 | total_reward += reward
75 | return total_reward
76 |
77 | def plot(frame_idx, rewards):
78 | plt.figure(figsize=(20,5))
79 | plt.subplot(131)
80 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
81 | plt.plot(rewards)
82 | plt.show()
83 |
84 | def main():
85 | num_envs = 16
86 | envs = [make_env() for i in range(num_envs)]
87 | envs = SubprocVecEnv(envs)
88 | env = gym.make("CartPole-v0")
89 |
90 | num_inputs = envs.observation_space.shape[0]
91 | num_outputs = envs.action_space.n
92 | # Hyper params:
93 | hidden_size = 256
94 | lr = 3e-4
95 | num_steps = 5
96 |
97 | model = ActorCritic(num_inputs,num_outputs,hidden_size).to(device)
98 |
99 | optimizer = optim.Adam(model.parameters())
100 |
101 | max_frames = 20000
102 | frame_idx = 0
103 | test_rewards = []
104 | state = envs.reset()
105 |
106 | while frame_idx < max_frames:
107 |
108 | log_probs = []
109 | values = []
110 | rewards = []
111 | masks = []
112 | entropy = 0
113 |
114 | # Each parallel environment runs num_steps steps, giving the n-step rollout.
115 | for _ in range(num_steps):
116 | state = torch.FloatTensor(state).to(device)
117 | dist, value = model(state)
118 | action = dist.sample()
119 | next_state, reward, done, _ = envs.step(action.cpu().numpy())
120 | log_prob = dist.log_prob(action)
121 | entropy += dist.entropy().mean()
122 |
123 | # Record the per-environment quantities for these num_steps steps.
124 | log_probs.append(log_prob)
125 | values.append(value)
126 | rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
127 | masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
128 |
129 | state = next_state
130 | frame_idx += 1
131 |
132 | if frame_idx % 100 == 0:
133 | test_rewards.append(np.mean([test_env(model, env) for _ in range(10)]))
134 | plot(frame_idx, test_rewards)
135 |
136 | # Fold the workers' rollouts together and update the shared network parameters.
137 | next_state = torch.FloatTensor(next_state).to(device)
138 | _, next_value = model(next_state)
139 | returns = compute_returns(next_value, rewards, masks)
140 |
141 | # Chain the values of the num_steps (5) rollout steps together.
142 | log_probs = torch.cat(log_probs)
143 | returns = torch.cat(returns).detach()
144 | values = torch.cat(values)
145 |
146 | advantage = returns - values
147 | # Compute the mean losses.
148 | actor_loss = -(log_probs * advantage.detach()).mean()
149 | critic_loss = advantage.pow(2).mean()
150 |
151 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
152 |
153 | optimizer.zero_grad()
154 | loss.backward()
155 | optimizer.step()
156 |
157 | if __name__ == '__main__':
158 | main()
--------------------------------------------------------------------------------
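
Note on compute_returns() above: it is the n-step bootstrapped return R_t = r_t + gamma*m_t*R_{t+1}, seeded with the critic's value of the state after the last step. A toy re-run on made-up numbers (the rewards, masks and bootstrap value below are assumptions):

def n_step_returns(next_value, rewards, masks, gamma=0.99):
    R, out = next_value, []
    for t in reversed(range(len(rewards))):
        R = rewards[t] + gamma * masks[t] * R
        out.insert(0, R)
    return out

print(n_step_returns(0.5, [1.0, 1.0, 1.0], [1, 1, 0]))  # the final mask of 0 cuts off the bootstrap
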
/Discrete_action/CnnDQN.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Feb 27 2021
3 | @author: wangmeng
4 | """
5 | import math, random
6 | import gym
7 | import numpy as np
8 | import torch
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.autograd as autograd
12 | import torch.nn.functional as F
13 | from collections import Counter
14 | from collections import deque
15 | import matplotlib.pyplot as plt
16 | from wrappers import make_atari, wrap_deepmind, wrap_pytorch
17 |
18 | USE_CUDA = torch.cuda.is_available()
19 | # Helper that puts Variables on the GPU when CUDA is available.
20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
21 |
22 | class CnnDQN(nn.Module):
23 | def __init__(self, observation_space, action_sapce):
24 | super(CnnDQN, self).__init__()
25 |
26 | self.observation_space = observation_space
27 | self.action_sapce = action_sapce
28 |
29 | self.features = nn.Sequential(
30 | nn.Conv2d(self.observation_space[0], 32, kernel_size=8, stride=4),
31 | nn.ReLU(),
32 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
33 | nn.ReLU(),
34 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
35 | nn.ReLU()
36 | )
37 |
38 | self.fc = nn.Sequential(
39 | nn.Linear(7 * 7 * 64, 512),
40 | nn.ReLU(),
41 | nn.Linear(512,self.action_sapce)
42 | )
43 |
44 | def forward(self,x):
45 | x = self.features(x)
46 | x = x.view(x.size(0), -1)  # Flatten the multi-dimensional conv features into one vector per sample.
47 | # x.size(0) is the batch size, so x = x.view(x.size(0), -1) is shorthand for x = x.view(batchsize, -1); view() works much like reshape and changes the tensor's shape.
48 | # In x = x.view(batchsize, -1), batchsize is the number of rows after the reshape, and -1 lets PyTorch infer the number of columns from the original tensor and the batch size.
49 | x = self.fc(x)
50 | return x
51 |
52 | # def feature_size(self):
53 | # # A blunt but effective trick: push an all-zero tensor of the expected input shape through features, flatten the output, and read off its length (7 * 7 * 64 here).
54 | # return self.features(autograd.Variable(torch.zeros(1, *self.observation_space))).view(1, -1).size(1)
55 |
56 | def act(self, state, epsilon):
57 | if random.random() > epsilon:
58 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)#(1,1,84,84)
59 | q_value = self.forward(state)
60 | action = q_value.max(1)[1].data[0]
61 | action = action.cpu().numpy()  # The network returns a tensor; since it is fed back to the gym env, move it to the CPU and convert it to a plain value.
62 | action = int(action)
63 |
64 | else:
65 | action = random.randrange(self.action_sapce)
66 | return action
67 |
68 | class ReplayBuffer(object):
69 | def __init__(self, capacity):
70 | # deque, from the standard-library collections module, is a double-ended queue:
71 | # elements can be appended or popped at both ends, and once maxlen is set, old entries are pushed out as new ones arrive.
72 | self.buffer = deque (maxlen = capacity)
73 |
74 | def push (self, state ,aciton, reward, next_state, done):
75 | state = np.expand_dims(state,0)
76 | # The extra dimension makes the later concatenate-based batching straightforward.
77 | next_state = np.expand_dims(next_state,0)
78 | self.buffer.append((state, aciton, reward, next_state, done))
79 |
80 | def sample(self, batch_size):
81 | # zip(*...) regroups the sampled transitions element-wise into tuples (all states together, all actions together, ...).
82 | state , action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
83 | # Finally concatenate the arrays, which removes the extra dimension added in push().
84 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
85 |
86 |
87 | def compute_td_loss(model,optimizer, replay_buffer, gamma, batch_size):
88 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
89 | # Move everything onto the GPU (when available).
90 | state = Variable(torch.FloatTensor(np.float32(state)))
91 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
92 | action = Variable(torch.LongTensor(action))
93 | reward = Variable(torch.FloatTensor(reward))
94 | done = Variable(torch.FloatTensor(done))
95 |
96 | q_values = model(state)
97 | next_q_values = model(next_state)
98 |
99 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
100 | # gather can be read as a lookup into q_values: every output element comes from q_values, at the indices stored in action; the output has the same shape as action.unsqueeze(1).
101 | # With dim=1 the indices address dimension 1; with dim=0 they address dimension 0.
102 | # The unsqueeze is only there to make gather work; the extra dimension is squeezed away again afterwards.
103 | next_q_value = next_q_values.max(1)[0]
104 |
105 | expected_q_value = reward + gamma * next_q_value * (1 - done)
106 |
107 | loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()
108 |
109 | optimizer.zero_grad()
110 | loss.backward()
111 | optimizer.step()
112 |
113 | return loss
114 |
115 |
116 | def main():
117 | env_id = "PongNoFrameskip-v4"
118 | env = make_atari(env_id)
119 | env = wrap_deepmind(env)
120 | env = wrap_pytorch(env)
121 |
122 | observation_space = env.observation_space.shape
123 | action_sapce = env.action_space.n
124 |
125 | model = CnnDQN(observation_space, action_sapce)
126 |
127 | if USE_CUDA:
128 | model = model.cuda()
129 |
130 | optimizer = optim.Adam(model.parameters())
131 |
132 | replay_buffer = ReplayBuffer(1000)
133 |
134 | batch_size = 32
135 | gamma = 0.99
136 | replay_initial = 100
137 | num_frames = 14000
138 |
139 | losses = []
140 | all_rewards = []
141 | x_axis1 = []
142 | x_axis2= []
143 | episode_reward = 0
144 |
145 | epsilon_start = 1.0
146 | epsilon_final = 0.01
147 | epsilon_decay = 30000
148 |
149 | # The exploration rate should decay as the number of frames grows.
150 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
151 |
152 | state = env.reset()
153 |
154 | for frame_idx in range(1, num_frames + 1):
155 | # Render the environment.
156 | env.render()
157 | epsilon = epsilon_by_frame(frame_idx)
158 | action = model.act(state, epsilon)
159 | next_state, reward, done, _ = env.step(action)
160 | replay_buffer.push(state, action, reward, next_state, done)
161 | state = next_state
162 | episode_reward += reward
163 |
164 | if done:
165 | state = env.reset()
166 | x_axis1.append(frame_idx)
167 | all_rewards.append(episode_reward)
168 | episode_reward = 0
169 |
170 | if frame_idx+1 > replay_initial:
171 | loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
172 | x_axis2.append(frame_idx)
173 | losses.append(np.array(loss.data.cpu()))
174 |
175 |
176 |
177 | if frame_idx % 100 == 0:
178 | plt.figure(1)
179 | plt.subplot(121)
180 | plt.plot(x_axis1, all_rewards)
181 | plt.subplot(122)
182 | plt.plot(x_axis2, losses)
183 | plt.show()
184 |
185 | env.close()
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 | if __name__ == '__main__':
195 | main()
--------------------------------------------------------------------------------
/Discrete_action/D3QN.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Mar 1 2021
3 | @author: wangmeng
4 | """
5 |
6 | import math, random
7 | import gym
8 | import numpy as np
9 | import torch
10 | import torch.nn as nn
11 | import torch.optim as optim
12 | import torch.autograd as autograd
13 | import torch.nn.functional as F
14 | from collections import Counter
15 | from collections import deque
16 | import matplotlib.pyplot as plt
17 |
18 | USE_CUDA = torch.cuda.is_available()
19 | # helper that moves Variables to CUDA when available
20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
21 |
22 | class DuelingDQN(nn.Module):
23 | def __init__(self, observation_space, action_sapce):
24 | super(DuelingDQN, self).__init__()
25 |
26 | self.observation_space = observation_space
27 | self.action_sapce = action_sapce
28 |
29 | ####################### changes vs. plain DQN #############################
30 | self.feature = nn.Sequential(
31 | nn.Linear(observation_space,128),
32 | nn.ReLU()
33 | )
34 | self.advantage = nn.Sequential(
35 | nn.Linear(128, 128),
36 | nn.ReLU(),
37 | nn.Linear(128, action_sapce),
38 | )
39 | self.value = nn.Sequential(
40 | nn.Linear(128,128),
41 | nn.ReLU(),
42 | nn.Linear(128,1),
43 |
44 | )
45 |
46 |
47 | def forward(self, x):
48 | x = self.feature(x)
49 | advantage = self.advantage(x)
50 | value = self.value(x)
51 | # Without subtracting the advantage mean, training becomes unstable: V and A are not separately identifiable, so the value stream can be ignored at times and suddenly become very large at others.
52 | return value + advantage - advantage.mean()
53 |
54 | def act(self, state, epsilon):
55 | if random.random() > epsilon:
56 | # if running on a GPU, the input state has to be moved there as well
57 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)  # volatile=True disabled autograd for inference in old PyTorch; newer code would use torch.no_grad()
58 | # unsqueeze(0) adds a batch dimension of size 1
59 | q_value = self.forward(state)
60 | action = q_value.max(1)[1].data[0]
61 | # max(1) returns (values, indices) of the row-wise maximum; max(0) would do the same column-wise
62 | # max(1)[1] gives the argmax indices, max(1)[0] the maximum values themselves
63 |
64 | action = action.cpu().numpy()  # the network returns a tensor; move it back to the CPU and convert it, since gym expects a plain int
65 | action = int(action)
66 | else:
67 | action = random.randrange(self.action_sapce)  # random action index in [0, action_space)
68 | return action
69 |
70 | class ReplayBuffer(object):
71 | def __init__(self, capacity):
72 | # deque comes from the standard library collections module: a double-ended queue that can be
73 | # appended to or popped from at either end. With maxlen set, the oldest entries are dropped as new ones arrive.
74 | self.buffer = deque(maxlen = capacity)
75 |
76 | def push(self, state, action, reward, next_state, done):
77 | state = np.expand_dims(state,0)
78 | # the extra dimension makes it easy to stitch samples together later with np.concatenate
79 | next_state = np.expand_dims(next_state,0)
80 | self.buffer.append((state, action, reward, next_state, done))
81 |
82 | def sample(self, batch_size):
83 | # zip(*...) unpacks the sampled transitions and regroups matching fields into tuples
84 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
85 | # concatenate stitches the per-sample arrays into one batch array (dropping the added dimension)
86 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
87 |
88 |
89 | def compute_td_loss(current_model, target_model,optimizer, replay_buffer, gamma, batch_size):
90 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
91 | # move everything to the GPU when available
92 | state = Variable(torch.FloatTensor(np.float32(state)))
93 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
94 | action = Variable(torch.LongTensor(action))
95 | reward = Variable(torch.FloatTensor(reward))
96 | done = Variable(torch.FloatTensor(done))
97 |
98 | q_values = current_model(state)
99 | next_q_values = target_model(next_state)
100 |
101 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
102 | # gather looks up, for each row of q_values, the Q-value of the action that was actually taken;
103 | # the output has the same shape as action.unsqueeze(1).
104 | # The dimension is added only so that gather works, then squeezed away again.
105 | next_q_value = next_q_values.gather(1, current_model(next_state).max(1)[1].unsqueeze(1)).squeeze(1)  # Double DQN: the online network selects the next action, the target network evaluates it
106 |
107 | expected_q_value = reward + gamma * next_q_value * (1 - done)
108 |
109 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()
110 | loss = (q_value - expected_q_value.detach()).pow(2).mean()
111 |
112 | optimizer.zero_grad()
113 | loss.backward()
114 | optimizer.step()
115 |
116 | return loss
117 |
118 | def update_target(current_model, target_model):
119 | target_model.load_state_dict(current_model.state_dict())
120 |
121 |
122 | def main():
123 | env_id = "CartPole-v0"
124 | env = gym.make(env_id)
125 |
126 | observation_space = env.observation_space.shape[0]
127 | action_sapce = env.action_space.n
128 |
129 | current_model = DuelingDQN(observation_space, action_sapce)
130 | target_model = DuelingDQN(observation_space, action_sapce)
131 |
132 | if USE_CUDA:
133 | current_model = current_model.cuda()
134 | target_model = target_model.cuda()
135 |
136 | optimizer = optim.Adam(current_model.parameters())
137 |
138 | replay_buffer = ReplayBuffer(1000)
139 |
140 | update_target(current_model, target_model)
141 |
142 | batch_size = 32
143 | gamma = 0.99
144 |
145 | num_frames = 10000
146 |
147 | losses = []
148 | all_rewards = []
149 | x_axis1 = []
150 | x_axis2 = []
151 | episode_reward = 0
152 |
153 | epsilon_start = 1.0
154 | epsilon_final = 0.01
155 | epsilon_decay = 500
156 |
157 | # the exploration rate decays as the number of frames grows
158 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. * frame_idx / epsilon_decay)
159 |
160 | state = env.reset()
161 | for frame_idx in range(1, num_frames + 1):
162 | # render the environment (disabled)
163 | #env.render()
164 | epsilon = epsilon_by_frame(frame_idx)
165 | action = current_model.act(state, epsilon)
166 | next_state, reward, done, _ = env.step(action)
167 | replay_buffer.push(state, action, reward, next_state, done)
168 | state = next_state
169 | episode_reward += reward
170 |
171 | if done:
172 | state = env.reset()
173 | x_axis1.append(frame_idx)
174 | all_rewards.append(episode_reward)
175 | episode_reward = 0
176 |
177 | if frame_idx+1 > batch_size:
178 | x_axis2.append(frame_idx)
179 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size)
180 | losses.append(np.array(loss.data.cpu()))
181 |
182 | if frame_idx % 100 == 0:
183 | update_target(current_model, target_model)
184 |
185 |
186 | if frame_idx % 200 == 0:
187 | plt.figure(1)
188 | plt.subplot(121)
189 | plt.plot(x_axis1, all_rewards)
190 | plt.subplot(122)
191 | plt.plot(x_axis2, losses)
192 | plt.show()
193 |
194 |
195 | if __name__ == '__main__':
196 | main()
--------------------------------------------------------------------------------
/Discrete_action/D3QN2.py:
--------------------------------------------------------------------------------
1 | #https://github.com/awill139/d3qn_pytorch
2 | import torch
3 | import numpy as np
4 | from torch import optim
5 | import torch.nn as nn
6 |
7 | import gym
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
11 |
12 |
13 | class ReplayBuffer():
14 | def __init__(self, max_size, input_shape):
15 | self.mem_size = max_size
16 | self.mem_cntr = 0
17 |
18 | self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
19 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
20 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
21 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
22 | self.terminal_memory = np.zeros(self.mem_size, dtype=bool)  # np.bool is removed in recent NumPy versions
23 |
24 | def store_transition(self, state, action, reward, state_, done):
25 | idx = self.mem_cntr % self.mem_size
26 |
27 | self.state_memory[idx] = state
28 | self.new_state_memory[idx] = state_
29 | self.action_memory[idx] = action
30 | self.reward_memory[idx] = reward
31 | self.terminal_memory[idx] = done
32 |
33 | self.mem_cntr += 1
34 |
35 | def sample(self, batch_size):
36 | max_mem = min(self.mem_cntr, self.mem_size)
37 | batch = np.random.choice(max_mem, batch_size, replace=False)
38 |
39 | states = self.state_memory[batch]
40 | states_ = self.new_state_memory[batch]
41 | actions = self.action_memory[batch]
42 | rewards = self.reward_memory[batch]
43 | dones = self.terminal_memory[batch]
44 |
45 | return states, actions, rewards, states_, dones
46 |
47 |
48 | class DuelingDeepQNet(nn.Module):
49 | def __init__(self, n_actions, input_dim, fc1_dims, fc2_dims, lr=0.0003):
50 | super(DuelingDeepQNet, self).__init__()
51 |
52 | self.fc1 = nn.Linear(*input_dim, fc1_dims)
53 | self.fc2 = nn.Linear(fc1_dims, fc2_dims)
54 | self.V = nn.Linear(fc2_dims, 1)
55 | self.A = nn.Linear(fc2_dims, n_actions)
56 |
57 | self.relu1 = nn.ReLU()
58 | self.relu2 = nn.ReLU()
59 |
60 | self.optim = optim.Adam(self.parameters(), lr=lr)
61 | self.crit = nn.MSELoss()
62 |
63 | def forward(self, state):
64 | x = self.relu1(self.fc1(state))
65 | x = self.relu2(self.fc2(x))
66 |
67 | V = self.V(x)
68 | A = self.A(x)
69 |
70 | Q = V + (A - torch.mean(A, dim=1, keepdim=True))
71 |
72 | return Q
73 |
74 | def advantage(self, state):
75 | x = self.relu1(self.fc1(state))
76 | x = self.relu2(self.fc2(x))
77 |
78 | return self.A(x)
79 |
80 |
81 | class Agent:
82 | def __init__(self, gamma, n_actions, epsilon, batch_size,
83 | input_dims, epsilon_decay=1e-8, eps_min=0.01,
84 | mem_size=1000000, fc1_dims=128, fc2_dims=128, replace=100):
85 | self.action_space = [i for i in range(n_actions)]
86 | self.gamma = gamma
87 | self.epsilon = epsilon
88 | self.epsilon_decay = epsilon_decay
89 | self.eps_min = eps_min
90 | self.replace = replace
91 | self.batch_size = batch_size
92 |
93 | self.learn_step_counter = 0
94 | self.memory = ReplayBuffer(max_size=mem_size, input_shape=input_dims)
95 | self.q_eval = DuelingDeepQNet(n_actions=n_actions, input_dim=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims)
96 | self.q_next = DuelingDeepQNet(n_actions=n_actions, input_dim=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims)
97 |
98 | self.q_eval.to(device)
99 | self.q_next.to(device)
100 |
101 | def store_transition(self, state, action, reward, new_state, done):
102 | self.memory.store_transition(state, action, reward, new_state, done)
103 |
104 | def choose_action(self, observation):
105 | if np.random.random() < self.epsilon:
106 | state = torch.Tensor([observation]).to(device)
107 | advantage = self.q_eval.advantage(state)
108 | action = torch.argmax(advantage).item()
109 | else:
110 | action = np.random.choice(self.action_space)
111 |
112 | return action
113 |
114 | def learn(self):
115 | if self.memory.mem_cntr < self.batch_size:
116 | return
117 | if self.learn_step_counter % self.replace == 0:
118 | self.q_next.load_state_dict(self.q_eval.state_dict())
119 |
120 | states, actions, rewards, states_, dones = self.memory.sample(self.batch_size)
121 |
122 | states = torch.tensor(states).to(device)
123 | rewards = torch.tensor(rewards).to(device)
124 | dones = torch.tensor(dones).to(device)
125 | actions = torch.tensor(actions).to(device)
126 | states_ = torch.tensor(states_).to(device)
127 |
128 | indices = np.arange(self.batch_size)
129 |
130 | q_pred = self.q_eval(states)[indices, actions]
131 | q_next = self.q_next(states_)
132 |
133 | max_actions = torch.argmax(self.q_eval(states_), dim=1)
134 | # q_eval = self.q_eval(torch.Tensor(states_).to(device))[indices, actions]
135 | q_next[dones] = 0.0  # terminal states must not bootstrap, so zero the target-net values before building the target
136 | q_target = rewards + self.gamma * q_next[indices, max_actions]
137 |
138 | self.q_eval.optim.zero_grad()
139 |
140 | loss = self.q_eval.crit(q_target, q_pred)
141 | loss.backward()
142 |
143 | self.q_eval.optim.step()
144 |
145 | self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.eps_min else self.eps_min
146 | self.learn_step_counter += 1
147 |
148 |
149 | def main():
150 | env = gym.make('LunarLander-v2')
151 | # env_id = "CartPole-v0"
152 | # env = gym.make(env_id)
153 | n_actions = env.action_space.n
154 | obs_shape = list(env.observation_space.shape)
155 | agent = Agent(gamma=0.99, n_actions=n_actions, epsilon=1.0, batch_size=64, input_dims=obs_shape)
156 |
157 | n_games = 1000
158 | scores = []
159 | eps_history = []
160 |
161 |
162 |
163 | for i in range(n_games):
164 | done = False
165 | score = 0
166 | obs = env.reset()
167 |
168 |
169 | while not done:
170 | env.render()
171 | action = agent.choose_action(obs)
172 | obs_, reward, done, info = env.step(action)
173 | agent.store_transition(obs, action, reward, obs_, int(done))
174 | score += reward
175 | obs = obs_
176 | agent.learn()
177 |
178 | eps_history.append(agent.epsilon)
179 |
180 | scores.append(score)
181 |
182 |
183 | avg_score = np.mean(scores[-100:])
184 |
185 | print('episode: {}\t curr_score: {}\t avg score: {}'.format(i, score, avg_score))
186 |
187 |
188 | env.close()
189 |
190 | if __name__ == '__main__':
191 | main()
--------------------------------------------------------------------------------
/Discrete_action/DDQN.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Feb 26 2021
3 | @author: wangmeng
4 | """
5 |
6 | import math, random
7 | import gym
8 | import numpy as np
9 | import torch
10 | import torch.nn as nn
11 | import torch.optim as optim
12 | import torch.autograd as autograd
13 | import torch.nn.functional as F
14 | from collections import Counter
15 | from collections import deque
16 | import matplotlib.pyplot as plt
17 |
18 | USE_CUDA = torch.cuda.is_available()
19 | # helper that moves Variables to CUDA when available
20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
21 |
22 | class DQN(nn.Module):
23 | def __init__(self, observation_space, action_sapce):
24 | super(DQN, self).__init__()
25 |
26 | self.observation_space = observation_space
27 | self.action_sapce = action_sapce
28 |
29 | self.layers = nn.Sequential(
30 | nn.Linear(observation_space,128),
31 | nn.ReLU(),
32 | nn.Linear(128,128),
33 | nn.ReLU(),
34 | nn.Linear(128, action_sapce)
35 | )
36 |
37 |
38 | def forward(self, x):
39 | return self.layers(x)
40 |
41 | def act(self, state, epsilon):
42 | if random.random() > epsilon:
43 | # if running on a GPU, the input state has to be moved there as well
44 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)  # volatile=True disabled autograd for inference in old PyTorch; newer code would use torch.no_grad()
45 | # unsqueeze(0) adds a batch dimension of size 1
46 | q_value = self.forward(state)
47 | action = q_value.max(1)[1].data[0]
48 | # max(1) returns (values, indices) of the row-wise maximum; max(0) would do the same column-wise
49 | # max(1)[1] gives the argmax indices, max(1)[0] the maximum values themselves
50 |
51 | action = action.cpu().numpy()  # the network returns a tensor; move it back to the CPU and convert it, since gym expects a plain int
52 | action = int(action)
53 | else:
54 | action = random.randrange(self.action_sapce)  # random action index in [0, action_space)
55 | return action
56 |
57 | class ReplayBuffer(object):
58 | def __init__(self, capacity):
59 | # deque comes from the standard library collections module: a double-ended queue that can be
60 | # appended to or popped from at either end. With maxlen set, the oldest entries are dropped as new ones arrive.
61 | self.buffer = deque(maxlen = capacity)
62 |
63 | def push(self, state, action, reward, next_state, done):
64 | state = np.expand_dims(state,0)
65 | # the extra dimension makes it easy to stitch samples together later with np.concatenate
66 | next_state = np.expand_dims(next_state,0)
67 | self.buffer.append((state, action, reward, next_state, done))
68 |
69 | def sample(self, batch_size):
70 | # zip(*...) unpacks the sampled transitions and regroups matching fields into tuples
71 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
72 | # concatenate stitches the per-sample arrays into one batch array (dropping the added dimension)
73 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
74 |
75 |
76 | def compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size):
77 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
78 | # move everything to the GPU when available
79 | state = Variable(torch.FloatTensor(np.float32(state)))
80 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
81 | action = Variable(torch.LongTensor(action))
82 | reward = Variable(torch.FloatTensor(reward))
83 | done = Variable(torch.FloatTensor(done))
84 |
85 | q_values = current_model(state)
86 | next_q_values = target_model(next_state)
87 |
88 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
89 | # gather looks up, for each row of q_values, the Q-value of the action that was actually taken;
90 | # the output has the same shape as action.unsqueeze(1).
91 | # The dimension is added only so that gather works, then squeezed away again.
92 | next_q_value = next_q_values.gather(1, current_model(next_state).max(1)[1].unsqueeze(1)).squeeze(1)  # Double DQN: the online network selects the next action, the target network evaluates it
93 |
94 | expected_q_value = reward + gamma * next_q_value * (1 - done)
95 |
96 | loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()
97 |
98 | optimizer.zero_grad()
99 | loss.backward()
100 | optimizer.step()
101 |
102 | return loss
103 |
104 |
105 | def update_target(current_model, target_model):
106 | target_model.load_state_dict(current_model.state_dict())
107 |
108 |
109 | def main():
110 | env_id = "CartPole-v0"
111 | env = gym.make(env_id)
112 |
113 | observation_space = env.observation_space.shape[0]
114 | action_sapce = env.action_space.n
115 |
116 | ####################### changes vs. plain DQN ###############################
117 | current_model = DQN (observation_space, action_sapce)
118 | target_model = DQN(observation_space, action_sapce)
119 |
120 | if USE_CUDA:
121 | current_model = current_model.cuda()
122 | target_model = target_model.cuda()
123 |
124 | update_target(current_model, target_model)
125 |
126 | optimizer = optim.Adam(current_model.parameters())
127 | #############################################################
128 | replay_buffer = ReplayBuffer(1000)
129 |
130 | batch_size = 32
131 | gamma = 0.99
132 |
133 | num_frames = 10000
134 |
135 | losses = []
136 | all_rewards = []
137 | x_axis1 = []
138 | x_axis2 = []
139 | episode_reward = 0
140 |
141 | epsilon_start = 1.0
142 | epsilon_final = 0.01
143 | epsilon_decay = 500
144 |
145 | # the exploration rate decays as the number of frames grows
146 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. * frame_idx / epsilon_decay)
147 |
148 | state = env.reset()
149 | for frame_idx in range(1, num_frames + 1):
150 | # render the environment (disabled)
151 | #env.render()
152 | epsilon = epsilon_by_frame(frame_idx)
153 | action = current_model.act(state, epsilon)
154 | next_state, reward, done, _ = env.step(action)
155 | replay_buffer.push(state, action, reward, next_state, done)
156 | state = next_state
157 | episode_reward += reward
158 |
159 | if done:
160 | state = env.reset()
161 | x_axis1.append(frame_idx)
162 | all_rewards.append(episode_reward)
163 | episode_reward = 0
164 |
165 | if frame_idx+1 > batch_size:
166 | x_axis2.append(frame_idx)
167 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size)
168 | losses.append(np.array(loss.data.cpu()))
169 |
170 | ######################### changes vs. plain DQN #############################
171 | if frame_idx % 100 == 0:
172 | update_target(current_model, target_model)
173 |
174 |
175 |
176 | if frame_idx % 200 == 0:
177 | plt.figure(1)
178 | plt.subplot(121)
179 | plt.plot(x_axis1, all_rewards)
180 | plt.subplot(122)
181 | plt.plot(x_axis2, losses)
182 | plt.show()
183 |
184 |
185 | if __name__ == '__main__':
186 | main()
--------------------------------------------------------------------------------
/Discrete_action/DQN.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Feb 26 2021
3 | @author: wangmeng
4 | """
5 |
6 | import math, random
7 | import gym
8 | import numpy as np
9 | import torch
10 | import torch.nn as nn
11 | import torch.optim as optim
12 | import torch.autograd as autograd
13 | import torch.nn.functional as F
14 | from collections import Counter
15 | from collections import deque
16 | import matplotlib.pyplot as plt
17 |
18 | USE_CUDA = torch.cuda.is_available()
19 | # helper that moves Variables to CUDA when available
20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
21 |
22 | class DQN(nn.Module):
23 | def __init__(self, observation_space, action_sapce):
24 | super(DQN, self).__init__()
25 |
26 | self.observation_space = observation_space
27 | self.action_sapce = action_sapce
28 |
29 | self.layers = nn.Sequential(
30 | nn.Linear(observation_space,128),
31 | nn.ReLU(),
32 | nn.Linear(128,128),
33 | nn.ReLU(),
34 | nn.Linear(128, action_sapce)
35 | )
36 |
37 |
38 | def forward(self, x):
39 | return self.layers(x)
40 |
41 | def act(self, state, epsilon):
42 | if random.random() > epsilon:
43 | # if running on a GPU, the input state has to be moved there as well
44 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)  # volatile=True disabled autograd for inference in old PyTorch; newer code would use torch.no_grad()
45 | # unsqueeze(0) adds a batch dimension of size 1
46 | q_value = self.forward(state)
47 | action = q_value.max(1)[1].data[0]
48 | # max(1) returns (values, indices) of the row-wise maximum; max(0) would do the same column-wise
49 | # max(1)[1] gives the argmax indices, max(1)[0] the maximum values themselves
50 |
51 | action = action.cpu().numpy()  # the network returns a tensor; move it back to the CPU and convert it, since gym expects a plain int
52 | action = int(action)
53 | else:
54 | action = random.randrange(self.action_sapce)  # random action index in [0, action_space)
55 | return action
56 |
57 | class ReplayBuffer(object):
58 | def __init__(self, capacity):
59 | # deque comes from the standard library collections module: a double-ended queue that can be
60 | # appended to or popped from at either end. With maxlen set, the oldest entries are dropped as new ones arrive.
61 | self.buffer = deque(maxlen = capacity)
62 |
63 | def push(self, state, action, reward, next_state, done):
64 | state = np.expand_dims(state,0)
65 | # the extra dimension makes it easy to stitch samples together later with np.concatenate
66 | next_state = np.expand_dims(next_state,0)
67 | self.buffer.append((state, action, reward, next_state, done))
68 |
69 | def sample(self, batch_size):
70 | # zip(*...) unpacks the sampled transitions and regroups matching fields into tuples
71 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
72 | # concatenate stitches the per-sample arrays into one batch array (dropping the added dimension)
73 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
74 |
75 |
76 | def compute_td_loss(model,optimizer, replay_buffer, gamma, batch_size):
77 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
78 | # move everything to the GPU when available
79 | state = Variable(torch.FloatTensor(np.float32(state)))
80 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
81 | action = Variable(torch.LongTensor(action))
82 | reward = Variable(torch.FloatTensor(reward))
83 | done = Variable(torch.FloatTensor(done))
84 |
85 | q_values = model(state)
86 | next_q_values = model(next_state)
87 |
88 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
89 | # gather looks up, for each row of q_values, the Q-value of the action that was actually taken;
90 | # the output has the same shape as action.unsqueeze(1).
91 | # The dimension is added only so that gather works, then squeezed away again.
92 | next_q_value = next_q_values.max(1)[0]
93 |
94 | expected_q_value = reward + gamma * next_q_value * (1 - done)
95 |
96 | loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()
97 |
98 | optimizer.zero_grad()
99 | loss.backward()
100 | optimizer.step()
101 |
102 | return loss
103 |
104 |
105 |
106 |
107 | def main():
108 | env_id = "CartPole-v0"
109 | env = gym.make(env_id)
110 |
111 | observation_space = env.observation_space.shape[0]
112 | action_sapce = env.action_space.n
113 |
114 | model = DQN (observation_space, action_sapce)
115 |
116 | if USE_CUDA:
117 | model = model.cuda()
118 |
119 | optimizer = optim.Adam(model.parameters())
120 |
121 | replay_buffer = ReplayBuffer(1000)
122 |
123 | batch_size = 32
124 | gamma = 0.99
125 |
126 | num_frames = 10000
127 |
128 | losses = []
129 | all_rewards = []
130 | x_axis1 = []
131 | x_axis2 = []
132 | episode_reward = 0
133 |
134 | epsilon_start = 1.0
135 | epsilon_final = 0.01
136 | epsilon_decay = 500
137 |
138 | # the exploration rate decays as the number of frames grows
139 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. * frame_idx / epsilon_decay)
140 |
141 | state = env.reset()
142 | for frame_idx in range(1, num_frames + 1):
143 | # render the environment (disabled)
144 | #env.render()
145 | epsilon = epsilon_by_frame(frame_idx)
146 | action = model.act(state, epsilon)
147 | next_state, reward, done, _ = env.step(action)
148 | replay_buffer.push(state, action, reward, next_state, done)
149 | state = next_state
150 | episode_reward += reward
151 |
152 | if done:
153 | state = env.reset()
154 | x_axis1.append(frame_idx)
155 | all_rewards.append(episode_reward)
156 | episode_reward = 0
157 |
158 | if frame_idx+1 > batch_size:
159 | x_axis2.append(frame_idx)
160 | loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
161 | losses.append(np.array(loss.data.cpu()))
162 |
163 |
164 |
165 | if frame_idx % 200 == 0:
166 | plt.figure(1)
167 | plt.subplot(121)
168 | plt.plot(x_axis1, all_rewards)
169 | plt.subplot(122)
170 | plt.plot(x_axis2, losses)
171 | plt.show()
172 |
173 |
174 | if __name__ == '__main__':
175 | main()
--------------------------------------------------------------------------------
/Discrete_action/Noise DQN.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Mar 5 2021
3 | @author: wangmeng
4 |
5 | NoisyNet adds parameterised noise to the network weights to drive exploration in reinforcement learning.
6 | The noise parameters are learned by gradient descent; the method is very easy to implement, adds only a little
7 | compute, and works well on top of A3C and DQN. The core idea is simply to perturb the weights with learnable noise so that exploration happens in parameter space.
8 | """
9 |
10 | import math, random
11 | import gym
12 | import numpy as np
13 | import torch
14 | import torch.nn as nn
15 | import torch.optim as optim
16 | import torch.autograd as autograd
17 | import torch.nn.functional as F
18 | from collections import Counter
19 | from collections import deque
20 | import matplotlib.pyplot as plt
21 | from replay_buffer import *
22 |
23 | USE_CUDA = torch.cuda.is_available()
24 | # helper that moves Variables to CUDA when available
25 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
26 |
27 | # a linear layer whose weights and biases carry learnable noise
28 | class NoisyLinear(nn.Module):
29 | def __init__(self, in_features, out_features, std_init=0.4):
30 | super(NoisyLinear, self).__init__()
31 |
32 | self.in_features = in_features
33 | self.out_features = out_features
34 | self.std_init = std_init
35 |
36 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features))
37 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features))
38 | # register_buffer adds persistent state that is not a model parameter (like BatchNorm's running_mean);
39 | # buffers are saved with the module and can be accessed as attributes under the given name.
40 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features))
41 |
42 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
43 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
44 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))
45 |
46 | self.reset_parameters()
47 | self.reset_noise()
48 |
49 | def forward(self,x):
50 | if self.training:
51 | weight = self.weight_mu + self.weight_sigma.mul(Variable(self.weight_epsilon))
52 | bias = self.bias_mu + self.bias_sigma.mul(Variable(self.bias_epsilon))
53 | else:
54 | weight = self.weight_mu
55 | bias = self.bias_mu
56 | return F.linear(x, weight, bias)
57 |
58 | def reset_parameters(self):
59 | mu_range = 1 / math.sqrt(self.weight_mu.size(1))
60 |
61 | self.weight_mu.data.uniform_(-mu_range, mu_range)
62 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
63 |
64 | self.bias_mu.data.uniform_(-mu_range, mu_range)
65 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
66 |
67 | def reset_noise(self):
68 | epsilon_in = self._scale_noise(self.in_features)
69 | epsilon_out = self._scale_noise(self.out_features)
70 |
71 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
72 | self.bias_epsilon.copy_(self._scale_noise(self.out_features))
73 |
74 | def _scale_noise(self, size):
75 | x = torch.randn(size)
76 | x = x.sign().mul(x.abs().sqrt())
77 | return x
78 |
79 | class NoisyDQN(nn.Module):
80 | def __init__(self, observation_space, action_sapce):
81 | super(NoisyDQN, self).__init__()
82 |
83 | self.linear = nn.Linear(observation_space, 128)
84 | self.noisy1 = NoisyLinear(128, 128)
85 | self.noisy2 = NoisyLinear(128, action_sapce)
86 | def forward(self, x):
87 | x = F.relu(self.linear(x))
88 | x = F.relu(self.noisy1(x))
89 | x = self.noisy2(x)
90 | return x
91 | def act(self, state):
92 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile = True)
93 | q_value = self.forward(state)
94 | action = q_value.max(1)[1].data[0]
95 | action = action.cpu().numpy()  # the network returns a tensor; move it back to the CPU and convert it, since gym expects a plain int
96 | action = int(action)
97 | return action
98 |
99 | def reset_noise(self):
100 | self.noisy1.reset_noise()
101 | self.noisy2.reset_noise()
102 |
103 |
104 | class ReplayBuffer(object):
105 | def __init__(self, capacity):
106 | # deque comes from the standard library collections module: a double-ended queue that can be
107 | # appended to or popped from at either end. With maxlen set, the oldest entries are dropped as new ones arrive.
108 | self.buffer = deque(maxlen = capacity)
109 |
110 | def push(self, state, action, reward, next_state, done):
111 | state = np.expand_dims(state,0)
112 | # the extra dimension makes it easy to stitch samples together later with np.concatenate
113 | next_state = np.expand_dims(next_state,0)
114 | self.buffer.append((state, action, reward, next_state, done))
115 |
116 | def sample(self, batch_size):
117 | # zip(*...) unpacks the sampled transitions and regroups matching fields into tuples
118 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
119 | # concatenate stitches the per-sample arrays into one batch array (dropping the added dimension)
120 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
121 |
122 |
123 | def compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta):
124 | state, action, reward, next_state, done, weights, indices = replay_buffer.sample(batch_size, beta)
125 |
126 | state = Variable(torch.FloatTensor(np.float32(state)))
127 | next_state = Variable(torch.FloatTensor(np.float32(next_state)))
128 | action = Variable(torch.LongTensor(action))
129 | reward = Variable(torch.FloatTensor(reward))
130 | done = Variable(torch.FloatTensor(np.float32(done)))
131 | weights = Variable(torch.FloatTensor(weights))
132 |
133 | q_values = current_model(state)
134 | next_q_values = target_model(next_state)
135 |
136 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
137 | # gather looks up, for each row of q_values, the Q-value of the action that was actually taken;
138 | # the output has the same shape as action.unsqueeze(1).
139 | # The dimension is added only so that gather works, then squeezed away again.
140 | next_q_value = next_q_values.max(1)[0]
141 |
142 | expected_q_value = reward + gamma * next_q_value * (1 - done)
143 |
144 | loss = (q_value - expected_q_value.detach()).pow(2) * weights
145 | prios = loss + 1e-5
146 | loss = loss.mean()
147 |
148 | optimizer.zero_grad()
149 | loss.backward()
150 | optimizer.step()
151 |
152 | replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
153 | current_model.reset_noise()
154 | target_model.reset_noise()
155 |
156 | return loss
157 |
158 | def update_target(current_model, target_model):
159 | target_model.load_state_dict(current_model.state_dict())  # copy the online network's weights into the target network
160 |
161 | def plot(frame_idx, rewards, losses):
162 | plt.figure(figsize=(20,5))
163 | plt.subplot(131)
164 | plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:])))
165 | plt.plot(rewards)
166 | plt.subplot(132)
167 | plt.title('loss')
168 | plt.plot(losses)
169 | plt.show()
170 |
171 | def main():
172 | env_id = "CartPole-v0"
173 | env = gym.make(env_id)
174 |
175 | observation_space = env.observation_space.shape[0]
176 | action_sapce = env.action_space.n
177 |
178 | current_model = NoisyDQN(observation_space, action_sapce)
179 | target_model = NoisyDQN(observation_space, action_sapce)
180 |
181 | if USE_CUDA:
182 | current_model = current_model.cuda()
183 | target_model = target_model.cuda()
184 |
185 | optimizer = optim.Adam(current_model.parameters())
186 |
187 | beta_start = 0.4
188 | beta_frames = 1000
189 | beta_by_frame = lambda frame_idx: min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
190 |
191 |
192 | replay_buffer = PrioritizedReplayBuffer(10000, alpha=0.6)
193 |
194 | update_target(current_model, target_model)
195 |
196 | num_frames = 10000
197 | batch_size = 32
198 | gamma = 0.99
199 |
200 | losses = []
201 | all_rewards = []
202 | episode_reward = 0
203 |
204 | state = env.reset()
205 | for frame_idx in range(1, num_frames + 1):
206 | # render the environment (disabled)
207 | #env.render()
208 | action = current_model.act(state)
209 |
210 | next_state, reward, done, _ = env.step(action)
211 | replay_buffer.push(state, action, reward, next_state, done)
212 |
213 | state = next_state
214 | episode_reward += reward
215 |
216 | if done:
217 | state = env.reset()
218 | all_rewards.append(episode_reward)
219 | episode_reward = 0
220 |
221 | if len(replay_buffer) > batch_size:
222 | beta = beta_by_frame(frame_idx)
223 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta)
224 | losses.append(np.array(loss.data.cpu()))
225 |
226 | if frame_idx % 200 == 0:
227 | plot(frame_idx, all_rewards, losses)
228 |
229 | if frame_idx % 1000 == 0:
230 | update_target(current_model, target_model)
231 |
232 |
233 | if __name__ == '__main__':
234 | main()
--------------------------------------------------------------------------------
/Discrete_action/__pycache__/multiprocessing_env.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mengwanglalala/RL-algorithms/97f5b3e3b570ecb3c88ecf5f1ade148552103071/Discrete_action/__pycache__/multiprocessing_env.cpython-37.pyc
--------------------------------------------------------------------------------
/Discrete_action/__pycache__/replay_buffer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mengwanglalala/RL-algorithms/97f5b3e3b570ecb3c88ecf5f1ade148552103071/Discrete_action/__pycache__/replay_buffer.cpython-37.pyc
--------------------------------------------------------------------------------
/Discrete_action/multiprocessing_env.py:
--------------------------------------------------------------------------------
1 | # This code is from openai baseline
2 | # https://github.com/openai/baselines/tree/master/baselines/common/vec_env
3 |
4 | import numpy as np
5 | from multiprocessing import Process, Pipe
6 |
7 |
8 | def worker(remote, parent_remote, env_fn_wrapper):
9 | parent_remote.close()
10 | env = env_fn_wrapper.x()
11 | while True:
12 | cmd, data = remote.recv()
13 | if cmd == 'step':
14 | ob, reward, done, info = env.step(data)
15 | if done:
16 | ob = env.reset()
17 | remote.send((ob, reward, done, info))
18 | elif cmd == 'reset':
19 | ob = env.reset()
20 | remote.send(ob)
21 | elif cmd == 'reset_task':
22 | ob = env.reset_task()
23 | remote.send(ob)
24 | elif cmd == 'close':
25 | remote.close()
26 | break
27 | elif cmd == 'get_spaces':
28 | remote.send((env.observation_space, env.action_space))
29 | else:
30 | raise NotImplementedError
31 |
32 |
33 | class VecEnv(object):
34 | """
35 | An abstract asynchronous, vectorized environment.
36 | """
37 |
38 | def __init__(self, num_envs, observation_space, action_space):
39 | self.num_envs = num_envs
40 | self.observation_space = observation_space
41 | self.action_space = action_space
42 |
43 | def reset(self):
44 | """
45 | Reset all the environments and return an array of
46 | observations, or a tuple of observation arrays.
47 | If step_async is still doing work, that work will
48 | be cancelled and step_wait() should not be called
49 | until step_async() is invoked again.
50 | """
51 | pass
52 |
53 | def step_async(self, actions):
54 | """
55 | Tell all the environments to start taking a step
56 | with the given actions.
57 | Call step_wait() to get the results of the step.
58 | You should not call this if a step_async run is
59 | already pending.
60 | """
61 | pass
62 |
63 | def step_wait(self):
64 | """
65 | Wait for the step taken with step_async().
66 | Returns (obs, rews, dones, infos):
67 | - obs: an array of observations, or a tuple of
68 | arrays of observations.
69 | - rews: an array of rewards
70 | - dones: an array of "episode done" booleans
71 | - infos: a sequence of info objects
72 | """
73 | pass
74 |
75 | def close(self):
76 | """
77 | Clean up the environments' resources.
78 | """
79 | pass
80 |
81 | def step(self, actions):
82 | self.step_async(actions)
83 | return self.step_wait()
84 |
85 |
86 | class CloudpickleWrapper(object):
87 | """
88 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
89 | """
90 |
91 | def __init__(self, x):
92 | self.x = x
93 |
94 | def __getstate__(self):
95 | import cloudpickle
96 | return cloudpickle.dumps(self.x)
97 |
98 | def __setstate__(self, ob):
99 | import pickle
100 | self.x = pickle.loads(ob)
101 |
102 |
103 | class SubprocVecEnv(VecEnv):
104 | def __init__(self, env_fns, spaces=None):
105 | """
106 | envs: list of gym environments to run in subprocesses
107 | """
108 | self.waiting = False
109 | self.closed = False
110 | nenvs = len(env_fns)
111 | self.nenvs = nenvs
112 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
113 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
114 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
115 | for p in self.ps:
116 | p.daemon = True # if the main process crashes, we should not cause things to hang
117 | p.start()
118 | for remote in self.work_remotes:
119 | remote.close()
120 |
121 | self.remotes[0].send(('get_spaces', None))
122 | observation_space, action_space = self.remotes[0].recv()
123 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
124 |
125 | def step_async(self, actions):
126 | for remote, action in zip(self.remotes, actions):
127 | remote.send(('step', action))
128 | self.waiting = True
129 |
130 | def step_wait(self):
131 | results = [remote.recv() for remote in self.remotes]
132 | self.waiting = False
133 | obs, rews, dones, infos = zip(*results)
134 | return np.stack(obs), np.stack(rews), np.stack(dones), infos
135 |
136 | def reset(self):
137 | for remote in self.remotes:
138 | remote.send(('reset', None))
139 | return np.stack([remote.recv() for remote in self.remotes])
140 |
141 | def reset_task(self):
142 | for remote in self.remotes:
143 | remote.send(('reset_task', None))
144 | return np.stack([remote.recv() for remote in self.remotes])
145 |
146 | def close(self):
147 | if self.closed:
148 | return
149 | if self.waiting:
150 | for remote in self.remotes:
151 | remote.recv()
152 | for remote in self.remotes:
153 | remote.send(('close', None))
154 | for p in self.ps:
155 | p.join()
156 | self.closed = True
157 |
158 | def __len__(self):
159 | return self.nenvs
--------------------------------------------------------------------------------
/Discrete_action/replay_buffer.py:
--------------------------------------------------------------------------------
1 | #code from openai
2 | #https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
3 |
4 | import numpy as np
5 | import random
6 |
7 | import operator
8 |
9 |
10 | class SegmentTree(object):
11 | def __init__(self, capacity, operation, neutral_element):
12 | """Build a Segment Tree data structure.
13 | https://en.wikipedia.org/wiki/Segment_tree
14 | Can be used as regular array, but with two
15 | important differences:
16 | a) setting item's value is slightly slower.
17 | It is O(lg capacity) instead of O(1).
18 | b) user has access to an efficient `reduce`
19 | operation which reduces `operation` over
20 | a contiguous subsequence of items in the
21 | array.
22 | Parameters
23 | ---------
24 | capacity: int
25 | Total size of the array - must be a power of two.
26 | operation: lambda obj, obj -> obj
27 | an operation for combining elements (e.g. sum, max); it must
28 | form a mathematical group together with the set of
29 | possible values for array elements.
30 | neutral_element: obj
31 | neutral element for the operation above. eg. float('-inf')
32 | for max and 0 for sum.
33 | """
34 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
35 | self._capacity = capacity
36 | self._value = [neutral_element for _ in range(2 * capacity)]
37 | self._operation = operation
38 |
39 | def _reduce_helper(self, start, end, node, node_start, node_end):
40 | if start == node_start and end == node_end:
41 | return self._value[node]
42 | mid = (node_start + node_end) // 2
43 | if end <= mid:
44 | return self._reduce_helper(start, end, 2 * node, node_start, mid)
45 | else:
46 | if mid + 1 <= start:
47 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
48 | else:
49 | return self._operation(
50 | self._reduce_helper(start, mid, 2 * node, node_start, mid),
51 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
52 | )
53 |
54 | def reduce(self, start=0, end=None):
55 | """Returns result of applying `self.operation`
56 | to a contiguous subsequence of the array.
57 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end])))
58 | Parameters
59 | ----------
60 | start: int
61 | beginning of the subsequence
62 | end: int
63 | end of the subsequences
64 | Returns
65 | -------
66 | reduced: obj
67 | result of reducing self.operation over the specified range of array elements.
68 | """
69 | if end is None:
70 | end = self._capacity
71 | if end < 0:
72 | end += self._capacity
73 | end -= 1
74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1)
75 |
76 | def __setitem__(self, idx, val):
77 | # index of the leaf
78 | idx += self._capacity
79 | self._value[idx] = val
80 | idx //= 2
81 | while idx >= 1:
82 | self._value[idx] = self._operation(
83 | self._value[2 * idx],
84 | self._value[2 * idx + 1]
85 | )
86 | idx //= 2
87 |
88 | def __getitem__(self, idx):
89 | assert 0 <= idx < self._capacity
90 | return self._value[self._capacity + idx]
91 |
92 |
93 | class SumSegmentTree(SegmentTree):
94 | def __init__(self, capacity):
95 | super(SumSegmentTree, self).__init__(
96 | capacity=capacity,
97 | operation=operator.add,
98 | neutral_element=0.0
99 | )
100 |
101 | def sum(self, start=0, end=None):
102 | """Returns arr[start] + ... + arr[end]"""
103 | return super(SumSegmentTree, self).reduce(start, end)
104 |
105 | def find_prefixsum_idx(self, prefixsum):
106 | """Find the highest index `i` in the array such that
107 | arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum
108 | if array values are probabilities, this function
109 | allows to sample indexes according to the discrete
110 | probability efficiently.
111 | Parameters
112 | ----------
113 | prefixsum: float
114 | upperbound on the sum of array prefix
115 | Returns
116 | -------
117 | idx: int
118 | highest index satisfying the prefixsum constraint
119 | """
120 | assert 0 <= prefixsum <= self.sum() + 1e-5
121 | idx = 1
122 | while idx < self._capacity: # while non-leaf
123 | if self._value[2 * idx] > prefixsum:
124 | idx = 2 * idx
125 | else:
126 | prefixsum -= self._value[2 * idx]
127 | idx = 2 * idx + 1
128 | return idx - self._capacity
129 |
130 |
131 | class MinSegmentTree(SegmentTree):
132 | def __init__(self, capacity):
133 | super(MinSegmentTree, self).__init__(
134 | capacity=capacity,
135 | operation=min,
136 | neutral_element=float('inf')
137 | )
138 |
139 | def min(self, start=0, end=None):
140 | """Returns min(arr[start], ..., arr[end])"""
141 |
142 | return super(MinSegmentTree, self).reduce(start, end)
143 |
144 |
145 | class ReplayBuffer(object):
146 | def __init__(self, size):
147 | """Create Replay buffer.
148 | Parameters
149 | ----------
150 | size: int
151 | Max number of transitions to store in the buffer. When the buffer
152 | overflows the old memories are dropped.
153 | """
154 | self._storage = []
155 | self._maxsize = size
156 | self._next_idx = 0
157 |
158 | def __len__(self):
159 | return len(self._storage)
160 |
161 | def push(self, state, action, reward, next_state, done):
162 | data = (state, action, reward, next_state, done)
163 |
164 | if self._next_idx >= len(self._storage):
165 | self._storage.append(data)
166 | else:
167 | self._storage[self._next_idx] = data
168 | self._next_idx = (self._next_idx + 1) % self._maxsize
169 |
170 | def _encode_sample(self, idxes):
171 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
172 | for i in idxes:
173 | data = self._storage[i]
174 | obs_t, action, reward, obs_tp1, done = data
175 | obses_t.append(np.array(obs_t, copy=False))
176 | actions.append(np.array(action, copy=False))
177 | rewards.append(reward)
178 | obses_tp1.append(np.array(obs_tp1, copy=False))
179 | dones.append(done)
180 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)
181 |
182 | def sample(self, batch_size):
183 | """Sample a batch of experiences.
184 | Parameters
185 | ----------
186 | batch_size: int
187 | How many transitions to sample.
188 | Returns
189 | -------
190 | obs_batch: np.array
191 | batch of observations
192 | act_batch: np.array
193 | batch of actions executed given obs_batch
194 | rew_batch: np.array
195 | rewards received as results of executing act_batch
196 | next_obs_batch: np.array
197 | next set of observations seen after executing act_batch
198 | done_mask: np.array
199 | done_mask[i] = 1 if executing act_batch[i] resulted in
200 | the end of an episode and 0 otherwise.
201 | """
202 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
203 | return self._encode_sample(idxes)
204 |
205 |
206 | class PrioritizedReplayBuffer(ReplayBuffer):
207 | def __init__(self, size, alpha):
208 | """Create Prioritized Replay buffer.
209 | Parameters
210 | ----------
211 | size: int
212 | Max number of transitions to store in the buffer. When the buffer
213 | overflows the old memories are dropped.
214 | alpha: float
215 | how much prioritization is used
216 | (0 - no prioritization, 1 - full prioritization)
217 | See Also
218 | --------
219 | ReplayBuffer.__init__
220 | """
221 | super(PrioritizedReplayBuffer, self).__init__(size)
222 | assert alpha > 0
223 | self._alpha = alpha
224 |
225 | it_capacity = 1
226 | while it_capacity < size:
227 | it_capacity *= 2
228 |
229 | self._it_sum = SumSegmentTree(it_capacity)
230 | self._it_min = MinSegmentTree(it_capacity)
231 | self._max_priority = 1.0
232 |
233 | def push(self, *args, **kwargs):
234 | """See ReplayBuffer.store_effect"""
235 | idx = self._next_idx
236 | super(PrioritizedReplayBuffer, self).push(*args, **kwargs)
237 | self._it_sum[idx] = self._max_priority ** self._alpha
238 | self._it_min[idx] = self._max_priority ** self._alpha
239 |
240 | def _sample_proportional(self, batch_size):
241 | res = []
242 | for _ in range(batch_size):
243 | # TODO(szymon): should we ensure no repeats?
244 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
245 | idx = self._it_sum.find_prefixsum_idx(mass)
246 | res.append(idx)
247 | return res
248 |
249 | def sample(self, batch_size, beta):
250 | """Sample a batch of experiences.
251 | compared to ReplayBuffer.sample
252 | it also returns importance weights and idxes
253 | of sampled experiences.
254 | Parameters
255 | ----------
256 | batch_size: int
257 | How many transitions to sample.
258 | beta: float
259 | To what degree to use importance weights
260 | (0 - no corrections, 1 - full correction)
261 | Returns
262 | -------
263 | obs_batch: np.array
264 | batch of observations
265 | act_batch: np.array
266 | batch of actions executed given obs_batch
267 | rew_batch: np.array
268 | rewards received as results of executing act_batch
269 | next_obs_batch: np.array
270 | next set of observations seen after executing act_batch
271 | done_mask: np.array
272 | done_mask[i] = 1 if executing act_batch[i] resulted in
273 | the end of an episode and 0 otherwise.
274 | weights: np.array
275 | Array of shape (batch_size,) and dtype np.float32
276 | denoting importance weight of each sampled transition
277 | idxes: np.array
278 | Array of shape (batch_size,) and dtype np.int32
279 | indexes in buffer of sampled experiences
280 | """
281 | assert beta > 0
282 |
283 | idxes = self._sample_proportional(batch_size)
284 |
285 | weights = []
286 | p_min = self._it_min.min() / self._it_sum.sum()
287 | max_weight = (p_min * len(self._storage)) ** (-beta)
288 |
289 | for idx in idxes:
290 | p_sample = self._it_sum[idx] / self._it_sum.sum()
291 | weight = (p_sample * len(self._storage)) ** (-beta)
292 | weights.append(weight / max_weight)
293 | weights = np.array(weights)
294 | encoded_sample = self._encode_sample(idxes)
295 | return tuple(list(encoded_sample) + [weights, idxes])
296 |
297 | def update_priorities(self, idxes, priorities):
298 | """Update priorities of sampled transitions.
299 | sets priority of transition at index idxes[i] in buffer
300 | to priorities[i].
301 | Parameters
302 | ----------
303 | idxes: [int]
304 | List of idxes of sampled transitions
305 | priorities: [float]
306 | List of updated priorities corresponding to
307 | transitions at the sampled idxes denoted by
308 | variable `idxes`.
309 | """
310 | assert len(idxes) == len(priorities)
311 | for idx, priority in zip(idxes, priorities):
312 | assert priority > 0
313 | assert 0 <= idx < len(self._storage)
314 | self._it_sum[idx] = priority ** self._alpha
315 | self._it_min[idx] = priority ** self._alpha
316 |
317 | self._max_priority = max(self._max_priority, priority)
--------------------------------------------------------------------------------
/Discrete_action/wrappers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import deque
3 | import gym
4 | from gym import spaces
5 | import cv2
6 | cv2.ocl.setUseOpenCL(False)
7 |
8 | class NoopResetEnv(gym.Wrapper):
9 | def __init__(self, env, noop_max=30):
10 | """Sample initial states by taking random number of no-ops on reset.
11 | No-op is assumed to be action 0.
12 | """
13 | gym.Wrapper.__init__(self, env)
14 | self.noop_max = noop_max
15 | self.override_num_noops = None
16 | self.noop_action = 0
17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
18 |
19 | def reset(self, **kwargs):
20 | """ Do no-op action for a number of steps in [1, noop_max]."""
21 | self.env.reset(**kwargs)
22 | if self.override_num_noops is not None:
23 | noops = self.override_num_noops
24 | else:
25 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
26 | assert noops > 0
27 | obs = None
28 | for _ in range(noops):
29 | obs, _, done, _ = self.env.step(self.noop_action)
30 | if done:
31 | obs = self.env.reset(**kwargs)
32 | return obs
33 |
34 | def step(self, ac):
35 | return self.env.step(ac)
36 |
37 | class FireResetEnv(gym.Wrapper):
38 | def __init__(self, env):
39 | """Take action on reset for environments that are fixed until firing."""
40 | gym.Wrapper.__init__(self, env)
41 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
42 | assert len(env.unwrapped.get_action_meanings()) >= 3
43 |
44 | def reset(self, **kwargs):
45 | self.env.reset(**kwargs)
46 | obs, _, done, _ = self.env.step(1)
47 | if done:
48 | self.env.reset(**kwargs)
49 | obs, _, done, _ = self.env.step(2)
50 | if done:
51 | self.env.reset(**kwargs)
52 | return obs
53 |
54 | def step(self, ac):
55 | return self.env.step(ac)
56 |
57 | class EpisodicLifeEnv(gym.Wrapper):
58 | def __init__(self, env):
59 | """Make end-of-life == end-of-episode, but only reset on true game over.
60 | Done by DeepMind for the DQN and co. since it helps value estimation.
61 | """
62 | gym.Wrapper.__init__(self, env)
63 | self.lives = 0
64 | self.was_real_done = True
65 |
66 | def step(self, action):
67 | obs, reward, done, info = self.env.step(action)
68 | self.was_real_done = done
69 | # check current lives, make loss of life terminal,
70 | # then update lives to handle bonus lives
71 | lives = self.env.unwrapped.ale.lives()
72 | if lives < self.lives and lives > 0:
73 | # for Qbert sometimes we stay in lives == 0 condition for a few frames
74 | # so it's important to keep lives > 0, so that we only reset once
75 | # the environment advertises done.
76 | done = True
77 | self.lives = lives
78 | return obs, reward, done, info
79 |
80 | def reset(self, **kwargs):
81 | """Reset only when lives are exhausted.
82 | This way all states are still reachable even though lives are episodic,
83 | and the learner need not know about any of this behind-the-scenes.
84 | """
85 | if self.was_real_done:
86 | obs = self.env.reset(**kwargs)
87 | else:
88 | # no-op step to advance from terminal/lost life state
89 | obs, _, _, _ = self.env.step(0)
90 | self.lives = self.env.unwrapped.ale.lives()
91 | return obs
92 |
93 | class MaxAndSkipEnv(gym.Wrapper):
94 | def __init__(self, env, skip=4):
95 | """Return only every `skip`-th frame"""
96 | gym.Wrapper.__init__(self, env)
97 | # most recent raw observations (for max pooling across time steps)
98 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
99 | self._skip = skip
100 |
101 | def reset(self):
102 | return self.env.reset()
103 |
104 | def step(self, action):
105 | """Repeat action, sum reward, and max over last observations."""
106 | total_reward = 0.0
107 | done = None
108 | for i in range(self._skip):
109 | obs, reward, done, info = self.env.step(action)
110 | if i == self._skip - 2: self._obs_buffer[0] = obs
111 | if i == self._skip - 1: self._obs_buffer[1] = obs
112 | total_reward += reward
113 | if done:
114 | break
115 | # Note that the observation on the done=True frame
116 | # doesn't matter
117 | max_frame = self._obs_buffer.max(axis=0)
118 |
119 | return max_frame, total_reward, done, info
120 |
121 | def reset(self, **kwargs):
122 | return self.env.reset(**kwargs)
123 |
124 | class ClipRewardEnv(gym.RewardWrapper):
125 | def __init__(self, env):
126 | gym.RewardWrapper.__init__(self, env)
127 |
128 | def reward(self, reward):
129 | """Bin reward to {+1, 0, -1} by its sign."""
130 | return np.sign(reward)
131 |
132 | class WarpFrame(gym.ObservationWrapper):
133 | def __init__(self, env):
134 | """Warp frames to 84x84 as done in the Nature paper and later work."""
135 | gym.ObservationWrapper.__init__(self, env)
136 | self.width = 84
137 | self.height = 84
138 | self.observation_space = spaces.Box(low=0, high=255,
139 | shape=(self.height, self.width, 1), dtype=np.uint8)
140 |
141 | def observation(self, frame):
142 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
143 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
144 | return frame[:, :, None]
145 |
146 | class FrameStack(gym.Wrapper):
147 | def __init__(self, env, k):
148 | """Stack k last frames.
149 | Returns lazy array, which is much more memory efficient.
150 | See Also
151 | --------
152 | baselines.common.atari_wrappers.LazyFrames
153 | """
154 | gym.Wrapper.__init__(self, env)
155 | self.k = k
156 | self.frames = deque([], maxlen=k)
157 | shp = env.observation_space.shape
158 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
159 |
160 | def reset(self):
161 | ob = self.env.reset()
162 | for _ in range(self.k):
163 | self.frames.append(ob)
164 | return self._get_ob()
165 |
166 | def step(self, action):
167 | ob, reward, done, info = self.env.step(action)
168 | self.frames.append(ob)
169 | return self._get_ob(), reward, done, info
170 |
171 | def _get_ob(self):
172 | assert len(self.frames) == self.k
173 | return LazyFrames(list(self.frames))
174 |
175 | class ScaledFloatFrame(gym.ObservationWrapper):
176 | def __init__(self, env):
177 | gym.ObservationWrapper.__init__(self, env)
178 |
179 | def observation(self, observation):
180 | # careful! This undoes the memory optimization, use
181 | # with smaller replay buffers only.
182 | return np.array(observation).astype(np.float32) / 255.0
183 |
184 | class LazyFrames(object):
185 | def __init__(self, frames):
186 | """This object ensures that common frames between the observations are only stored once.
187 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
188 | buffers.
189 | This object should only be converted to numpy array before being passed to the model.
190 | You'd not believe how complex the previous solution was."""
191 | self._frames = frames
192 | self._out = None
193 |
194 | def _force(self):
195 | if self._out is None:
196 | self._out = np.concatenate(self._frames, axis=2)
197 | self._frames = None
198 | return self._out
199 |
200 | def __array__(self, dtype=None):
201 | out = self._force()
202 | if dtype is not None:
203 | out = out.astype(dtype)
204 | return out
205 |
206 | def __len__(self):
207 | return len(self._force())
208 |
209 | def __getitem__(self, i):
210 | return self._force()[i]
211 |
212 | def make_atari(env_id):
213 | env = gym.make(env_id)
214 | assert 'NoFrameskip' in env.spec.id
215 | env = NoopResetEnv(env, noop_max=30)
216 | env = MaxAndSkipEnv(env, skip=4)
217 | return env
218 |
219 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
220 | """Configure environment for DeepMind-style Atari.
221 | """
222 | if episode_life:
223 | env = EpisodicLifeEnv(env)
224 | if 'FIRE' in env.unwrapped.get_action_meanings():
225 | env = FireResetEnv(env)
226 | env = WarpFrame(env)
227 | if scale:
228 | env = ScaledFloatFrame(env)
229 | if clip_rewards:
230 | env = ClipRewardEnv(env)
231 | if frame_stack:
232 | env = FrameStack(env, 4)
233 | return env
234 |
235 |
236 |
237 | class ImageToPyTorch(gym.ObservationWrapper):
238 |     """
239 |     Convert observations from (height, width, channels) to PyTorch's (channels, height, width) layout.
240 |     """
241 |     def __init__(self, env):
242 |         super(ImageToPyTorch, self).__init__(env)
243 |         old_shape = self.observation_space.shape
244 |         self.observation_space = gym.spaces.Box(low=0, high=255, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.uint8)
245 | 
246 |     def observation(self, observation):
247 |         return np.transpose(observation, (2, 0, 1))
248 |
249 |
250 | def wrap_pytorch(env):
251 | return ImageToPyTorch(env)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL-algorithms
2 | Implementations of some basic RL algorithms, updated from time to time.
3 | - [Discrete action space](#discrete-action-space)
4 | - [DQN](#DQN)
5 | - [DDQN](#DDQN)
6 | - [Dueling-DQN](#Dueling-DQN)
7 | - [D3QN](#D3QN)
8 | - [Noisy-DQN](#Noisy-DQN)
9 |
10 | - [Continuous action space](#continuous-action-space)
11 | - [DDPG](#DDPG)
12 | - [A3C](#A3C)
13 | - [PPO](#PPO)
14 | - [PPO+GAE](#PPO+GAE)
15 | - [SAC](#SAC)
16 | - [TD3](#TD3)
17 |
18 |
19 |
20 | ## Discrete action space
21 | ### DQN
22 | A good entry point into deep RL. It replaces the Q-table with a Q network that estimates Q-values, making the jump from discrete to continuous state spaces. The Q network estimates a Q-value for every discrete action and, at execution time, the action with the highest Q-value is chosen (greedy policy). An epsilon-greedy policy is used for exploration (with a small probability a random action is executed) so that training data covering a variety of actions is collected.
23 |
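As a rough illustration (not the code in `Discrete_action/DQN.py`), here is a minimal sketch of epsilon-greedy action selection and the one-step TD target; the toy network, state/action sizes, and hyperparameters are placeholders:

```python
import random
import torch
import torch.nn as nn

q_net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2))  # toy: 4-dim state, 2 actions

def select_action(state, epsilon=0.1):
    # epsilon-greedy: explore with small probability, otherwise act greedily w.r.t. Q
    if random.random() < epsilon:
        return random.randrange(2)
    with torch.no_grad():
        return q_net(torch.as_tensor(state, dtype=torch.float32)).argmax().item()

def td_target(reward, next_state, done, gamma=0.99):
    # bootstrap from the max Q-value of the next state unless the episode ended
    with torch.no_grad():
        next_q = q_net(torch.as_tensor(next_state, dtype=torch.float32)).max()
    return reward + gamma * (1.0 - float(done)) * next_q
```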
24 | ### DDQN
25 | (Double DQN) More stable than DQN: because the max operator propagates overestimation error, it trains two Q networks at the same time and uses the smaller of the two Q-values when computing the TD error, which reduces the overestimation.
26 |
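The standard Double DQN formulation realizes this idea by letting the online network pick the next action while the target network evaluates it; a minimal sketch with placeholder sizes (not necessarily how `DDQN.py` implements it):

```python
import torch
import torch.nn as nn

obs_dim, n_actions, gamma = 4, 2, 0.99
online_net = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, n_actions))
target_net = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, n_actions))
target_net.load_state_dict(online_net.state_dict())

def double_dqn_target(reward, next_state, done):
    # the online network selects the action, the target network evaluates it,
    # which breaks the max operator's tendency to propagate overestimation
    with torch.no_grad():
        next_state = torch.as_tensor(next_state, dtype=torch.float32)
        best_action = online_net(next_state).argmax()
        next_q = target_net(next_state)[best_action]
    return reward + gamma * (1.0 - float(done)) * next_q
```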
27 | ### Dueling-DQN
28 | Uses an advantage function (A3C uses one too): it estimates the value of the state alone, without considering the action; a good policy steers the state toward more advantageous situations. Since the action does not affect the state transition at every step, Dueling DQN combines the state-value estimate with the per-action Q-value estimates that plain DQN produces. DQN has to learn a Q-value for every discrete action of a state before it knows the value of that state, whereas Dueling DQN learns the state value directly through the advantage decomposition, so it can learn faster than DQN when some actions have little effect on the environment.
29 |
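A minimal sketch of the dueling head described above (sizes are placeholders): the network outputs a state value V(s) and per-action advantages A(s, a), which are combined into Q(s, a):

```python
import torch
import torch.nn as nn

class DuelingQNet(nn.Module):
    def __init__(self, obs_dim=4, n_actions=2, hidden=64):
        super().__init__()
        self.feature = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)               # V(s): value of the state alone
        self.advantage = nn.Linear(hidden, n_actions)   # A(s, a): per-action advantage

    def forward(self, x):
        h = self.feature(x)
        v, a = self.value(h), self.advantage(h)
        # subtract the mean advantage so V and A stay identifiable
        return v + a - a.mean(dim=-1, keepdim=True)

q_values = DuelingQNet()(torch.randn(1, 4))  # -> shape (1, 2)
```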
30 | ### D3QN
31 | Dueling DQN and Double DQN are compatible with each other and work very well together. Simple, general, and with no restrictions on when it can be used; any beginner can derive D3QN on their own by combining the previous two algorithms. If you use D3QN in a paper, you should cite both the Dueling DQN and the Double DQN papers.
32 |
33 | ### Noisy-DQN
34 | Slightly stronger exploration. Noisy DQN adds noise to the values just before the network's output layer, so actions whose Q-values were already large are also more likely to end up with larger Q-values after the noise is added. This kind of exploration beats epsilon-greedy's "just execute a random action": the targeted exploration keeps the explored actions diverse while also making exploration more efficient.
35 |
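A common way to implement this is a noisy linear layer whose weights are learned means plus learned noise scales, following the factorised-Gaussian NoisyNet idea. The sketch below uses placeholder sizes and is not copied from `Noise DQN.py`:

```python
import math
import torch
import torch.nn as nn

class NoisyLinear(nn.Module):
    """Factorised-Gaussian noisy layer (a sketch in the spirit of NoisyNet)."""
    def __init__(self, in_f, out_f, sigma0=0.5):
        super().__init__()
        self.in_f, self.out_f = in_f, out_f
        self.mu_w = nn.Parameter(torch.empty(out_f, in_f).uniform_(-1 / math.sqrt(in_f), 1 / math.sqrt(in_f)))
        self.sigma_w = nn.Parameter(torch.full((out_f, in_f), sigma0 / math.sqrt(in_f)))
        self.mu_b = nn.Parameter(torch.zeros(out_f))
        self.sigma_b = nn.Parameter(torch.full((out_f,), sigma0 / math.sqrt(in_f)))

    @staticmethod
    def _f(x):  # scaling used for factorised noise
        return x.sign() * x.abs().sqrt()

    def forward(self, x):
        # fresh noise each forward pass; the learned sigmas decide how much to explore
        eps_in, eps_out = self._f(torch.randn(self.in_f)), self._f(torch.randn(self.out_f))
        weight = self.mu_w + self.sigma_w * torch.outer(eps_out, eps_in)
        bias = self.mu_b + self.sigma_b * eps_out
        return nn.functional.linear(x, weight, bias)
```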
36 |
37 | ## Continuous action space
38 | ### DDPG
39 | DDPG (Deep DPG) is a good entry point for deep RL in continuous action spaces. DPG, the deterministic policy gradient algorithm, has the policy network output the action directly and was able to train usable policies on continuous-control tasks, but it explores with OU noise, a method with many hyperparameters, so training is slow and unstable.
40 |
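A minimal sketch of the two pieces described above, with placeholder network sizes: a deterministic actor trained to maximise the critic's Q-value, plus exploration noise added to the action (Gaussian here for simplicity, where the original paper uses OU noise):

```python
import torch
import torch.nn as nn

obs_dim, act_dim = 3, 1
actor = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, act_dim), nn.Tanh())
critic = nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))

def actor_loss(states):
    # deterministic policy gradient: push the actor toward actions the critic rates highly
    actions = actor(states)
    return -critic(torch.cat([states, actions], dim=-1)).mean()

def explore(state, noise_std=0.1):
    # add exploration noise to the deterministic action, then clip to the valid range
    with torch.no_grad():
        return (actor(state) + noise_std * torch.randn(act_dim)).clamp(-1.0, 1.0)
```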
41 | ### PPO
42 | (Proximal Policy Optimization) Stable training, easy to tune, and robust. PPO simplifies TRPO's trust-region computation; the term used in the paper is "surrogate objective". The variance of PPO's action noise is a trainable vector with the same shape as the action vector, rather than an output of the network, which further improves PPO's robustness.
43 |
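A minimal sketch of the clipped surrogate objective with a trainable log-std vector; the state/action sizes are placeholders and this is not taken from `PPO.py`:

```python
import torch
import torch.nn as nn

act_dim = 4
mu_net = nn.Sequential(nn.Linear(24, 64), nn.Tanh(), nn.Linear(64, act_dim))
log_std = nn.Parameter(torch.zeros(act_dim))  # trainable noise scale, shaped like the action vector

def ppo_clip_loss(states, actions, old_log_probs, advantages, clip_eps=0.2):
    dist = torch.distributions.Normal(mu_net(states), log_std.exp())
    log_probs = dist.log_prob(actions).sum(dim=-1)
    ratio = (log_probs - old_log_probs).exp()                        # pi_new / pi_old
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    return -torch.min(surr1, surr2).mean()                           # clipped surrogate objective
```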
44 | ### A3C
45 | (Asynchronous Advantage Actor-Critic) "Asynchronous" means launching multiple actors that explore the environment and update asynchronously. DDPG's critic is Q(s, a), estimating the value from the state-action pair; the advantage function uses only the state, which is a nice innovation that makes value estimation easier for stochastic policy-gradient algorithms. The advantage function has an obvious weakness, though: the action does not affect the state transition at every step (see Dueling DQN), so this algorithm is mainly worth studying as an introduction to the advantage function. If you see a new paper still using A3C, you should be skeptical of the authors' RL expertise. A3C has both discrete-action and continuous-action versions; A2C refers to the version without the asynchronous part.
46 |
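The advantage-weighted policy-gradient loss at the core of A3C/A2C looks roughly like this (a synchronous, discrete-action sketch; the asynchronous worker machinery is omitted and all sizes are placeholders):

```python
import torch
import torch.nn as nn

policy = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))    # logits over 2 actions
value_fn = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))

def a2c_loss(states, actions, returns, value_coef=0.5, entropy_coef=0.01):
    dist = torch.distributions.Categorical(logits=policy(states))
    values = value_fn(states).squeeze(-1)
    advantages = returns - values.detach()                    # advantage = return - V(s)
    policy_loss = -(dist.log_prob(actions) * advantages).mean()
    value_loss = (returns - values).pow(2).mean()
    return policy_loss + value_coef * value_loss - entropy_coef * dist.entropy().mean()
```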
47 | ### SAC
48 | (Soft Actor-Critic with maximum entropy) Trains fast and explores well, but it depends heavily on the reward function; unlike PPO, you cannot just throw together any reward function and expect it to train. PPO measures the difference between the new and old policies (the distance between the two distributions) and keeps this difference inside a trust region while not letting it become too small. SAC is not an on-policy algorithm, so the difference between new and old policies is hard to compute; instead it maximizes the policy's entropy during optimization (the larger the action variance, the higher the policy entropy).
49 | SAC can also be adapted to discrete action spaces. SAC-discrete is better suited to environments with a lot of uncertainty; in fairly deterministic environments it performs worse than Rainbow DQN.
50 |
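A minimal sketch of the entropy-regularised actor loss: sample an action with the reparameterisation trick and trade the critic's Q-value off against the policy's log-probability. For brevity this omits the tanh squashing and its log-prob correction that a full SAC uses; sizes and the fixed temperature are placeholders:

```python
import torch
import torch.nn as nn

obs_dim, act_dim, alpha = 3, 1, 0.2   # alpha is the entropy temperature
actor = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, 2 * act_dim))  # mean and log_std
critic = nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))

def sac_actor_loss(states):
    mean, log_std = actor(states).chunk(2, dim=-1)
    dist = torch.distributions.Normal(mean, log_std.clamp(-20, 2).exp())
    actions = dist.rsample()                                   # reparameterised sample keeps gradients
    log_prob = dist.log_prob(actions).sum(dim=-1, keepdim=True)
    q = critic(torch.cat([states, actions], dim=-1))
    # maximise Q plus policy entropy  <=>  minimise alpha * log_prob - Q
    return (alpha * log_prob - q).mean()
```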
51 | ### TD3
52 | TD3 (Twin Delayed DDPG) is only recommended if you are good at hyperparameter tuning, because it has many sensitive hyperparameters that affect training. It inherits twin critics from Double DQN to reduce overestimation error, and it uses a trick that looks a lot like stochastic policy gradients: when computing the Q-value used for the TD-error update, noise is added to the action so the critic fits a smoother Q-value function. TD3 proposes delaying target-network updates, i.e. updating the networks several times before soft-updating them into the target networks; in my opinion this does not help much, and later algorithms do not use this trick. TD3 also proposes adding noise to the action when computing the target Q-value to smooth the critic; used this way inside a deterministic policy, TD3 looks a lot like a "stochastic policy".
53 |
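A minimal sketch of the TD3 target computation: clipped noise on the target action (target policy smoothing) and the minimum over two target critics; all dimensions are placeholders and the delayed actor/target updates are only noted in a comment:

```python
import torch
import torch.nn as nn

gamma, policy_noise, noise_clip = 0.99, 0.2, 0.5
actor_target = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 1), nn.Tanh())
critic1_target = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 1))
critic2_target = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 1))

def td3_target(rewards, next_states, dones):
    with torch.no_grad():
        # target policy smoothing: clipped noise on the target action
        noise = (policy_noise * torch.randn(next_states.shape[0], 1)).clamp(-noise_clip, noise_clip)
        next_actions = (actor_target(next_states) + noise).clamp(-1.0, 1.0)
        sa = torch.cat([next_states, next_actions], dim=-1)
        # twin critics: take the smaller target Q to fight overestimation
        next_q = torch.min(critic1_target(sa), critic2_target(sa))
    return rewards + gamma * (1.0 - dones) * next_q

# in full TD3 the actor and the target networks are updated less frequently than the critics
```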
54 | ### PPO+GAE
55 | (Generalized Advantage Estimation) The most stable to train and the easiest to tune, well suited to high-dimensional states, but the environment must not contain too many random factors. GAE builds advantage estimates from experienced trajectories and then has the critic fit those values. With this adjustment, in environments with little randomness, only a few trajectories are needed to characterize the current policy. Although GAE can be combined with many RL algorithms, it works best with on-policy methods like PPO.
56 |
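A minimal sketch of how GAE turns a trajectory of rewards and value estimates into advantage estimates and critic targets (the tensor shapes are assumptions: 1-D tensors over the trajectory length):

```python
import torch

def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """Generalised Advantage Estimation over one trajectory (a sketch)."""
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * not_done - values[t]   # one-step TD error
        gae = delta + gamma * lam * not_done * gae                       # exponentially weighted sum
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values    # targets the critic is trained to fit
    return advantages, returns
```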
57 | ### PPG
58 | PPG (Phasic Policy Gradient). A3C and PPO are both on-policy: the policy that explores the environment and produces training data must be the same policy that is being updated. They have to throw away data from the old policy and re-collect data in the environment with the new one. PPG was created to let PPO also train on off-policy data. The idea is fairly simple: the original on-policy PPO part keeps doing what it does, and an extra critic trained on off-policy data is added and made to share parameters with PPO's critic, i.e. an auxiliary task. This algorithm is not better than PPO in every situation: because PPG relies on the auxiliary task, it needs to collect as much training data as possible and only shines with large batch sizes.
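A rough sketch of the auxiliary-phase idea: a value head shares the policy trunk and is trained on returns, while a KL term keeps the policy close to its previous outputs. All names and sizes are placeholders, not the algorithm's exact losses:

```python
import torch
import torch.nn as nn

class PolicyWithAuxValue(nn.Module):
    """Shared trunk with a policy head and an auxiliary value head (sketch of the PPG idea)."""
    def __init__(self, obs_dim=4, n_actions=2, hidden=64):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh())
        self.policy_head = nn.Linear(hidden, n_actions)
        self.aux_value_head = nn.Linear(hidden, 1)

    def forward(self, x):
        h = self.trunk(x)
        return self.policy_head(h), self.aux_value_head(h).squeeze(-1)

def auxiliary_loss(net, states, value_targets, old_logits, beta=1.0):
    logits, aux_values = net(states)
    value_loss = (aux_values - value_targets).pow(2).mean()     # fit returns with the shared trunk
    # KL term keeps the policy from drifting while the trunk is trained on the value task
    kl = torch.distributions.kl_divergence(
        torch.distributions.Categorical(logits=old_logits),
        torch.distributions.Categorical(logits=logits)).mean()
    return value_loss + beta * kl
```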
--------------------------------------------------------------------------------