├── Actor-Critic.py ├── DDPG+HER.py ├── DDPG.py ├── DQN+.py ├── DQN.py ├── DynaQ.py ├── MC.py ├── PPO.py ├── PPO_cleanrl.py ├── PPO_cleanrl_atari.py ├── PPO_cleanrl_env1.py ├── PolicyIteration.py ├── README.md ├── REINFORCE.py ├── SAC-continue.py ├── SAC.py ├── Sarsa.py ├── bandit.py └── rl_utils.py /Actor-Critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import rl_utils 7 | 8 | class PolicyNet(torch.nn.Module): 9 | def __init__(self, state_dim, hidden_dim, action_dim): 10 | super(PolicyNet, self).__init__() 11 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 12 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 13 | 14 | def forward(self, x): 15 | x = F.relu(self.fc1(x)) 16 | return F.softmax(self.fc2(x), dim=1) 17 | 18 | class ValueNet(torch.nn.Module): 19 | def __init__(self, state_dim, hidden_dim): 20 | super(ValueNet, self).__init__() 21 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 22 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 23 | 24 | def forward(self, x): 25 | x = F.relu(self.fc1(x)) 26 | return self.fc2(x) 27 | 28 | class ActorCritic: 29 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 30 | gamma, device): 31 | # 策略网络 32 | self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device) 33 | self.critic = ValueNet(state_dim, hidden_dim).to(device) # 价值网络 34 | # 策略网络优化器 35 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 36 | lr=actor_lr) 37 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 38 | lr=critic_lr) # 价值网络优化器 39 | self.gamma = gamma 40 | self.device = device 41 | 42 | def take_action(self, state): 43 | state = torch.tensor([state], dtype=torch.float).to(self.device) 44 | probs = self.actor(state) 45 | action_dist = torch.distributions.Categorical(probs) 46 | action = action_dist.sample() 47 | return action.item() 48 | 49 | def update(self, transition_dict): 50 | states = torch.tensor(transition_dict['states'], 51 | dtype=torch.float).to(self.device) 52 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 53 | self.device) 54 | rewards = torch.tensor(transition_dict['rewards'], 55 | dtype=torch.float).view(-1, 1).to(self.device) 56 | next_states = torch.tensor(transition_dict['next_states'], 57 | dtype=torch.float).to(self.device) 58 | dones = torch.tensor(transition_dict['dones'], 59 | dtype=torch.float).view(-1, 1).to(self.device) 60 | 61 | # 时序差分目标 62 | td_target = rewards + self.gamma * self.critic(next_states) * (1 - 63 | dones) 64 | td_delta = td_target - self.critic(states) # 时序差分误差 65 | log_probs = torch.log(self.actor(states).gather(1, actions)) 66 | actor_loss = torch.mean(-log_probs * td_delta.detach()) 67 | # 均方误差损失函数 68 | critic_loss = torch.mean( 69 | F.mse_loss(self.critic(states), td_target.detach())) 70 | self.actor_optimizer.zero_grad() 71 | self.critic_optimizer.zero_grad() 72 | actor_loss.backward() # 计算策略网络的梯度 73 | critic_loss.backward() # 计算价值网络的梯度 74 | self.actor_optimizer.step() # 更新策略网络的参数 75 | self.critic_optimizer.step() # 更新价值网络的参数 76 | 77 | actor_lr = 1e-3 78 | critic_lr = 1e-2 79 | num_episodes = 1000 80 | hidden_dim = 128 81 | gamma = 0.98 82 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 83 | "cpu") 84 | 85 | env_name = 'CartPole-v0' 86 | env = gym.make(env_name) 87 | env.seed(0) 88 | torch.manual_seed(0) 89 | state_dim = 
env.observation_space.shape[0] 90 | action_dim = env.action_space.n 91 | agent = ActorCritic(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 92 | gamma, device) 93 | 94 | return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes) 95 | 96 | episodes_list = list(range(len(return_list))) 97 | plt.plot(episodes_list, return_list) 98 | plt.xlabel('Episodes') 99 | plt.ylabel('Returns') 100 | plt.title('Actor-Critic on {}'.format(env_name)) 101 | plt.show() 102 | 103 | mv_return = rl_utils.moving_average(return_list, 9) 104 | plt.plot(episodes_list, mv_return) 105 | plt.xlabel('Episodes') 106 | plt.ylabel('Returns') 107 | plt.title('Actor-Critic on {}'.format(env_name)) 108 | plt.show() -------------------------------------------------------------------------------- /DDPG+HER.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import random 5 | from tqdm import tqdm 6 | import collections 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | class WorldEnv: 11 | def __init__(self): 12 | self.distance_threshold = 0.15 13 | self.action_bound = 1 14 | 15 | def reset(self): # 重置环境 16 | # 生成一个目标状态, 坐标范围是[3.5~4.5, 3.5~4.5] 17 | self.goal = np.array( 18 | [4 + random.uniform(-0.5, 0.5), 4 + random.uniform(-0.5, 0.5)]) 19 | self.state = np.array([0, 0]) # 初始状态 20 | self.count = 0 21 | return np.hstack((self.state, self.goal)) # 水平方向上拼接 22 | 23 | def step(self, action): 24 | action = np.clip(action, -self.action_bound, self.action_bound) 25 | x = max(0, min(5, self.state[0] + action[0])) 26 | y = max(0, min(5, self.state[1] + action[1])) 27 | self.state = np.array([x, y]) 28 | self.count += 1 29 | 30 | dis = np.sqrt(np.sum(np.square(self.state - self.goal))) 31 | reward = -1.0 if dis > self.distance_threshold else 0 32 | if dis <= self.distance_threshold or self.count == 50: 33 | done = True 34 | else: 35 | done = False 36 | 37 | return np.hstack((self.state, self.goal)), reward, done 38 | 39 | 40 | class PolicyNet(torch.nn.Module): 41 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound): 42 | super(PolicyNet, self).__init__() 43 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 44 | self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim) 45 | self.fc3 = torch.nn.Linear(hidden_dim, action_dim) 46 | self.action_bound = action_bound # action_bound是环境可以接受的动作最大值 47 | 48 | def forward(self, x): 49 | x = F.relu(self.fc2(F.relu(self.fc1(x)))) 50 | return torch.tanh(self.fc3(x)) * self.action_bound 51 | 52 | 53 | class QValueNet(torch.nn.Module): 54 | def __init__(self, state_dim, hidden_dim, action_dim): 55 | super(QValueNet, self).__init__() 56 | self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim) # Q(s,a) 57 | self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim) 58 | self.fc3 = torch.nn.Linear(hidden_dim, 1) 59 | 60 | def forward(self, x, a): 61 | cat = torch.cat([x, a], dim=1) # 拼接状态和动作 62 | x = F.relu(self.fc2(F.relu(self.fc1(cat)))) 63 | return self.fc3(x) 64 | 65 | class DDPG: 66 | ''' DDPG算法 ''' 67 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound, 68 | actor_lr, critic_lr, sigma, tau, gamma, device): 69 | self.action_dim = action_dim 70 | self.actor = PolicyNet(state_dim, hidden_dim, action_dim, 71 | action_bound).to(device) 72 | self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device) 73 | self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, 74 | action_bound).to(device) 75 | self.target_critic = 
QValueNet(state_dim, hidden_dim, 76 | action_dim).to(device) 77 | # 初始化目标价值网络并使其参数和价值网络一样 78 | self.target_critic.load_state_dict(self.critic.state_dict()) 79 | # 初始化目标策略网络并使其参数和策略网络一样 80 | self.target_actor.load_state_dict(self.actor.state_dict()) 81 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 82 | lr=actor_lr) 83 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 84 | lr=critic_lr) 85 | self.gamma = gamma 86 | self.sigma = sigma # 高斯噪声的标准差,均值直接设为0 87 | self.tau = tau # 目标网络软更新参数 88 | self.action_bound = action_bound 89 | self.device = device 90 | 91 | def take_action(self, state): 92 | state = torch.tensor([state], dtype=torch.float).to(self.device) # Tensor:(1,4) 93 | action = self.actor(state).detach().cpu().numpy()[0] # ndarray:(2,) 94 | # 给动作添加噪声,增加探索 95 | action = action + self.sigma * np.random.randn(self.action_dim) 96 | return action 97 | 98 | def soft_update(self, net, target_net): 99 | for param_target, param in zip(target_net.parameters(), 100 | net.parameters()): 101 | param_target.data.copy_(param_target.data * (1.0 - self.tau) + 102 | param.data * self.tau) 103 | 104 | def update(self, transition_dict): 105 | states = torch.tensor(transition_dict['states'], 106 | dtype=torch.float).to(self.device) 107 | actions = torch.tensor(transition_dict['actions'], 108 | dtype=torch.float).to(self.device) 109 | rewards = torch.tensor(transition_dict['rewards'], 110 | dtype=torch.float).view(-1, 1).to(self.device) 111 | next_states = torch.tensor(transition_dict['next_states'], 112 | dtype=torch.float).to(self.device) 113 | dones = torch.tensor(transition_dict['dones'], 114 | dtype=torch.float).view(-1, 1).to(self.device) 115 | 116 | next_q_values = self.target_critic(next_states, 117 | self.target_actor(next_states)) 118 | q_targets = rewards + self.gamma * next_q_values * (1 - dones) 119 | # MSE损失函数 120 | critic_loss = torch.mean( 121 | F.mse_loss(self.critic(states, actions), q_targets)) 122 | self.critic_optimizer.zero_grad() 123 | critic_loss.backward() 124 | self.critic_optimizer.step() 125 | 126 | # 策略网络就是为了使Q值最大化 127 | actor_loss = -torch.mean(self.critic(states, self.actor(states))) 128 | self.actor_optimizer.zero_grad() 129 | actor_loss.backward() 130 | self.actor_optimizer.step() 131 | 132 | self.soft_update(self.actor, self.target_actor) # 软更新策略网络 133 | self.soft_update(self.critic, self.target_critic) # 软更新价值网络 134 | 135 | class Trajectory: 136 | ''' 用来记录一条完整轨迹 ''' 137 | def __init__(self, init_state): 138 | self.states = [init_state] 139 | self.actions = [] 140 | self.rewards = [] 141 | self.dones = [] 142 | self.length = 0 143 | 144 | def store_step(self, action, state, reward, done): 145 | self.actions.append(action) 146 | self.states.append(state) 147 | self.rewards.append(reward) 148 | self.dones.append(done) 149 | self.length += 1 150 | 151 | 152 | class ReplayBuffer_Trajectory: 153 | ''' 存储轨迹的经验回放池 ''' 154 | def __init__(self, capacity): 155 | self.buffer = collections.deque(maxlen=capacity) 156 | 157 | def add_trajectory(self, trajectory): 158 | self.buffer.append(trajectory) 159 | 160 | def size(self): 161 | return len(self.buffer) 162 | 163 | def sample(self, batch_size, use_her, dis_threshold=0.15, her_ratio=0.8): 164 | batch = dict(states=[], 165 | actions=[], 166 | next_states=[], 167 | rewards=[], 168 | dones=[]) 169 | for _ in range(batch_size): # batch_size=256 170 | traj = random.sample(self.buffer, 1)[0] # 从buffer中随机抽样一个轨迹 171 | step_state = np.random.randint(traj.length) # 从抽样的轨迹中随机抽样出一个transition 172 | state = 
traj.states[step_state] # 抽样出的transition的state 173 | next_state = traj.states[step_state + 1] # 抽样出的transition的next_state 174 | action = traj.actions[step_state] # 抽样出的transition的动作 175 | reward = traj.rewards[step_state] # 抽样出的transition的奖励 176 | done = traj.dones[step_state] # 抽样出的transition是否done 177 | 178 | if use_her and np.random.uniform() <= her_ratio: 179 | step_goal = np.random.randint(step_state + 1, traj.length + 1) # 从上面transition之后的轨迹选择一个设置之后的goal 180 | goal = traj.states[step_goal][:2] # 使用HER算法的future方案设置目标,选择此transition的当前位置作为goal 181 | dis = np.sqrt(np.sum(np.square(next_state[:2] - goal))) 182 | reward = -1.0 if dis > dis_threshold else 0 183 | done = False if dis > dis_threshold else True 184 | state = np.hstack((state[:2], goal)) # 将原来的初始位置和后来挑选的goal拼接 185 | next_state = np.hstack((next_state[:2], goal)) # 将原来的下一个transition的初始位置和goal拼接 186 | 187 | batch['states'].append(state) 188 | batch['next_states'].append(next_state) 189 | batch['actions'].append(action) 190 | batch['rewards'].append(reward) 191 | batch['dones'].append(done) 192 | 193 | batch['states'] = np.array(batch['states']) # 256*4 194 | batch['next_states'] = np.array(batch['next_states']) # 256*4 195 | batch['actions'] = np.array(batch['actions']) # 256*2 196 | return batch 197 | 198 | actor_lr = 1e-3 199 | critic_lr = 1e-3 200 | hidden_dim = 128 201 | state_dim = 4 202 | action_dim = 2 203 | action_bound = 1 204 | sigma = 0.1 205 | tau = 0.005 206 | gamma = 0.98 207 | num_episodes = 2000 208 | n_train = 20 209 | batch_size = 256 210 | minimal_episodes = 200 211 | buffer_size = 10000 212 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 213 | "cpu") 214 | 215 | random.seed(0) 216 | np.random.seed(0) 217 | torch.manual_seed(0) 218 | env = WorldEnv() 219 | replay_buffer = ReplayBuffer_Trajectory(buffer_size) 220 | agent = DDPG(state_dim, hidden_dim, action_dim, action_bound, actor_lr, 221 | critic_lr, sigma, tau, gamma, device) 222 | 223 | return_list = [] 224 | for i in range(10): 225 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 226 | for i_episode in range(int(num_episodes / 10)): 227 | episode_return = 0 228 | state = env.reset() # state为初始位置与目标位置的拼接 229 | traj = Trajectory(state) 230 | done = False 231 | while not done: 232 | action = agent.take_action(state) 233 | state, reward, done = env.step(action) 234 | episode_return += reward 235 | traj.store_step(action, state, reward, done) 236 | replay_buffer.add_trajectory(traj) 237 | return_list.append(episode_return) 238 | if replay_buffer.size() >= minimal_episodes: 239 | for _ in range(n_train): 240 | transition_dict = replay_buffer.sample(batch_size, True) 241 | agent.update(transition_dict) 242 | if (i_episode + 1) % 10 == 0: 243 | pbar.set_postfix({ 244 | 'episode': 245 | '%d' % (num_episodes / 10 * i + i_episode + 1), 246 | 'return': 247 | '%.3f' % np.mean(return_list[-10:]) 248 | }) 249 | pbar.update(1) 250 | episodes_list = list(range(len(return_list))) 251 | plt.plot(episodes_list, return_list) 252 | plt.xlabel('Episodes') 253 | plt.ylabel('Returns') 254 | plt.title('DDPG with HER on {}'.format('GridWorld')) 255 | plt.show() 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | random.seed(0) 265 | np.random.seed(0) 266 | torch.manual_seed(0) 267 | env = WorldEnv() 268 | replay_buffer = ReplayBuffer_Trajectory(buffer_size) 269 | agent = DDPG(state_dim, hidden_dim, action_dim, action_bound, actor_lr, 270 | critic_lr, sigma, tau, gamma, device) 271 | 272 | return_list = [] 273 | for i in 
range(10): 274 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 275 | for i_episode in range(int(num_episodes / 10)): 276 | episode_return = 0 277 | state = env.reset() 278 | traj = Trajectory(state) 279 | done = False 280 | while not done: 281 | action = agent.take_action(state) 282 | state, reward, done = env.step(action) 283 | episode_return += reward 284 | traj.store_step(action, state, reward, done) 285 | replay_buffer.add_trajectory(traj) 286 | return_list.append(episode_return) 287 | if replay_buffer.size() >= minimal_episodes: 288 | for _ in range(n_train): 289 | # 和使用HER训练的唯一区别 290 | transition_dict = replay_buffer.sample(batch_size, False) 291 | agent.update(transition_dict) 292 | if (i_episode + 1) % 10 == 0: 293 | pbar.set_postfix({ 294 | 'episode': 295 | '%d' % (num_episodes / 10 * i + i_episode + 1), 296 | 'return': 297 | '%.3f' % np.mean(return_list[-10:]) 298 | }) 299 | pbar.update(1) 300 | 301 | episodes_list = list(range(len(return_list))) 302 | plt.plot(episodes_list, return_list) 303 | plt.xlabel('Episodes') 304 | plt.ylabel('Returns') 305 | plt.title('DDPG without HER on {}'.format('GridWorld')) 306 | plt.show() -------------------------------------------------------------------------------- /DDPG.py: -------------------------------------------------------------------------------- 1 | import random 2 | import gym 3 | import numpy as np 4 | from tqdm import tqdm 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import matplotlib.pyplot as plt 9 | import rl_utils 10 | 11 | class PolicyNet(torch.nn.Module): 12 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound): 13 | super(PolicyNet, self).__init__() 14 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 15 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 16 | self.action_bound = action_bound # action_bound是环境可以接受的动作最大值 17 | 18 | def forward(self, x): 19 | x = F.relu(self.fc1(x)) 20 | return torch.tanh(self.fc2(x)) * self.action_bound 21 | 22 | 23 | class QValueNet(torch.nn.Module): # Q(s,a) 24 | def __init__(self, state_dim, hidden_dim, action_dim): 25 | super(QValueNet, self).__init__() 26 | self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim) 27 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 28 | 29 | def forward(self, x, a): 30 | cat = torch.cat([x, a], dim=1) # 拼接状态和动作 31 | x = F.relu(self.fc1(cat)) 32 | return self.fc2(x) 33 | 34 | 35 | class TwoLayerFC(torch.nn.Module): 36 | # 这是一个简单的两层神经网络 37 | def __init__(self, 38 | num_in, 39 | num_out, 40 | hidden_dim, 41 | activation=F.relu, 42 | out_fn=lambda x: x): 43 | super().__init__() 44 | self.fc1 = nn.Linear(num_in, hidden_dim) 45 | self.fc2 = nn.Linear(hidden_dim, hidden_dim) 46 | self.fc3 = nn.Linear(hidden_dim, num_out) 47 | 48 | self.activation = activation 49 | self.out_fn = out_fn 50 | 51 | def forward(self, x): 52 | x = self.activation(self.fc1(x)) 53 | x = self.activation(self.fc2(x)) 54 | x = self.out_fn(self.fc3(x)) 55 | return x 56 | 57 | class DDPG: 58 | ''' DDPG算法 ''' 59 | def __init__(self, num_in_actor, num_out_actor, num_in_critic, hidden_dim, 60 | discrete, action_bound, sigma, actor_lr, critic_lr, tau, 61 | gamma, device): 62 | # self.actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device) 63 | # self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device) 64 | # self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device) 65 | # self.target_critic = QValueNet(state_dim, hidden_dim, action_dim).to(device) 
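        # The commented-out lines above are the dedicated PolicyNet/QValueNet version; the code
        # below uses the generic TwoLayerFC MLP instead. out_fn selects the output head: identity
        # for discrete actions, tanh(x) * action_bound for continuous ones, so raw outputs are
        # squashed into the valid action range. The critic takes state_dim + action_dim inputs
        # because Q(s, a) is evaluated on torch.cat([state, action], dim=1).
        # Illustrative only (assuming action_bound = 2.0, as for Pendulum):
        #   out_fn = lambda x: torch.tanh(x) * 2.0   # raw output 3.1 -> action ~1.99, never out of bounds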
66 | out_fn = (lambda x: x) if discrete else ( 67 | lambda x: torch.tanh(x) * action_bound) 68 | self.actor = TwoLayerFC(num_in_actor, 69 | num_out_actor, 70 | hidden_dim, 71 | activation=F.relu, 72 | out_fn=out_fn).to(device) 73 | self.target_actor = TwoLayerFC(num_in_actor, 74 | num_out_actor, 75 | hidden_dim, 76 | activation=F.relu, 77 | out_fn=out_fn).to(device) 78 | self.critic = TwoLayerFC(num_in_critic, 1, hidden_dim).to(device) 79 | self.target_critic = TwoLayerFC(num_in_critic, 1, 80 | hidden_dim).to(device) 81 | # 初始化目标价值网络并设置和价值网络相同的参数 82 | self.target_critic.load_state_dict(self.critic.state_dict()) 83 | # 初始化目标策略网络并设置和策略相同的参数 84 | self.target_actor.load_state_dict(self.actor.state_dict()) 85 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 86 | lr=actor_lr) 87 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 88 | lr=critic_lr) 89 | self.gamma = gamma 90 | self.sigma = sigma # 高斯噪声的标准差,均值直接设为0 91 | self.action_bound = action_bound # action_bound是环境可以接受的动作最大值 92 | self.tau = tau # 目标网络软更新参数 93 | self.action_dim = num_out_actor 94 | self.device = device 95 | 96 | def take_action(self, state): 97 | state = torch.tensor([state], dtype=torch.float).to(self.device) 98 | action = self.actor(state).item() 99 | # 给动作添加噪声,增加探索 100 | action = action + self.sigma * np.random.randn(self.action_dim) 101 | return action 102 | 103 | def soft_update(self, net, target_net): 104 | for param_target, param in zip(target_net.parameters(), 105 | net.parameters()): 106 | param_target.data.copy_(param_target.data * (1.0 - self.tau) + 107 | param.data * self.tau) 108 | 109 | def update(self, transition_dict): 110 | states = torch.tensor(transition_dict['states'], 111 | dtype=torch.float).to(self.device) 112 | actions = torch.tensor(transition_dict['actions'], 113 | dtype=torch.float).view(-1, 1).to(self.device) 114 | rewards = torch.tensor(transition_dict['rewards'], 115 | dtype=torch.float).view(-1, 1).to(self.device) 116 | next_states = torch.tensor(transition_dict['next_states'], 117 | dtype=torch.float).to(self.device) 118 | dones = torch.tensor(transition_dict['dones'], 119 | dtype=torch.float).view(-1, 1).to(self.device) 120 | 121 | next_q_values = self.target_critic( 122 | torch.cat( 123 | [next_states, self.target_actor(next_states)], dim=1)) 124 | q_targets = rewards + self.gamma * next_q_values * (1 - dones) 125 | critic_loss = torch.mean( 126 | F.mse_loss( 127 | # MSE损失函数 128 | self.critic(torch.cat([states, actions], dim=1)), 129 | q_targets)) 130 | self.critic_optimizer.zero_grad() 131 | critic_loss.backward() 132 | self.critic_optimizer.step() 133 | 134 | actor_loss = -torch.mean( 135 | self.critic( 136 | # 策略网络就是为了使得Q值最大化 137 | torch.cat([states, self.actor(states)], dim=1))) 138 | self.actor_optimizer.zero_grad() 139 | actor_loss.backward() 140 | self.actor_optimizer.step() 141 | 142 | self.soft_update(self.actor, self.target_actor) # 软更新策略网络 143 | self.soft_update(self.critic, self.target_critic) # 软更新价值网络 144 | 145 | actor_lr = 5e-4 146 | critic_lr = 5e-3 147 | num_episodes = 200 148 | hidden_dim = 64 149 | gamma = 0.98 150 | tau = 0.005 # 软更新参数 151 | buffer_size = 10000 152 | minimal_size = 1000 153 | batch_size = 64 154 | sigma = 0.01 # 高斯噪声标准差 155 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 156 | "cpu") 157 | 158 | env_name = 'Pendulum-v0' 159 | env = gym.make(env_name) 160 | random.seed(0) 161 | np.random.seed(0) 162 | env.seed(0) 163 | torch.manual_seed(0) 164 | replay_buffer = 
rl_utils.ReplayBuffer(buffer_size) 165 | state_dim = env.observation_space.shape[0] 166 | action_dim = env.action_space.shape[0] 167 | action_bound = env.action_space.high[0] # 动作最大值 168 | agent = DDPG(state_dim, action_dim, state_dim + action_dim, hidden_dim, False, 169 | action_bound, sigma, actor_lr, critic_lr, tau, gamma, device) 170 | 171 | return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes, 172 | replay_buffer, minimal_size, 173 | batch_size) 174 | 175 | episodes_list = list(range(len(return_list))) 176 | plt.plot(episodes_list, return_list) 177 | plt.xlabel('Episodes') 178 | plt.ylabel('Returns') 179 | plt.title('DDPG on {}'.format(env_name)) 180 | plt.show() 181 | 182 | mv_return = rl_utils.moving_average(return_list, 9) 183 | plt.plot(episodes_list, mv_return) 184 | plt.xlabel('Episodes') 185 | plt.ylabel('Returns') 186 | plt.title('DDPG on {}'.format(env_name)) 187 | plt.show() -------------------------------------------------------------------------------- /DQN+.py: -------------------------------------------------------------------------------- 1 | import random 2 | import gym 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | import matplotlib.pyplot as plt 7 | import rl_utils 8 | from tqdm import tqdm 9 | 10 | 11 | class Qnet(torch.nn.Module): 12 | ''' 只有一层隐藏层的Q网络 ''' 13 | def __init__(self, state_dim, hidden_dim, action_dim): 14 | super(Qnet, self).__init__() 15 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 16 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 17 | 18 | def forward(self, x): 19 | x = F.relu(self.fc1(x)) 20 | return self.fc2(x) 21 | 22 | class DQN: 23 | ''' DQN算法,包括Double DQN ''' 24 | def __init__(self, 25 | state_dim, 26 | hidden_dim, 27 | action_dim, 28 | learning_rate, 29 | gamma, 30 | epsilon, 31 | target_update, 32 | device, 33 | dqn_type='VanillaDQN'): 34 | self.action_dim = action_dim 35 | self.q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device) 36 | self.target_q_net = Qnet(state_dim, hidden_dim, 37 | self.action_dim).to(device) 38 | self.optimizer = torch.optim.Adam(self.q_net.parameters(), 39 | lr=learning_rate) 40 | self.gamma = gamma 41 | self.epsilon = epsilon 42 | self.target_update = target_update 43 | self.count = 0 44 | self.dqn_type = dqn_type 45 | self.device = device 46 | 47 | def take_action(self, state): 48 | if np.random.random() < self.epsilon: 49 | action = np.random.randint(self.action_dim) 50 | else: 51 | state = torch.tensor([state], dtype=torch.float).to(self.device) 52 | action = self.q_net(state).argmax().item() 53 | return action 54 | 55 | def max_q_value(self, state): 56 | state = torch.tensor([state], dtype=torch.float).to(self.device) 57 | return self.q_net(state).max().item() 58 | 59 | def update(self, transition_dict): 60 | states = torch.tensor(transition_dict['states'], 61 | dtype=torch.float).to(self.device) 62 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 63 | self.device) 64 | rewards = torch.tensor(transition_dict['rewards'], 65 | dtype=torch.float).view(-1, 1).to(self.device) 66 | next_states = torch.tensor(transition_dict['next_states'], 67 | dtype=torch.float).to(self.device) 68 | dones = torch.tensor(transition_dict['dones'], 69 | dtype=torch.float).view(-1, 1).to(self.device) 70 | 71 | q_values = self.q_net(states).gather(1, actions) # Q值 72 | # 下个状态的最大Q值 73 | if self.dqn_type == 'DoubleDQN': # DQN与Double DQN的区别 74 | max_action = self.q_net(next_states).max(1)[1].view(-1, 1) 75 | max_next_q_values = 
self.target_q_net(next_states).gather(1, max_action) 76 | else: # DQN的情况 77 | max_next_q_values = self.target_q_net(next_states).max(1)[0].view(-1, 1) 78 | q_targets = rewards + self.gamma * max_next_q_values * (1 - dones) # TD误差目标 79 | dqn_loss = torch.mean(F.mse_loss(q_values, q_targets)) # 均方误差损失函数 80 | self.optimizer.zero_grad() # PyTorch中默认梯度会累积,这里需要显式将梯度置为0 81 | dqn_loss.backward() # 反向传播更新参数 82 | self.optimizer.step() 83 | 84 | if self.count % self.target_update == 0: 85 | self.target_q_net.load_state_dict( 86 | self.q_net.state_dict()) # 更新目标网络 87 | self.count += 1 88 | 89 | lr = 1e-2 90 | 91 | lr = 1e-2 92 | num_episodes = 200 93 | hidden_dim = 128 94 | gamma = 0.98 95 | epsilon = 0.01 96 | target_update = 50 97 | buffer_size = 5000 98 | minimal_size = 1000 99 | batch_size = 64 100 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 101 | "cpu") 102 | 103 | env_name = 'Pendulum-v0' 104 | env = gym.make(env_name) 105 | state_dim = env.observation_space.shape[0] 106 | action_dim = 11 # 将连续动作分成11个离散动作 107 | 108 | 109 | def dis_to_con(discrete_action, env, action_dim): # 离散动作转回连续的函数 110 | action_lowbound = env.action_space.low[0] # 连续动作的最小值 111 | action_upbound = env.action_space.high[0] # 连续动作的最大值 112 | return action_lowbound + (discrete_action / 113 | (action_dim - 1)) * (action_upbound - 114 | action_lowbound) 115 | 116 | def train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, 117 | batch_size): 118 | return_list = [] 119 | max_q_value_list = [] 120 | max_q_value = 0 121 | for i in range(10): 122 | with tqdm(total=int(num_episodes / 10), 123 | desc='Iteration %d' % i) as pbar: 124 | for i_episode in range(int(num_episodes / 10)): 125 | episode_return = 0 126 | state = env.reset() 127 | done = False 128 | while not done: 129 | action = agent.take_action(state) 130 | max_q_value = agent.max_q_value( 131 | state) * 0.005 + max_q_value * 0.995 # 平滑处理 132 | max_q_value_list.append(max_q_value) # 保存每个状态的最大Q值 133 | action_continuous = dis_to_con(action, env, 134 | agent.action_dim) 135 | next_state, reward, done, _ = env.step([action_continuous]) 136 | replay_buffer.add(state, action, reward, next_state, done) 137 | state = next_state 138 | episode_return += reward 139 | if replay_buffer.size() > minimal_size: 140 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample( 141 | batch_size) 142 | transition_dict = { 143 | 'states': b_s, 144 | 'actions': b_a, 145 | 'next_states': b_ns, 146 | 'rewards': b_r, 147 | 'dones': b_d 148 | } 149 | agent.update(transition_dict) 150 | return_list.append(episode_return) 151 | if (i_episode + 1) % 10 == 0: 152 | pbar.set_postfix({ 153 | 'episode': 154 | '%d' % (num_episodes / 10 * i + i_episode + 1), 155 | 'return': 156 | '%.3f' % np.mean(return_list[-10:]) 157 | }) 158 | pbar.update(1) 159 | return return_list, max_q_value_list 160 | 161 | random.seed(0) 162 | np.random.seed(0) 163 | env.seed(0) 164 | torch.manual_seed(0) 165 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 166 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 167 | target_update, device) 168 | return_list, max_q_value_list = train_DQN(agent, env, num_episodes, 169 | replay_buffer, minimal_size, 170 | batch_size) 171 | 172 | episodes_list = list(range(len(return_list))) 173 | mv_return = rl_utils.moving_average(return_list, 5) 174 | plt.plot(episodes_list, mv_return) 175 | plt.xlabel('Episodes') 176 | plt.ylabel('Returns') 177 | plt.title('DQN on {}'.format(env_name)) 178 | plt.show() 179 | 180 | frames_list = 
list(range(len(max_q_value_list))) 181 | plt.plot(frames_list, max_q_value_list) 182 | plt.axhline(0, c='orange', ls='--') 183 | plt.axhline(10, c='red', ls='--') 184 | plt.xlabel('Frames') 185 | plt.ylabel('Q value') 186 | plt.title('DQN on {}'.format(env_name)) 187 | plt.show() 188 | 189 | 190 | 191 | #### Double DQN 192 | 193 | random.seed(0) 194 | np.random.seed(0) 195 | env.seed(0) 196 | torch.manual_seed(0) 197 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 198 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 199 | target_update, device, 'DoubleDQN') 200 | return_list, max_q_value_list = train_DQN(agent, env, num_episodes, 201 | replay_buffer, minimal_size, 202 | batch_size) 203 | 204 | episodes_list = list(range(len(return_list))) 205 | mv_return = rl_utils.moving_average(return_list, 5) 206 | plt.plot(episodes_list, mv_return) 207 | plt.xlabel('Episodes') 208 | plt.ylabel('Returns') 209 | plt.title('Double DQN on {}'.format(env_name)) 210 | plt.show() 211 | 212 | frames_list = list(range(len(max_q_value_list))) 213 | plt.plot(frames_list, max_q_value_list) 214 | plt.axhline(0, c='orange', ls='--') 215 | plt.axhline(10, c='red', ls='--') 216 | plt.xlabel('Frames') 217 | plt.ylabel('Q value') 218 | plt.title('Double DQN on {}'.format(env_name)) 219 | plt.show() 220 | 221 | 222 | 223 | ###### Dueiling DQN 224 | class VAnet(torch.nn.Module): 225 | ''' 只有一层隐藏层的A网络和V网络 ''' 226 | def __init__(self, state_dim, hidden_dim, action_dim): 227 | super(VAnet, self).__init__() 228 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) # 共享网络部分 229 | self.fc_A = torch.nn.Linear(hidden_dim, action_dim) 230 | self.fc_V = torch.nn.Linear(hidden_dim, 1) 231 | 232 | def forward(self, x): 233 | A = self.fc_A(F.relu(self.fc1(x))) 234 | V = self.fc_V(F.relu(self.fc1(x))) 235 | Q = V + A - A.mean(1).view(-1, 1) # Q值由V值和A值计算得到 236 | return Q 237 | 238 | 239 | class DQN: 240 | ''' DQN算法,包括Double DQN和Dueling DQN ''' 241 | def __init__(self, 242 | state_dim, 243 | hidden_dim, 244 | action_dim, 245 | learning_rate, 246 | gamma, 247 | epsilon, 248 | target_update, 249 | device, 250 | dqn_type='VanillaDQN'): 251 | self.action_dim = action_dim 252 | if dqn_type == 'DuelingDQN': # Dueling DQN采取不一样的网络框架 253 | self.q_net = VAnet(state_dim, hidden_dim, 254 | self.action_dim).to(device) 255 | self.target_q_net = VAnet(state_dim, hidden_dim, 256 | self.action_dim).to(device) 257 | else: 258 | self.q_net = Qnet(state_dim, hidden_dim, 259 | self.action_dim).to(device) 260 | self.target_q_net = Qnet(state_dim, hidden_dim, 261 | self.action_dim).to(device) 262 | self.optimizer = torch.optim.Adam(self.q_net.parameters(), 263 | lr=learning_rate) 264 | self.gamma = gamma 265 | self.epsilon = epsilon 266 | self.target_update = target_update 267 | self.count = 0 268 | self.dqn_type = dqn_type 269 | self.device = device 270 | 271 | def take_action(self, state): 272 | if np.random.random() < self.epsilon: 273 | action = np.random.randint(self.action_dim) 274 | else: 275 | state = torch.tensor([state], dtype=torch.float).to(self.device) 276 | action = self.q_net(state).argmax().item() 277 | return action 278 | 279 | def max_q_value(self, state): 280 | state = torch.tensor([state], dtype=torch.float).to(self.device) 281 | return self.q_net(state).max().item() 282 | 283 | def update(self, transition_dict): 284 | states = torch.tensor(transition_dict['states'], 285 | dtype=torch.float).to(self.device) 286 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 287 | self.device) 288 | rewards = 
torch.tensor(transition_dict['rewards'], 289 | dtype=torch.float).view(-1, 1).to(self.device) 290 | next_states = torch.tensor(transition_dict['next_states'], 291 | dtype=torch.float).to(self.device) 292 | dones = torch.tensor(transition_dict['dones'], 293 | dtype=torch.float).view(-1, 1).to(self.device) 294 | 295 | q_values = self.q_net(states).gather(1, actions) 296 | if self.dqn_type == 'DoubleDQN': 297 | max_action = self.q_net(next_states).max(1)[1].view(-1, 1) 298 | max_next_q_values = self.target_q_net(next_states).gather( 299 | 1, max_action) 300 | else: 301 | max_next_q_values = self.target_q_net(next_states).max(1)[0].view( 302 | -1, 1) 303 | q_targets = rewards + self.gamma * max_next_q_values * (1 - dones) 304 | dqn_loss = torch.mean(F.mse_loss(q_values, q_targets)) 305 | self.optimizer.zero_grad() 306 | dqn_loss.backward() 307 | self.optimizer.step() 308 | 309 | if self.count % self.target_update == 0: 310 | self.target_q_net.load_state_dict(self.q_net.state_dict()) 311 | self.count += 1 312 | 313 | 314 | random.seed(0) 315 | np.random.seed(0) 316 | env.seed(0) 317 | torch.manual_seed(0) 318 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 319 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 320 | target_update, device, 'DuelingDQN') 321 | return_list, max_q_value_list = train_DQN(agent, env, num_episodes, 322 | replay_buffer, minimal_size, 323 | batch_size) 324 | 325 | episodes_list = list(range(len(return_list))) 326 | mv_return = rl_utils.moving_average(return_list, 5) 327 | plt.plot(episodes_list, mv_return) 328 | plt.xlabel('Episodes') 329 | plt.ylabel('Returns') 330 | plt.title('Dueling DQN on {}'.format(env_name)) 331 | plt.show() 332 | 333 | frames_list = list(range(len(max_q_value_list))) 334 | plt.plot(frames_list, max_q_value_list) 335 | plt.axhline(0, c='orange', ls='--') 336 | plt.axhline(10, c='red', ls='--') 337 | plt.xlabel('Frames') 338 | plt.ylabel('Q value') 339 | plt.title('Dueling DQN on {}'.format(env_name)) 340 | plt.show() -------------------------------------------------------------------------------- /DQN.py: -------------------------------------------------------------------------------- 1 | import random 2 | import gym 3 | import numpy as np 4 | import collections 5 | from tqdm import tqdm 6 | import torch 7 | import torch.nn.functional as F 8 | import matplotlib.pyplot as plt 9 | import rl_utils 10 | 11 | 12 | class ReplayBuffer: 13 | ''' 经验回放池 ''' 14 | def __init__(self, capacity): 15 | self.buffer = collections.deque(maxlen=capacity) # 队列,先进先出 16 | 17 | def add(self, state, action, reward, next_state, done): # 将数据加入buffer 18 | self.buffer.append((state, action, reward, next_state, done)) 19 | 20 | def sample(self, batch_size): # 从buffer中采样数据,数量为batch_size 21 | transitions = random.sample(self.buffer, batch_size) 22 | state, action, reward, next_state, done = zip(*transitions) 23 | return np.array(state), action, reward, np.array(next_state), done 24 | 25 | def size(self): # 目前buffer中数据的数量 26 | return len(self.buffer) 27 | 28 | class Qnet(torch.nn.Module): 29 | ''' 只有一层隐藏层的Q网络 ''' 30 | def __init__(self, state_dim, hidden_dim, action_dim): 31 | super(Qnet, self).__init__() 32 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 33 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 34 | 35 | def forward(self, x): 36 | x = F.relu(self.fc1(x)) # 隐藏层使用ReLU激活函数 37 | return self.fc2(x) 38 | 39 | class DQN: 40 | ''' DQN算法 ''' 41 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 42 | epsilon, 
target_update, device): 43 | self.action_dim = action_dim 44 | self.q_net = Qnet(state_dim, hidden_dim, 45 | self.action_dim).to(device) # Q网络 46 | # 目标网络 47 | self.target_q_net = Qnet(state_dim, hidden_dim, 48 | self.action_dim).to(device) 49 | # 使用Adam优化器 50 | self.optimizer = torch.optim.Adam(self.q_net.parameters(), 51 | lr=learning_rate) 52 | self.gamma = gamma # 折扣因子 53 | self.epsilon = epsilon # epsilon-贪婪策略 54 | self.target_update = target_update # 目标网络更新频率 55 | self.count = 0 # 计数器,记录更新次数 56 | self.device = device 57 | 58 | def take_action(self, state): # epsilon-贪婪策略采取动作 59 | if np.random.random() < self.epsilon: 60 | action = np.random.randint(self.action_dim) 61 | else: 62 | state = torch.tensor([state], dtype=torch.float).to(self.device) 63 | action = self.q_net(state).argmax().item() 64 | # .argmax返回大的值对应的索引值 65 | # .item抽取出tensor中的数 66 | return action 67 | 68 | def update(self, transition_dict): 69 | states = torch.tensor(transition_dict['states'], 70 | dtype=torch.float).to(self.device) 71 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 72 | self.device) 73 | rewards = torch.tensor(transition_dict['rewards'], 74 | dtype=torch.float).view(-1, 1).to(self.device) 75 | next_states = torch.tensor(transition_dict['next_states'], 76 | dtype=torch.float).to(self.device) 77 | dones = torch.tensor(transition_dict['dones'], 78 | dtype=torch.float).view(-1, 1).to(self.device) 79 | 80 | q_values = self.q_net(states).gather(1, actions) # Q值 81 | # 下个状态的最大Q值 82 | max_next_q_values = self.target_q_net(next_states).max(1)[0].view( 83 | -1, 1) 84 | q_targets = rewards + self.gamma * max_next_q_values * (1 - dones 85 | ) # TD误差目标 86 | dqn_loss = torch.mean(F.mse_loss(q_values, q_targets)) # 均方误差损失函数 87 | self.optimizer.zero_grad() # PyTorch中默认梯度会累积,这里需要显式将梯度置为0 88 | dqn_loss.backward() # 反向传播更新参数 89 | self.optimizer.step() 90 | 91 | if self.count % self.target_update == 0: 92 | self.target_q_net.load_state_dict( 93 | self.q_net.state_dict()) # 更新目标网络 94 | self.count += 1 95 | 96 | lr = 2e-3 97 | num_episodes = 500 98 | hidden_dim = 128 99 | gamma = 0.98 100 | epsilon = 0.01 101 | target_update = 10 102 | buffer_size = 10000 103 | minimal_size = 500 104 | batch_size = 64 105 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 106 | "cpu") 107 | 108 | env_name = 'CartPole-v0' 109 | env = gym.make(env_name) 110 | random.seed(0) 111 | np.random.seed(0) 112 | env.seed(0) 113 | torch.manual_seed(0) 114 | replay_buffer = ReplayBuffer(buffer_size) 115 | state_dim = env.observation_space.shape[0] 116 | action_dim = env.action_space.n 117 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 118 | target_update, device) 119 | 120 | return_list = [] 121 | for i in range(10): 122 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 123 | for i_episode in range(int(num_episodes / 10)): 124 | episode_return = 0 125 | state = env.reset() 126 | done = False 127 | while not done: 128 | action = agent.take_action(state) 129 | next_state, reward, done, _ = env.step(action) 130 | replay_buffer.add(state, action, reward, next_state, done) 131 | state = next_state 132 | episode_return += reward 133 | # 当buffer数据的数量超过一定值后,才进行Q网络训练 134 | if replay_buffer.size() > minimal_size: 135 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 136 | transition_dict = { 137 | 'states': b_s, 138 | 'actions': b_a, 139 | 'next_states': b_ns, 140 | 'rewards': b_r, 141 | 'dones': b_d 142 | } 143 | agent.update(transition_dict) 144 | 
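            # Note: episode_return is the raw, undiscounted sum of rewards and is only used for
            # logging/plotting below; the learning signal itself is the discounted TD target
            # r + gamma * max_a' Q_target(s', a') computed inside agent.update().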
return_list.append(episode_return) 145 | if (i_episode + 1) % 10 == 0: 146 | pbar.set_postfix({ 147 | 'episode': 148 | '%d' % (num_episodes / 10 * i + i_episode + 1), 149 | 'return': 150 | '%.3f' % np.mean(return_list[-10:]) 151 | }) 152 | pbar.update(1) 153 | 154 | episodes_list = list(range(len(return_list))) 155 | plt.plot(episodes_list, return_list) 156 | plt.xlabel('Episodes') 157 | plt.ylabel('Returns') 158 | plt.title('DQN on {}'.format(env_name)) 159 | plt.show() 160 | 161 | mv_return = rl_utils.moving_average(return_list, 9) 162 | plt.plot(episodes_list, mv_return) 163 | plt.xlabel('Episodes') 164 | plt.ylabel('Returns') 165 | plt.title('DQN on {}'.format(env_name)) 166 | plt.show() -------------------------------------------------------------------------------- /DynaQ.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from tqdm import tqdm 4 | import random 5 | import time 6 | 7 | 8 | class CliffWalkingEnv: 9 | def __init__(self, ncol, nrow): 10 | self.nrow = nrow 11 | self.ncol = ncol 12 | self.x = 0 # 记录当前智能体位置的横坐标 13 | self.y = self.nrow - 1 # 记录当前智能体位置的纵坐标 14 | 15 | def step(self, action): # 外部调用这个函数来改变当前位置 16 | # 4种动作, change[0]:上, change[1]:下, change[2]:左, change[3]:右。坐标系原点(0,0) 17 | # 定义在左上角 18 | change = [[0, -1], [0, 1], [-1, 0], [1, 0]] 19 | self.x = min(self.ncol - 1, max(0, self.x + change[action][0])) 20 | self.y = min(self.nrow - 1, max(0, self.y + change[action][1])) 21 | next_state = self.y * self.ncol + self.x 22 | reward = -1 23 | done = False 24 | if self.y == self.nrow - 1 and self.x > 0: # 下一个位置在悬崖或者目标 25 | done = True 26 | if self.x != self.ncol - 1: 27 | reward = -100 28 | return next_state, reward, done 29 | 30 | def reset(self): # 回归初始状态,起点在左上角 31 | self.x = 0 32 | self.y = self.nrow - 1 33 | return self.y * self.ncol + self.x 34 | 35 | class DynaQ: 36 | """ Dyna-Q算法 """ 37 | def __init__(self, 38 | ncol, 39 | nrow, 40 | epsilon, 41 | alpha, 42 | gamma, 43 | n_planning, 44 | n_action=4): 45 | self.Q_table = np.zeros([nrow * ncol, n_action]) # 初始化Q(s,a)表格 46 | self.n_action = n_action # 动作个数 47 | self.alpha = alpha # 学习率 48 | self.gamma = gamma # 折扣因子 49 | self.epsilon = epsilon # epsilon-贪婪策略中的参数 50 | 51 | self.n_planning = n_planning #执行Q-planning的次数, 对应1次Q-learning 52 | self.model = dict() # 环境模型 53 | 54 | def take_action(self, state): # 选取下一步的操作 55 | if np.random.random() < self.epsilon: 56 | action = np.random.randint(self.n_action) 57 | else: 58 | action = np.argmax(self.Q_table[state]) 59 | return action 60 | 61 | def q_learning(self, s0, a0, r, s1): 62 | td_error = r + self.gamma * self.Q_table[s1].max( 63 | ) - self.Q_table[s0, a0] 64 | self.Q_table[s0, a0] += self.alpha * td_error 65 | 66 | def update(self, s0, a0, r, s1): 67 | self.q_learning(s0, a0, r, s1) 68 | self.model[(s0, a0)] = r, s1 # 将数据添加到模型中 69 | for _ in range(self.n_planning): # Q-planning循环 70 | # 随机选择曾经遇到过的状态动作对 71 | (s, a), (r, s_) = random.choice(list(self.model.items())) 72 | self.q_learning(s, a, r, s_) 73 | 74 | def DynaQ_CliffWalking(n_planning): 75 | ncol = 12 76 | nrow = 4 77 | env = CliffWalkingEnv(ncol, nrow) 78 | epsilon = 0.01 79 | alpha = 0.1 80 | gamma = 0.9 81 | agent = DynaQ(ncol, nrow, epsilon, alpha, gamma, n_planning) 82 | num_episodes = 300 # 智能体在环境中运行多少条序列 83 | 84 | return_list = [] # 记录每一条序列的回报 85 | for i in range(10): # 显示10个进度条 86 | # tqdm的进度条功能 87 | with tqdm(total=int(num_episodes / 10), 88 | desc='Iteration %d' % i) as pbar: 89 | for i_episode in 
range(int(num_episodes / 10)): # 每个进度条的序列数 90 | episode_return = 0 91 | state = env.reset() 92 | done = False 93 | while not done: 94 | action = agent.take_action(state) 95 | next_state, reward, done = env.step(action) 96 | episode_return += reward # 这里回报的计算不进行折扣因子衰减 97 | agent.update(state, action, reward, next_state) 98 | state = next_state 99 | return_list.append(episode_return) 100 | if (i_episode + 1) % 10 == 0: # 每10条序列打印一下这10条序列的平均回报 101 | pbar.set_postfix({ 102 | 'episode': 103 | '%d' % (num_episodes / 10 * i + i_episode + 1), 104 | 'return': 105 | '%.3f' % np.mean(return_list[-10:]) 106 | }) 107 | pbar.update(1) 108 | return return_list 109 | 110 | np.random.seed(0) 111 | random.seed(0) 112 | n_planning_list = [0, 2, 20] 113 | for n_planning in n_planning_list: 114 | print('Q-planning步数为:%d' % n_planning) 115 | time.sleep(0.5) 116 | return_list = DynaQ_CliffWalking(n_planning) 117 | episodes_list = list(range(len(return_list))) 118 | plt.plot(episodes_list, 119 | return_list, 120 | label=str(n_planning) + ' planning steps') 121 | plt.legend() 122 | plt.xlabel('Episodes') 123 | plt.ylabel('Returns') 124 | plt.title('Dyna-Q on {}'.format('Cliff Walking')) 125 | plt.show() 126 | -------------------------------------------------------------------------------- /MC.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | np.random.seed(0) 3 | # 定义状态转移概率矩阵P 4 | P = [ 5 | [0.9, 0.1, 0.0, 0.0, 0.0, 0.0], 6 | [0.5, 0.0, 0.5, 0.0, 0.0, 0.0], 7 | [0.0, 0.0, 0.0, 0.6, 0.0, 0.4], 8 | [0.0, 0.0, 0.0, 0.0, 0.3, 0.7], 9 | [0.0, 0.2, 0.3, 0.5, 0.0, 0.0], 10 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0], 11 | ] 12 | P = np.array(P) 13 | 14 | rewards = [-1, -2, -2, 10, 1, 0] # 定义奖励函数 15 | gamma = 0.5 # 定义折扣因子 16 | 17 | 18 | # 给定一条序列,计算从某个索引(起始状态)开始到序列最后(终止状态)得到的回报 19 | def compute_return(start_index, chain, gamma): 20 | G = 0 21 | for i in reversed(range(start_index, len(chain))): 22 | G = gamma * G + rewards[chain[i] - 1] 23 | return G 24 | 25 | 26 | # 一个状态序列,s1-s2-s3-s6 27 | chain = [1, 2, 3, 6] 28 | start_index = 0 29 | G = compute_return(start_index, chain, gamma) 30 | print("根据本序列计算得到回报为:%s。" % G) 31 | 32 | # 根据本序列计算得到回报为:-2.5。 33 | 34 | def compute(P, rewards, gamma, states_num): 35 | ''' 利用贝尔曼方程的矩阵形式计算解析解,states_num是MRP的状态数 ''' 36 | rewards = np.array(rewards).reshape((-1, 1)) #将rewards写成列向量形式 37 | value = np.dot(np.linalg.inv(np.eye(states_num, states_num) - gamma * P), 38 | rewards) 39 | return value 40 | 41 | 42 | V = compute(P, rewards, gamma, 6) 43 | print("MRP中每个状态价值分别为\n", V) 44 | 45 | # MRP中每个状态价值分别为 46 | # [[-2.01950168] 47 | # [-2.21451846] 48 | # [ 1.16142785] 49 | # [10.53809283] 50 | # [ 3.58728554] 51 | # [ 0. 
]] 52 | 53 | S = ["s1", "s2", "s3", "s4", "s5"] # 状态集合 54 | A = ["保持s1", "前往s1", "前往s2", "前往s3", "前往s4", "前往s5", "概率前往"] # 动作集合 55 | # 状态转移函数 56 | P = { 57 | "s1-保持s1-s1": 1.0, 58 | "s1-前往s2-s2": 1.0, 59 | "s2-前往s1-s1": 1.0, 60 | "s2-前往s3-s3": 1.0, 61 | "s3-前往s4-s4": 1.0, 62 | "s3-前往s5-s5": 1.0, 63 | "s4-前往s5-s5": 1.0, 64 | "s4-概率前往-s2": 0.2, 65 | "s4-概率前往-s3": 0.4, 66 | "s4-概率前往-s4": 0.4, 67 | } 68 | # 奖励函数 69 | R = { 70 | "s1-保持s1": -1, 71 | "s1-前往s2": 0, 72 | "s2-前往s1": -1, 73 | "s2-前往s3": -2, 74 | "s3-前往s4": -2, 75 | "s3-前往s5": 0, 76 | "s4-前往s5": 10, 77 | "s4-概率前往": 1, 78 | } 79 | gamma = 0.5 # 折扣因子 80 | MDP = (S, A, P, R, gamma) 81 | 82 | # 策略1,随机策略 83 | Pi_1 = { 84 | "s1-保持s1": 0.5, 85 | "s1-前往s2": 0.5, 86 | "s2-前往s1": 0.5, 87 | "s2-前往s3": 0.5, 88 | "s3-前往s4": 0.5, 89 | "s3-前往s5": 0.5, 90 | "s4-前往s5": 0.5, 91 | "s4-概率前往": 0.5, 92 | } 93 | # 策略2 94 | Pi_2 = { 95 | "s1-保持s1": 0.6, 96 | "s1-前往s2": 0.4, 97 | "s2-前往s1": 0.3, 98 | "s2-前往s3": 0.7, 99 | "s3-前往s4": 0.5, 100 | "s3-前往s5": 0.5, 101 | "s4-前往s5": 0.1, 102 | "s4-概率前往": 0.9, 103 | } 104 | 105 | 106 | # 把输入的两个字符串通过“-”连接,便于使用上述定义的P、R变量 107 | def join(str1, str2): 108 | return str1 + '-' + str2 109 | 110 | 111 | gamma = 0.5 112 | # 转化后的MRP的状态转移矩阵 113 | P_from_mdp_to_mrp = [ 114 | [0.5, 0.5, 0.0, 0.0, 0.0], 115 | [0.5, 0.0, 0.5, 0.0, 0.0], 116 | [0.0, 0.0, 0.0, 0.5, 0.5], 117 | [0.0, 0.1, 0.2, 0.2, 0.5], 118 | [0.0, 0.0, 0.0, 0.0, 1.0], 119 | ] 120 | P_from_mdp_to_mrp = np.array(P_from_mdp_to_mrp) 121 | R_from_mdp_to_mrp = [-0.5, -1.5, -1.0, 5.5, 0] 122 | 123 | V = compute(P_from_mdp_to_mrp, R_from_mdp_to_mrp, gamma, 5) 124 | print("MDP中每个状态价值分别为\n", V) 125 | 126 | # MDP中每个状态价值分别为 127 | # [[-1.22555411] 128 | # [-1.67666232] 129 | # [ 0.51890482] 130 | # [ 6.0756193 ] 131 | # [ 0. ]] 132 | 133 | def sample(MDP, Pi, timestep_max, number): 134 | ''' 采样函数,策略Pi,限制最长时间步timestep_max,总共采样序列数number ''' 135 | S, A, P, R, gamma = MDP 136 | episodes = [] 137 | for _ in range(number): 138 | episode = [] 139 | timestep = 0 140 | s = S[np.random.randint(4)] # 随机选择一个除s5以外的状态s作为起点 141 | # 当前状态为终止状态或者时间步太长时,一次采样结束 142 | while s != "s5" and timestep <= timestep_max: 143 | timestep += 1 144 | rand, temp = np.random.rand(), 0 145 | # 在状态s下根据策略选择动作 146 | for a_opt in A: 147 | temp += Pi.get(join(s, a_opt), 0) 148 | if temp > rand: 149 | a = a_opt 150 | r = R.get(join(s, a), 0) 151 | break 152 | rand, temp = np.random.rand(), 0 153 | # 根据状态转移概率得到下一个状态s_next 154 | for s_opt in S: 155 | temp += P.get(join(join(s, a), s_opt), 0) 156 | if temp > rand: 157 | s_next = s_opt 158 | break 159 | episode.append((s, a, r, s_next)) # 把(s,a,r,s_next)元组放入序列中 160 | s = s_next # s_next变成当前状态,开始接下来的循环 161 | episodes.append(episode) 162 | return episodes 163 | 164 | 165 | # 采样5次,每个序列最长不超过1000步 166 | episodes = sample(MDP, Pi_1, 20, 5) 167 | print('第一条序列\n', episodes[0]) 168 | print('第二条序列\n', episodes[1]) 169 | print('第五条序列\n', episodes[4]) 170 | 171 | # 第一条序列 172 | # [('s1', '前往s2', 0, 's2'), ('s2', '前往s3', -2, 's3'), ('s3', '前往s5', 0, 's5')] 173 | # 第二条序列 174 | # [('s4', '概率前往', 1, 's4'), ('s4', '前往s5', 10, 's5')] 175 | # 第五条序列 176 | # [('s2', '前往s3', -2, 's3'), ('s3', '前往s4', -2, 's4'), ('s4', '前往s5', 10, 's5')] 177 | 178 | # 对所有采样序列计算所有状态的价值 179 | def MC(episodes, V, N, gamma): 180 | for episode in episodes: 181 | G = 0 182 | for i in range(len(episode) - 1, -1, -1): #一个序列从后往前计算 183 | (s, a, r, s_next) = episode[i] 184 | G = r + gamma * G 185 | N[s] = N[s] + 1 186 | V[s] = V[s] + (G - V[s]) / N[s] 187 | 188 | 189 | timestep_max = 20 190 | # 采样1000次,可以自行修改 191 | episodes = 
sample(MDP, Pi_1, timestep_max, 1000) 192 | gamma = 0.5 193 | V = {"s1": 0, "s2": 0, "s3": 0, "s4": 0, "s5": 0} 194 | N = {"s1": 0, "s2": 0, "s3": 0, "s4": 0, "s5": 0} 195 | MC(episodes, V, N, gamma) 196 | print("使用蒙特卡洛方法计算MDP的状态价值为\n", V) 197 | 198 | # 使用蒙特卡洛方法计算MDP的状态价值为 199 | # {'s1': -1.228923788722258, 's2': -1.6955696284402704, 's3': 0.4823809701532294, 200 | # 's4': 5.967514743019431, 's5': 0} 201 | 202 | 203 | def occupancy(episodes, s, a, timestep_max, gamma): 204 | ''' 计算状态动作对(s,a)出现的频率,以此来估算策略的占用度量 ''' 205 | rho = 0 206 | total_times = np.zeros(timestep_max) # 记录每个时间步t各被经历过几次 207 | occur_times = np.zeros(timestep_max) # 记录(s_t,a_t)=(s,a)的次数 208 | for episode in episodes: 209 | for i in range(len(episode)): 210 | (s_opt, a_opt, r, s_next) = episode[i] 211 | total_times[i] += 1 212 | if s == s_opt and a == a_opt: 213 | occur_times[i] += 1 214 | for i in reversed(range(timestep_max)): 215 | if total_times[i]: 216 | rho += gamma**i * occur_times[i] / total_times[i] 217 | return (1 - gamma) * rho 218 | 219 | 220 | gamma = 0.5 221 | timestep_max = 1000 222 | 223 | episodes_1 = sample(MDP, Pi_1, timestep_max, 1000) 224 | episodes_2 = sample(MDP, Pi_2, timestep_max, 1000) 225 | rho_1 = occupancy(episodes_1, "s4", "概率前往", timestep_max, gamma) 226 | rho_2 = occupancy(episodes_2, "s4", "概率前往", timestep_max, gamma) 227 | print(rho_1, rho_2) 228 | 229 | # 0.112567796310472 0.23199480615618912 -------------------------------------------------------------------------------- /PPO.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import rl_utils 7 | 8 | 9 | class PolicyNet(torch.nn.Module): 10 | def __init__(self, state_dim, hidden_dim, action_dim): 11 | super(PolicyNet, self).__init__() 12 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 13 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 14 | 15 | def forward(self, x): 16 | x = F.relu(self.fc1(x)) 17 | return F.softmax(self.fc2(x), dim=1) 18 | 19 | 20 | class ValueNet(torch.nn.Module): 21 | def __init__(self, state_dim, hidden_dim): 22 | super(ValueNet, self).__init__() 23 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 24 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 25 | 26 | def forward(self, x): 27 | x = F.relu(self.fc1(x)) 28 | return self.fc2(x) 29 | 30 | 31 | class PPO: 32 | ''' PPO算法,采用截断方式 ''' 33 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 34 | lmbda, epochs, eps, gamma, device): 35 | self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device) 36 | self.critic = ValueNet(state_dim, hidden_dim).to(device) 37 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 38 | lr=actor_lr) 39 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 40 | lr=critic_lr) 41 | self.gamma = gamma 42 | self.lmbda = lmbda 43 | self.epochs = epochs # 一条序列的数据用来训练轮数 44 | self.eps = eps # PPO中截断范围的参数 45 | self.device = device 46 | 47 | def take_action(self, state): 48 | state = torch.tensor([state], dtype=torch.float).to(self.device) 49 | probs = self.actor(state) 50 | action_dist = torch.distributions.Categorical(probs) 51 | action = action_dist.sample() 52 | return action.item() 53 | 54 | def update(self, transition_dict): 55 | states = torch.tensor(transition_dict['states'], 56 | dtype=torch.float).to(self.device) 57 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 58 | self.device) 59 | rewards = 
torch.tensor(transition_dict['rewards'], 60 | dtype=torch.float).view(-1, 1).to(self.device) 61 | next_states = torch.tensor(transition_dict['next_states'], 62 | dtype=torch.float).to(self.device) 63 | dones = torch.tensor(transition_dict['dones'], 64 | dtype=torch.float).view(-1, 1).to(self.device) 65 | td_target = rewards + self.gamma * self.critic(next_states) * (1 - 66 | dones) 67 | td_delta = td_target - self.critic(states) 68 | advantage = rl_utils.compute_advantage(self.gamma, self.lmbda, 69 | td_delta.cpu()).to(self.device) 70 | old_log_probs = torch.log(self.actor(states).gather(1, 71 | actions)).detach() 72 | 73 | for _ in range(self.epochs): 74 | log_probs = torch.log(self.actor(states).gather(1, actions)) 75 | ratio = torch.exp(log_probs - old_log_probs) 76 | surr1 = ratio * advantage 77 | surr2 = torch.clamp(ratio, 1 - self.eps, 78 | 1 + self.eps) * advantage # 截断 79 | actor_loss = torch.mean(-torch.min(surr1, surr2)) # PPO损失函数 80 | critic_loss = torch.mean( 81 | F.mse_loss(self.critic(states), td_target.detach())) 82 | self.actor_optimizer.zero_grad() 83 | self.critic_optimizer.zero_grad() 84 | actor_loss.backward() 85 | critic_loss.backward() 86 | self.actor_optimizer.step() 87 | self.critic_optimizer.step() 88 | 89 | actor_lr = 1e-3 90 | critic_lr = 1e-2 91 | num_episodes = 500 92 | hidden_dim = 128 93 | gamma = 0.98 94 | lmbda = 0.95 95 | epochs = 10 96 | eps = 0.2 97 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 98 | "cpu") 99 | 100 | env_name = 'CartPole-v0' 101 | env = gym.make(env_name) 102 | env.seed(0) 103 | torch.manual_seed(0) 104 | state_dim = env.observation_space.shape[0] 105 | action_dim = env.action_space.n 106 | agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, lmbda, 107 | epochs, eps, gamma, device) 108 | 109 | return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes) 110 | 111 | episodes_list = list(range(len(return_list))) 112 | plt.plot(episodes_list, return_list) 113 | plt.xlabel('Episodes') 114 | plt.ylabel('Returns') 115 | plt.title('PPO on {}'.format(env_name)) 116 | plt.show() 117 | 118 | mv_return = rl_utils.moving_average(return_list, 9) 119 | plt.plot(episodes_list, mv_return) 120 | plt.xlabel('Episodes') 121 | plt.ylabel('Returns') 122 | plt.title('PPO on {}'.format(env_name)) 123 | plt.show() -------------------------------------------------------------------------------- /PPO_cleanrl.py: -------------------------------------------------------------------------------- 1 | # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppopy 2 | import argparse 3 | import os 4 | import random 5 | import time 6 | from distutils.util import strtobool 7 | 8 | import gym 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | from torch.distributions.categorical import Categorical 14 | from torch.utils.tensorboard import SummaryWriter 15 | 16 | 17 | def parse_args(): 18 | # fmt: off 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"), 21 | help="the name of this experiment") 22 | parser.add_argument("--seed", type=int, default=1, 23 | help="seed of the experiment") 24 | parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 25 | help="if toggled, `torch.backends.cudnn.deterministic=False`") 26 | parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), 
default=True, nargs="?", const=True, 27 | help="if toggled, cuda will be enabled by default") 28 | 29 | # Algorithm specific arguments 30 | parser.add_argument("--env-id", type=str, default="CartPole-v1", 31 | help="the id of the environment") 32 | parser.add_argument("--total-timesteps", type=int, default=500000, 33 | help="total timesteps of the experiments") 34 | parser.add_argument("--learning-rate", type=float, default=2.5e-4, 35 | help="the learning rate of the optimizer") 36 | parser.add_argument("--num-envs", type=int, default=2, 37 | help="the number of parallel game environments") 38 | parser.add_argument("--num-steps", type=int, default=128, 39 | help="the number of steps to run in each environment per policy rollout") 40 | parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 41 | help="Toggle learning rate annealing for policy and value networks") 42 | parser.add_argument("--gamma", type=float, default=0.99, 43 | help="the discount factor gamma") 44 | parser.add_argument("--gae-lambda", type=float, default=0.95, 45 | help="the lambda for the general advantage estimation") 46 | parser.add_argument("--num-minibatches", type=int, default=4, 47 | help="the number of mini-batches") 48 | parser.add_argument("--update-epochs", type=int, default=4, 49 | help="the K epochs to update the policy") 50 | parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 51 | help="Toggles advantages normalization") 52 | parser.add_argument("--clip-coef", type=float, default=0.2, 53 | help="the surrogate clipping coefficient") 54 | parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 55 | help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") 56 | parser.add_argument("--ent-coef", type=float, default=0.01, 57 | help="coefficient of the entropy") 58 | parser.add_argument("--vf-coef", type=float, default=0.5, 59 | help="coefficient of the value function") 60 | parser.add_argument("--max-grad-norm", type=float, default=0.5, 61 | help="the maximum norm for the gradient clipping") 62 | parser.add_argument("--target-kl", type=float, default=None, 63 | help="the target KL divergence threshold") 64 | args = parser.parse_args() 65 | args.batch_size = int(args.num_envs * args.num_steps) # 4*128 66 | args.minibatch_size = int(args.batch_size // args.num_minibatches) # 4 * 128 // 4 67 | # fmt: on 68 | return args 69 | 70 | 71 | def make_env(env_id, seed): 72 | def thunk(): 73 | env = gym.make(env_id) 74 | env = gym.wrappers.RecordEpisodeStatistics(env) 75 | env.seed(seed) 76 | env.action_space.seed(seed) 77 | env.observation_space.seed(seed) 78 | return env 79 | 80 | return thunk 81 | 82 | def layer_init(layer, std=np.sqrt(2), bias_const=0.0): 83 | torch.nn.init.orthogonal_(layer.weight, std) 84 | torch.nn.init.constant_(layer.bias, bias_const) 85 | return layer 86 | 87 | 88 | ''' 89 | 让我们假设 envs.single_observation_space.shape 是一个形状为 (3, 4, 2) 的 NumPy 数组。 90 | 首先,np.array(envs.single_observation_space.shape) 将其转换为一个 NumPy 数组:np.array([3, 4, 2])。 91 | 然后,.prod() 方法被调用,它返回数组中所有元素的乘积。在这种情况下,3 * 4 * 2 = 24,所以最终的结果是 24。 92 | ''' 93 | 94 | class Agent(nn.Module): 95 | def __init__(self, envs): 96 | super().__init__() 97 | self.critic = nn.Sequential( 98 | layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), 99 | nn.Tanh(), 100 | layer_init(nn.Linear(64, 64)), 101 | nn.Tanh(), 102 | 
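            # Value head: orthogonal init with std=1.0, while the hidden layers use the
            # layer_init default std=sqrt(2) and the actor's output layer below uses std=0.01,
            # which keeps the initial policy close to uniform over actions.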
layer_init(nn.Linear(64, 1), std=1.0), 103 | ) 104 | self.actor = nn.Sequential( 105 | layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), 106 | nn.Tanh(), 107 | layer_init(nn.Linear(64, 64)), 108 | nn.Tanh(), 109 | layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01), 110 | ) 111 | 112 | def get_value(self, x): 113 | return self.critic(x) 114 | 115 | def get_action_and_value(self, x, action=None): 116 | # 接收状态x作为输入,并输出动作的未归一化的对数概率(logits) 117 | logits = self.actor(x) 118 | # 使用未归一化的对数概率创建一个Categorical分布对象 119 | probs = Categorical(logits=logits) 120 | if action is None: 121 | # 根据概率分布随机采样一个动作 122 | action = probs.sample() 123 | # 动作(action)、动作的对数概率(log_prob(action))、概率分布的熵(entropy())以及通过值函数网络(critic)对状态x的值函数估计 124 | # 动作的对数概率(log_prob(action))是指给定一个动作,根据策略网络输出的概率分布,计算该动作的对数概率值 125 | return action, probs.log_prob(action), probs.entropy(), self.critic(x) 126 | 127 | 128 | if __name__ == "__main__": 129 | args = parse_args() 130 | run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" 131 | writer = SummaryWriter(f"runs/{run_name}") 132 | writer.add_text( 133 | "hyperparameters", 134 | "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), 135 | ) 136 | # TRY NOT TO MODIFY: seeding 137 | random.seed(args.seed) 138 | np.random.seed(args.seed) 139 | torch.manual_seed(args.seed) 140 | torch.backends.cudnn.deterministic = args.torch_deterministic 141 | device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") 142 | # env setup 143 | envs = gym.vector.SyncVectorEnv( 144 | [make_env(args.env_id, args.seed + i) for i in range(args.num_envs)] 145 | ) 146 | assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported" 147 | agent = Agent(envs).to(device) 148 | optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) 149 | # ALGO Logic: Storage setup 150 | obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) 151 | actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) 152 | logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) 153 | rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) 154 | dones = torch.zeros((args.num_steps, args.num_envs)).to(device) 155 | values = torch.zeros((args.num_steps, args.num_envs)).to(device) 156 | # TRY NOT TO MODIFY: start the game 157 | global_step = 0 158 | start_time = time.time() 159 | next_obs = torch.Tensor(envs.reset()).to(device) 160 | next_done = torch.zeros(args.num_envs).to(device) 161 | num_updates = args.total_timesteps // args.batch_size # 500000 // (4*128) = 976 162 | for update in range(1, num_updates + 1): # update 从1 到 976 163 | # Annealing the rate if instructed to do so. 
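# Equivalently: lr_t = learning_rate * (1 - (update - 1) / num_updates); the learning rate
# decays linearly from its initial value at update 1 toward 0 at the final update.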
164 | if args.anneal_lr: 165 | # anneal_lr 是 Learning Rate Annealing 学习率退火 166 | # 学习率退火是一种训练过程中动态调整学习率的技术。它通常会在训练的早期使用较大的学习率以加快收敛速度, 167 | # 然后逐渐降低学习率,让模型在训练后期更加稳定地收敛或探索更细致的参数空间。 168 | frac = 1.0 - (update - 1.0) / num_updates 169 | lrnow = frac * args.learning_rate 170 | optimizer.param_groups[0]["lr"] = lrnow 171 | # 每个环境执行128个step 172 | for step in range(0, args.num_steps): 173 | global_step += 1 * args.num_envs 174 | obs[step] = next_obs 175 | dones[step] = next_done 176 | # ALGO LOGIC: action logic 177 | with torch.no_grad(): 178 | action, logprob, _, value = agent.get_action_and_value(next_obs) 179 | values[step] = value.flatten() 180 | actions[step] = action 181 | logprobs[step] = logprob 182 | # TRY NOT TO MODIFY: execute the game and log data. 183 | next_obs, reward, done, info = envs.step(action.cpu().numpy()) 184 | rewards[step] = torch.tensor(reward).to(device).view(-1) 185 | next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device) 186 | for item in info: 187 | if "episode" in item.keys(): 188 | print(f"global_step={global_step}, episodic_return={item['episode']['r']}") 189 | writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step) 190 | writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step) 191 | break 192 | # bootstrap value if not done 193 | # 用于计算优势函数(advantages)和返回值(returns) 194 | with torch.no_grad(): 195 | next_value = agent.get_value(next_obs).reshape(1, -1) 196 | advantages = torch.zeros_like(rewards).to(device) 197 | # 初始化lastgaelam变量为0,用于计算GAE(Generalized Advantage Estimation)中的累积因子 198 | lastgaelam = 0 199 | for t in reversed(range(args.num_steps)): 200 | if t == args.num_steps - 1: 201 | nextnonterminal = 1.0 - next_done 202 | nextvalues = next_value 203 | else: 204 | nextnonterminal = 1.0 - dones[t + 1] 205 | nextvalues = values[t + 1] 206 | delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] 207 | advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam 208 | returns = advantages + values 209 | 210 | # flatten the batch 211 | # 把(128,4,4)维度转换为(512,4) 212 | # 相当于一共512个step的数据 213 | b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) 214 | b_logprobs = logprobs.reshape(-1) 215 | b_actions = actions.reshape((-1,) + envs.single_action_space.shape) 216 | b_advantages = advantages.reshape(-1) 217 | b_returns = returns.reshape(-1) 218 | b_values = values.reshape(-1) 219 | 220 | # Optimizing the policy and value network 221 | b_inds = np.arange(args.batch_size) # batch_size = 4*128 b_inds = [0,1,2....510,511] 222 | # 用于存储每个批次的clip fraction值 223 | clipfracs = [] 224 | for epoch in range(args.update_epochs): # update_epochs = 4 225 | # 随机打乱b_inds数组中的元素顺序,以便每个epoch中随机选择训练样本。 226 | np.random.shuffle(b_inds) 227 | # 将训练样本划分为多个大小为args.minibatch_size = 128的小批次 228 | # 其中start和end是小批次的起始索引和结束索引 229 | # mb_inds是当前小批次中样本的索引。 230 | for start in range(0, args.batch_size, args.minibatch_size): # minibatch_size = 128 231 | # start = 0, 128, 256, 384 232 | end = start + args.minibatch_size 233 | mb_inds = b_inds[start:end] 234 | # 根据输入的观察和动作,获取新的对数概率(newlogprob),策略熵(entropy)和值函数估计值(newvalue) 235 | _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds]) 236 | logratio = newlogprob - b_logprobs[mb_inds] 237 | ratio = logratio.exp() 238 | 239 | # "clip fraction"(裁剪比例)是指在使用PPO算法进行优化时,计算出的近似策略比率在被裁剪范围之外的比例。 240 | # 在PPO算法中,为了限制每次更新的策略变化幅度,会使用一个裁剪系数(clip coefficient) 241 | # 
如果策略比率(新的概率与旧的概率之比)超过了裁剪系数范围之外,那么它就会被裁剪到该范围内 242 | # 裁剪后的策略比率被用于计算策略损失 243 | # "clip fraction"是指裁剪后的策略比率超过裁剪系数的比例 244 | # 它表示了在训练过程中有多少比例的策略比率被裁剪到了裁剪范围内 245 | # 通常,我们希望裁剪比例较低,即大部分策略比率都处于裁剪范围内 246 | # 较低的裁剪比例表明策略更新的幅度较小,收敛性更好。因此,观察和监控裁剪比例可以帮助我们了解模型训练的稳定性和效果 247 | # 计算旧的近似KL散度(old_approx_kl)和新的近似KL散度(approx_kl) 248 | with torch.no_grad(): 249 | # calculate approx_kl http://joschu.net/blog/kl-approx.html 250 | old_approx_kl = (-logratio).mean() 251 | approx_kl = ((ratio - 1) - logratio).mean() 252 | clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] 253 | 254 | mb_advantages = b_advantages[mb_inds] 255 | if args.norm_adv: 256 | mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) 257 | 258 | # Policy loss 259 | pg_loss1 = -mb_advantages * ratio 260 | pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) 261 | pg_loss = torch.max(pg_loss1, pg_loss2).mean() 262 | 263 | # Value loss 264 | newvalue = newvalue.view(-1) 265 | if args.clip_vloss: 266 | v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 267 | v_clipped = b_values[mb_inds] + torch.clamp( 268 | newvalue - b_values[mb_inds], 269 | -args.clip_coef, 270 | args.clip_coef, 271 | ) 272 | v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 273 | v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) 274 | v_loss = 0.5 * v_loss_max.mean() 275 | else: 276 | v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() 277 | 278 | entropy_loss = entropy.mean() 279 | loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef 280 | 281 | optimizer.zero_grad() 282 | loss.backward() 283 | nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) 284 | optimizer.step() 285 | 286 | if args.target_kl is not None: 287 | if approx_kl > args.target_kl: 288 | break 289 | 290 | y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() 291 | var_y = np.var(y_true) 292 | explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y 293 | 294 | # TRY NOT TO MODIFY: record rewards for plotting purposes 295 | writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) 296 | writer.add_scalar("losses/value_loss", v_loss.item(), global_step) 297 | writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) 298 | writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) 299 | writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) 300 | writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) 301 | writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) 302 | writer.add_scalar("losses/explained_variance", explained_var, global_step) 303 | print("SPS:", int(global_step / (time.time() - start_time))) 304 | writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) 305 | 306 | envs.close() 307 | writer.close() 308 | -------------------------------------------------------------------------------- /PPO_cleanrl_atari.py: -------------------------------------------------------------------------------- 1 | # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_ataripy 2 | import argparse 3 | import os 4 | import random 5 | import time 6 | from distutils.util import strtobool 7 | 8 | import gym 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | from torch.distributions.categorical import Categorical 14 | from 
torch.utils.tensorboard import SummaryWriter 15 | 16 | from stable_baselines3.common.atari_wrappers import ( # isort:skip 17 | ClipRewardEnv, 18 | EpisodicLifeEnv, 19 | FireResetEnv, 20 | MaxAndSkipEnv, 21 | NoopResetEnv, 22 | ) 23 | 24 | 25 | def parse_args(): 26 | # fmt: off 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"), 29 | help="the name of this experiment") 30 | parser.add_argument("--seed", type=int, default=1, 31 | help="seed of the experiment") 32 | parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 33 | help="if toggled, `torch.backends.cudnn.deterministic=False`") 34 | parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 35 | help="if toggled, cuda will be enabled by default") 36 | parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True, 37 | help="if toggled, this experiment will be tracked with Weights and Biases") 38 | parser.add_argument("--wandb-project-name", type=str, default="cleanRL", 39 | help="the wandb's project name") 40 | parser.add_argument("--wandb-entity", type=str, default=None, 41 | help="the entity (team) of wandb's project") 42 | parser.add_argument("--capture-video", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True, 43 | help="whether to capture videos of the agent performances (check out `videos` folder)") 44 | 45 | # Algorithm specific arguments 46 | parser.add_argument("--env-id", type=str, default="BreakoutNoFrameskip-v4", 47 | help="the id of the environment") 48 | parser.add_argument("--total-timesteps", type=int, default=10000000, 49 | help="total timesteps of the experiments") 50 | parser.add_argument("--learning-rate", type=float, default=2.5e-4, 51 | help="the learning rate of the optimizer") 52 | parser.add_argument("--num-envs", type=int, default=8, 53 | help="the number of parallel game environments") 54 | parser.add_argument("--num-steps", type=int, default=128, 55 | help="the number of steps to run in each environment per policy rollout") 56 | parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 57 | help="Toggle learning rate annealing for policy and value networks") 58 | parser.add_argument("--gamma", type=float, default=0.99, 59 | help="the discount factor gamma") 60 | parser.add_argument("--gae-lambda", type=float, default=0.95, 61 | help="the lambda for the general advantage estimation") 62 | parser.add_argument("--num-minibatches", type=int, default=4, 63 | help="the number of mini-batches") 64 | parser.add_argument("--update-epochs", type=int, default=4, 65 | help="the K epochs to update the policy") 66 | parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 67 | help="Toggles advantages normalization") 68 | parser.add_argument("--clip-coef", type=float, default=0.1, 69 | help="the surrogate clipping coefficient") 70 | parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 71 | help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") 72 | parser.add_argument("--ent-coef", type=float, default=0.01, 73 | help="coefficient of the entropy") 74 | parser.add_argument("--vf-coef", type=float, default=0.5, 75 | help="coefficient of the value function") 76 | 
parser.add_argument("--max-grad-norm", type=float, default=0.5, 77 | help="the maximum norm for the gradient clipping") 78 | parser.add_argument("--target-kl", type=float, default=None, 79 | help="the target KL divergence threshold") 80 | args = parser.parse_args() 81 | args.batch_size = int(args.num_envs * args.num_steps) 82 | args.minibatch_size = int(args.batch_size // args.num_minibatches) 83 | # fmt: on 84 | return args 85 | 86 | 87 | def make_env(env_id, seed, idx, capture_video, run_name): 88 | def thunk(): 89 | env = gym.make(env_id) 90 | env = gym.wrappers.RecordEpisodeStatistics(env) 91 | if capture_video: 92 | if idx == 0: 93 | env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") 94 | env = NoopResetEnv(env, noop_max=30) 95 | env = MaxAndSkipEnv(env, skip=4) 96 | env = EpisodicLifeEnv(env) 97 | if "FIRE" in env.unwrapped.get_action_meanings(): 98 | env = FireResetEnv(env) 99 | env = ClipRewardEnv(env) 100 | env = gym.wrappers.ResizeObservation(env, (84, 84)) 101 | env = gym.wrappers.GrayScaleObservation(env) 102 | env = gym.wrappers.FrameStack(env, 4) 103 | env.seed(seed) 104 | env.action_space.seed(seed) 105 | env.observation_space.seed(seed) 106 | return env 107 | 108 | return thunk 109 | 110 | 111 | def layer_init(layer, std=np.sqrt(2), bias_const=0.0): 112 | torch.nn.init.orthogonal_(layer.weight, std) 113 | torch.nn.init.constant_(layer.bias, bias_const) 114 | return layer 115 | 116 | 117 | class Agent(nn.Module): 118 | def __init__(self, envs): 119 | super().__init__() 120 | self.network = nn.Sequential( 121 | layer_init(nn.Conv2d(4, 32, 8, stride=4)), 122 | nn.ReLU(), 123 | layer_init(nn.Conv2d(32, 64, 4, stride=2)), 124 | nn.ReLU(), 125 | layer_init(nn.Conv2d(64, 64, 3, stride=1)), 126 | nn.ReLU(), 127 | nn.Flatten(), 128 | layer_init(nn.Linear(64 * 7 * 7, 512)), 129 | nn.ReLU(), 130 | ) 131 | self.actor = layer_init(nn.Linear(512, envs.single_action_space.n), std=0.01) 132 | self.critic = layer_init(nn.Linear(512, 1), std=1) 133 | 134 | def get_value(self, x): 135 | return self.critic(self.network(x / 255.0)) 136 | 137 | def get_action_and_value(self, x, action=None): 138 | hidden = self.network(x / 255.0) 139 | logits = self.actor(hidden) 140 | probs = Categorical(logits=logits) 141 | if action is None: 142 | action = probs.sample() 143 | return action, probs.log_prob(action), probs.entropy(), self.critic(hidden) 144 | 145 | 146 | if __name__ == "__main__": 147 | args = parse_args() 148 | run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" 149 | if args.track: 150 | import wandb 151 | 152 | wandb.init( 153 | project=args.wandb_project_name, 154 | entity=args.wandb_entity, 155 | sync_tensorboard=True, 156 | config=vars(args), 157 | name=run_name, 158 | monitor_gym=True, 159 | save_code=True, 160 | ) 161 | writer = SummaryWriter(f"runs/{run_name}") 162 | writer.add_text( 163 | "hyperparameters", 164 | "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), 165 | ) 166 | 167 | # TRY NOT TO MODIFY: seeding 168 | random.seed(args.seed) 169 | np.random.seed(args.seed) 170 | torch.manual_seed(args.seed) 171 | torch.backends.cudnn.deterministic = args.torch_deterministic 172 | 173 | device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") 174 | 175 | # env setup 176 | envs = gym.vector.SyncVectorEnv( 177 | [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)] 178 | ) 179 | assert isinstance(envs.single_action_space, 
gym.spaces.Discrete), "only discrete action space is supported" 180 | 181 | agent = Agent(envs).to(device) 182 | optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) 183 | 184 | # ALGO Logic: Storage setup 185 | obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) 186 | actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) 187 | logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) 188 | rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) 189 | dones = torch.zeros((args.num_steps, args.num_envs)).to(device) 190 | values = torch.zeros((args.num_steps, args.num_envs)).to(device) 191 | 192 | # TRY NOT TO MODIFY: start the game 193 | global_step = 0 194 | start_time = time.time() 195 | next_obs = torch.Tensor(envs.reset()).to(device) 196 | next_done = torch.zeros(args.num_envs).to(device) 197 | num_updates = args.total_timesteps // args.batch_size 198 | 199 | for update in range(1, num_updates + 1): 200 | # Annealing the rate if instructed to do so. 201 | if args.anneal_lr: 202 | frac = 1.0 - (update - 1.0) / num_updates 203 | lrnow = frac * args.learning_rate 204 | optimizer.param_groups[0]["lr"] = lrnow 205 | 206 | for step in range(0, args.num_steps): 207 | global_step += 1 * args.num_envs 208 | obs[step] = next_obs 209 | dones[step] = next_done 210 | 211 | # ALGO LOGIC: action logic 212 | with torch.no_grad(): 213 | action, logprob, _, value = agent.get_action_and_value(next_obs) 214 | values[step] = value.flatten() 215 | actions[step] = action 216 | logprobs[step] = logprob 217 | 218 | # TRY NOT TO MODIFY: execute the game and log data. 219 | next_obs, reward, done, info = envs.step(action.cpu().numpy()) 220 | rewards[step] = torch.tensor(reward).to(device).view(-1) 221 | next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device) 222 | 223 | for item in info: 224 | if "episode" in item.keys(): 225 | print(f"global_step={global_step}, episodic_return={item['episode']['r']}") 226 | writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step) 227 | writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step) 228 | break 229 | 230 | # bootstrap value if not done 231 | with torch.no_grad(): 232 | next_value = agent.get_value(next_obs).reshape(1, -1) 233 | advantages = torch.zeros_like(rewards).to(device) 234 | lastgaelam = 0 235 | for t in reversed(range(args.num_steps)): 236 | if t == args.num_steps - 1: 237 | nextnonterminal = 1.0 - next_done 238 | nextvalues = next_value 239 | else: 240 | nextnonterminal = 1.0 - dones[t + 1] 241 | nextvalues = values[t + 1] 242 | delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] 243 | advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam 244 | returns = advantages + values 245 | 246 | # flatten the batch 247 | b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) 248 | b_logprobs = logprobs.reshape(-1) 249 | b_actions = actions.reshape((-1,) + envs.single_action_space.shape) 250 | b_advantages = advantages.reshape(-1) 251 | b_returns = returns.reshape(-1) 252 | b_values = values.reshape(-1) 253 | 254 | # Optimizing the policy and value network 255 | b_inds = np.arange(args.batch_size) 256 | clipfracs = [] 257 | for epoch in range(args.update_epochs): 258 | np.random.shuffle(b_inds) 259 | for start in range(0, args.batch_size, args.minibatch_size): 260 | end = start + 
args.minibatch_size 261 | mb_inds = b_inds[start:end] 262 | 263 | _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds]) 264 | logratio = newlogprob - b_logprobs[mb_inds] 265 | ratio = logratio.exp() 266 | 267 | with torch.no_grad(): 268 | # calculate approx_kl http://joschu.net/blog/kl-approx.html 269 | old_approx_kl = (-logratio).mean() 270 | approx_kl = ((ratio - 1) - logratio).mean() 271 | clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] 272 | 273 | mb_advantages = b_advantages[mb_inds] 274 | if args.norm_adv: 275 | mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) 276 | 277 | # Policy loss 278 | pg_loss1 = -mb_advantages * ratio 279 | pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) 280 | pg_loss = torch.max(pg_loss1, pg_loss2).mean() 281 | 282 | # Value loss 283 | newvalue = newvalue.view(-1) 284 | if args.clip_vloss: 285 | v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 286 | v_clipped = b_values[mb_inds] + torch.clamp( 287 | newvalue - b_values[mb_inds], 288 | -args.clip_coef, 289 | args.clip_coef, 290 | ) 291 | v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 292 | v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) 293 | v_loss = 0.5 * v_loss_max.mean() 294 | else: 295 | v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() 296 | 297 | entropy_loss = entropy.mean() 298 | loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef 299 | 300 | optimizer.zero_grad() 301 | loss.backward() 302 | nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) 303 | optimizer.step() 304 | 305 | if args.target_kl is not None: 306 | if approx_kl > args.target_kl: 307 | break 308 | 309 | y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() 310 | var_y = np.var(y_true) 311 | explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y 312 | 313 | # TRY NOT TO MODIFY: record rewards for plotting purposes 314 | writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) 315 | writer.add_scalar("losses/value_loss", v_loss.item(), global_step) 316 | writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) 317 | writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) 318 | writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) 319 | writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) 320 | writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) 321 | writer.add_scalar("losses/explained_variance", explained_var, global_step) 322 | print("SPS:", int(global_step / (time.time() - start_time))) 323 | writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) 324 | 325 | envs.close() 326 | writer.close() -------------------------------------------------------------------------------- /PPO_cleanrl_env1.py: -------------------------------------------------------------------------------- 1 | # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppopy 2 | import argparse 3 | import os 4 | import random 5 | import time 6 | from distutils.util import strtobool 7 | 8 | import gym 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | from torch.distributions.categorical import Categorical 14 | from torch.utils.tensorboard import SummaryWriter 15 | 16 | def parse_args(): 17 
| # fmt: off 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"), 20 | help="the name of this experiment") 21 | parser.add_argument("--seed", type=int, default=1, 22 | help="seed of the experiment") 23 | parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 24 | help="if toggled, `torch.backends.cudnn.deterministic=False`") 25 | parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 26 | help="if toggled, cuda will be enabled by default") 27 | 28 | # Algorithm specific arguments 29 | parser.add_argument("--env-id", type=str, default="CartPole-v1", 30 | help="the id of the environment") 31 | parser.add_argument("--total-timesteps", type=int, default=500000, 32 | help="total timesteps of the experiments") 33 | parser.add_argument("--learning-rate", type=float, default=2.5e-4, 34 | help="the learning rate of the optimizer") 35 | parser.add_argument("--num-envs", type=int, default=1, 36 | help="the number of parallel game environments") 37 | # parser.add_argument("--num-steps", type=int, default=128, 38 | parser.add_argument("--num-steps", type=int, default=512, 39 | help="the number of steps to run in each environment per policy rollout") 40 | parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 41 | help="Toggle learning rate annealing for policy and value networks") 42 | parser.add_argument("--gamma", type=float, default=0.99, 43 | help="the discount factor gamma") 44 | parser.add_argument("--gae-lambda", type=float, default=0.95, 45 | help="the lambda for the general advantage estimation") 46 | parser.add_argument("--num-minibatches", type=int, default=4, 47 | help="the number of mini-batches") 48 | parser.add_argument("--update-epochs", type=int, default=4, 49 | help="the K epochs to update the policy") 50 | parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 51 | help="Toggles advantages normalization") 52 | parser.add_argument("--clip-coef", type=float, default=0.2, 53 | help="the surrogate clipping coefficient") 54 | parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, 55 | help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") 56 | parser.add_argument("--ent-coef", type=float, default=0.01, 57 | help="coefficient of the entropy") 58 | parser.add_argument("--vf-coef", type=float, default=0.5, 59 | help="coefficient of the value function") 60 | parser.add_argument("--max-grad-norm", type=float, default=0.5, 61 | help="the maximum norm for the gradient clipping") 62 | parser.add_argument("--target-kl", type=float, default=None, 63 | help="the target KL divergence threshold") 64 | args = parser.parse_args() 65 | args.batch_size = int(args.num_envs * args.num_steps) # 512 66 | args.minibatch_size = int(args.batch_size // args.num_minibatches) # 512// 4 67 | # fmt: on 68 | return args 69 | 70 | 71 | def make_env(env_id, seed): 72 | def thunk(): 73 | env = gym.make(env_id) 74 | # env = gym.wrappers.RecordEpisodeStatistics(env) 75 | env.seed(seed) 76 | env.action_space.seed(seed) 77 | env.observation_space.seed(seed) 78 | return env 79 | 80 | return thunk 81 | 82 | def layer_init(layer, std=np.sqrt(2), bias_const=0.0): 83 | torch.nn.init.orthogonal_(layer.weight, std) 84 | 
torch.nn.init.constant_(layer.bias, bias_const) 85 | return layer 86 | 87 | 88 | class Agent(nn.Module): 89 | def __init__(self, envs): 90 | super().__init__() 91 | self.critic = nn.Sequential( 92 | layer_init(nn.Linear(envs.observation_space.shape[0], 64)), 93 | nn.Tanh(), 94 | layer_init(nn.Linear(64, 64)), 95 | nn.Tanh(), 96 | layer_init(nn.Linear(64, 1), std=1.0), 97 | ) 98 | self.actor = nn.Sequential( 99 | layer_init(nn.Linear(envs.observation_space.shape[0], 64)), 100 | nn.Tanh(), 101 | layer_init(nn.Linear(64, 64)), 102 | nn.Tanh(), 103 | layer_init(nn.Linear(64, envs.action_space.n), std=0.01), 104 | ) 105 | 106 | def get_value(self, x): 107 | return self.critic(x) 108 | 109 | def get_action_and_value(self, x, action=None): 110 | # 接收状态x作为输入,并输出动作的未归一化的对数概率(logits) 111 | logits = self.actor(x) 112 | # 使用未归一化的对数概率创建一个Categorical分布对象 113 | probs = Categorical(logits=logits) 114 | if action is None: 115 | # 根据概率分布随机采样一个动作 116 | action = probs.sample() 117 | # 动作(action)、动作的对数概率(log_prob(action))、概率分布的熵(entropy())以及通过值函数网络(critic)对状态x的值函数估计 118 | # 动作的对数概率(log_prob(action))是指给定一个动作,根据策略网络输出的概率分布,计算该动作的对数概率值 119 | return action, probs.log_prob(action), probs.entropy(), self.critic(x) 120 | 121 | 122 | if __name__ == "__main__": 123 | args = parse_args() 124 | run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" 125 | writer = SummaryWriter(f"runs/{run_name}") 126 | writer.add_text( 127 | "hyperparameters", 128 | "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), 129 | ) 130 | # TRY NOT TO MODIFY: seeding 131 | random.seed(args.seed) 132 | np.random.seed(args.seed) 133 | torch.manual_seed(args.seed) 134 | torch.backends.cudnn.deterministic = args.torch_deterministic 135 | device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") 136 | # env setup 137 | # envs = gym.vector.SyncVectorEnv( 138 | # [make_env(args.env_id, args.seed + i) for i in range(args.num_envs)] 139 | # ) 140 | envs = gym.make(args.env_id) 141 | assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported" 142 | agent = Agent(envs).to(device) 143 | optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) 144 | # ALGO Logic: Storage setup 145 | obs = torch.zeros((args.num_steps, envs.observation_space.shape[0])).to(device) 146 | actions = torch.zeros((args.num_steps,)).to(device) 147 | logprobs = torch.zeros((args.num_steps,)).to(device) 148 | rewards = torch.zeros((args.num_steps,)).to(device) 149 | dones = torch.zeros((args.num_steps,)).to(device) 150 | values = torch.zeros((args.num_steps,)).to(device) 151 | 152 | # TRY NOT TO MODIFY: start the game 153 | global_step = 0 154 | start_time = time.time() 155 | next_obs = torch.Tensor(envs.reset()).to(device) 156 | next_done = torch.zeros(args.num_envs).to(device) 157 | num_updates = args.total_timesteps // args.batch_size # 500000 // 512 = 976 158 | 159 | episodic_return = 0 160 | episodic_length = 0 161 | for update in range(1, num_updates + 1): # update 从1 到 976 162 | print('**************************************************') 163 | print('第 {} of 976 轮'.format(update)) 164 | print('**************************************************') 165 | # Annealing the rate if instructed to do so. 
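# The linear schedule below gives lr = learning_rate * (1 - (update - 1) / num_updates).
# With the defaults in this file (learning_rate=2.5e-4, num_updates=976): update 1 uses 2.5e-4,
# update 488 uses about 1.25e-4, and the final update uses about 2.5e-4 / 976 ≈ 2.6e-7.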
166 | if args.anneal_lr: 167 | # anneal_lr 是 Learning Rate Annealing 学习率退火 168 | # 学习率退火是一种训练过程中动态调整学习率的技术。它通常会在训练的早期使用较大的学习率以加快收敛速度, 169 | # 然后逐渐降低学习率,让模型在训练后期更加稳定地收敛或探索更细致的参数空间。 170 | frac = 1.0 - (update - 1.0) / num_updates 171 | lrnow = frac * args.learning_rate 172 | optimizer.param_groups[0]["lr"] = lrnow 173 | # 执行512个step 174 | for step in range(0, args.num_steps): 175 | global_step += 1 176 | obs[step] = next_obs 177 | dones[step] = next_done 178 | # ALGO LOGIC: action logic 179 | with torch.no_grad(): 180 | action, logprob, _, value = agent.get_action_and_value(next_obs) 181 | values[step] = value.flatten() 182 | actions[step] = action 183 | logprobs[step] = logprob 184 | # TRY NOT TO MODIFY: execute the game and log data. 185 | next_obs, reward, done, info = envs.step(action.cpu().numpy()) 186 | rewards[step] = torch.tensor(reward).to(device).view(-1) 187 | next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(np.array(done)).to(device) 188 | episodic_return += reward 189 | episodic_length += 1 190 | if done == True: 191 | # 计算回合的奖励和长度 192 | next_obs = torch.Tensor(envs.reset()).to(device) 193 | print(f"global_step={global_step}, episodic_return={episodic_return}") 194 | writer.add_scalar("charts/episodic_return", episodic_return, global_step) 195 | writer.add_scalar("charts/episodic_length", episodic_length, global_step) 196 | episodic_return = 0 197 | episodic_length = 0 198 | print('--------------------------------------------------') 199 | print('第 {} of 976 轮 采样完毕数据'.format(update)) 200 | print('--------------------------------------------------') 201 | 202 | # bootstrap value if not done 203 | # 用于计算优势函数(advantages)和返回值(returns) 204 | with torch.no_grad(): 205 | next_value = agent.get_value(next_obs).reshape(1, -1) 206 | advantages = torch.zeros_like(rewards).to(device) 207 | # 初始化lastgaelam变量为0,用于计算GAE(Generalized Advantage Estimation)中的累积因子 208 | lastgaelam = 0 209 | for t in reversed(range(args.num_steps)): 210 | if t == args.num_steps - 1: 211 | nextnonterminal = 1.0 - next_done 212 | nextvalues = next_value 213 | else: 214 | nextnonterminal = 1.0 - dones[t + 1] 215 | nextvalues = values[t + 1] 216 | delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] 217 | advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam 218 | returns = advantages + values 219 | 220 | # flatten the batch 221 | # 222 | # 相当于一共512个step的数据 223 | b_obs = obs.reshape((-1,) + envs.observation_space.shape) 224 | b_logprobs = logprobs.reshape(-1) 225 | b_actions = actions.reshape((-1,) + envs.action_space.shape) 226 | b_advantages = advantages.reshape(-1) 227 | b_returns = returns.reshape(-1) 228 | b_values = values.reshape(-1) 229 | 230 | # Optimizing the policy and value network 231 | b_inds = np.arange(args.batch_size) # batch_size = 512 b_inds = [0,1,2....,511] 232 | # 用于存储每个批次的clip fraction值 233 | clipfracs = [] 234 | for epoch in range(args.update_epochs): # update_epochs = 4 235 | # 随机打乱b_inds数组中的元素顺序,以便每个epoch中随机选择训练样本。 236 | np.random.shuffle(b_inds) 237 | # 将训练样本划分为多个大小为args.minibatch_size = 128的小批次 238 | # 其中start和end是小批次的起始索引和结束索引 239 | # mb_inds是当前小批次中样本的索引。 240 | for start in range(0, args.batch_size, args.minibatch_size): # minibatch_size = 128 241 | # start = 0, 128, 256, 384 242 | end = start + args.minibatch_size 243 | mb_inds = b_inds[start:end] 244 | # 根据输入的观察和动作,获取新的对数概率(newlogprob),策略熵(entropy)和值函数估计值(newvalue) 245 | _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], 246 | 
b_actions.long()[mb_inds]) 247 | logratio = newlogprob - b_logprobs[mb_inds] 248 | ratio = logratio.exp() 249 | 250 | # "clip fraction"(裁剪比例)是指在使用PPO算法进行优化时,计算出的近似策略比率在被裁剪范围之外的比例。 251 | # 在PPO算法中,为了限制每次更新的策略变化幅度,会使用一个裁剪系数(clip coefficient) 252 | # 如果策略比率(新的概率与旧的概率之比)超过了裁剪系数范围之外,那么它就会被裁剪到该范围内 253 | # 裁剪后的策略比率被用于计算策略损失 254 | # "clip fraction"是指裁剪后的策略比率超过裁剪系数的比例 255 | # 它表示了在训练过程中有多少比例的策略比率被裁剪到了裁剪范围内 256 | # 通常,我们希望裁剪比例较低,即大部分策略比率都处于裁剪范围内 257 | # 较低的裁剪比例表明策略更新的幅度较小,收敛性更好。因此,观察和监控裁剪比例可以帮助我们了解模型训练的稳定性和效果 258 | # 计算旧的近似KL散度(old_approx_kl)和新的近似KL散度(approx_kl) 259 | with torch.no_grad(): 260 | # calculate approx_kl http://joschu.net/blog/kl-approx.html 261 | old_approx_kl = (-logratio).mean() 262 | approx_kl = ((ratio - 1) - logratio).mean() 263 | clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] 264 | 265 | mb_advantages = b_advantages[mb_inds] 266 | if args.norm_adv: 267 | mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) 268 | 269 | # Policy loss 270 | pg_loss1 = -mb_advantages * ratio 271 | pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) 272 | pg_loss = torch.max(pg_loss1, pg_loss2).mean() 273 | 274 | # Value loss 275 | newvalue = newvalue.view(-1) 276 | if args.clip_vloss: 277 | v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 278 | v_clipped = b_values[mb_inds] + torch.clamp( 279 | newvalue - b_values[mb_inds], 280 | -args.clip_coef, 281 | args.clip_coef, 282 | ) 283 | v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 284 | v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) 285 | v_loss = 0.5 * v_loss_max.mean() 286 | else: 287 | v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() 288 | 289 | entropy_loss = entropy.mean() 290 | loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef 291 | 292 | optimizer.zero_grad() 293 | loss.backward() 294 | nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) 295 | optimizer.step() 296 | 297 | if args.target_kl is not None: 298 | if approx_kl > args.target_kl: 299 | break 300 | 301 | y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() 302 | var_y = np.var(y_true) 303 | explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y 304 | 305 | # TRY NOT TO MODIFY: record rewards for plotting purposes 306 | writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) 307 | writer.add_scalar("losses/value_loss", v_loss.item(), global_step) 308 | writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) 309 | writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) 310 | writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) 311 | writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) 312 | writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) 313 | writer.add_scalar("losses/explained_variance", explained_var, global_step) 314 | print("SPS:", int(global_step / (time.time() - start_time))) 315 | writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) 316 | 317 | envs.close() 318 | writer.close() 319 | 320 | 321 | -------------------------------------------------------------------------------- /PolicyIteration.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | class CliffWalkingEnv: 5 | """ 悬崖漫步环境""" 6 | def __init__(self, ncol=12, nrow=4): 7 | self.ncol = ncol # 定义网格世界的列 8 | self.nrow 
= nrow # 定义网格世界的行 9 | # 转移矩阵P[state][action] = [(p, next_state, reward, done)]包含下一个状态和奖励 10 | self.P = self.createP() 11 | 12 | def createP(self): 13 | # 初始化 14 | P = [[[] for j in range(4)] for i in range(self.nrow * self.ncol)] 15 | # 4种动作, change[0]:上,change[1]:下, change[2]:左, change[3]:右。坐标系原点(0,0) 16 | # 定义在左上角 17 | change = [[0, -1], [0, 1], [-1, 0], [1, 0]] 18 | for i in range(self.nrow): 19 | for j in range(self.ncol): 20 | for a in range(4): 21 | # 位置在悬崖或者目标状态,因为无法继续交互,任何动作奖励都为0 22 | if i == self.nrow - 1 and j > 0: 23 | P[i * self.ncol + j][a] = [(1, i * self.ncol + j, 0, 24 | True)] 25 | continue 26 | # 其他位置 27 | next_x = min(self.ncol - 1, max(0, j + change[a][0])) 28 | next_y = min(self.nrow - 1, max(0, i + change[a][1])) 29 | next_state = next_y * self.ncol + next_x 30 | reward = -1 31 | done = False 32 | # 下一个位置在悬崖或者终点 33 | if next_y == self.nrow - 1 and next_x > 0: 34 | done = True 35 | if next_x != self.ncol - 1: # 下一个位置在悬崖 36 | reward = -100 37 | P[i * self.ncol + j][a] = [(1, next_state, reward, done)] 38 | return P 39 | 40 | class PolicyIteration: 41 | """ 策略迭代算法 """ 42 | def __init__(self, env, theta, gamma): 43 | self.env = env 44 | self.v = [0] * self.env.ncol * self.env.nrow # 初始化价值为0 45 | self.pi = [[0.25, 0.25, 0.25, 0.25] 46 | for i in range(self.env.ncol * self.env.nrow)] # 初始化为均匀随机策略 47 | self.theta = theta # 策略评估收敛阈值 48 | self.gamma = gamma # 折扣因子 49 | 50 | def policy_evaluation(self): # 策略评估 51 | cnt = 1 # 计数器 52 | while 1: 53 | max_diff = 0 54 | new_v = [0] * self.env.ncol * self.env.nrow 55 | for s in range(self.env.ncol * self.env.nrow): 56 | qsa_list = [] # 开始计算状态s下的所有Q(s,a)价值 57 | for a in range(4): 58 | qsa = 0 59 | for res in self.env.P[s][a]: 60 | p, next_state, r, done = res 61 | qsa += p * (r + self.gamma * self.v[next_state] * 62 | (1 - done)) 63 | # 本章环境比较特殊,奖励和下一个状态有关,所以需要和状态转移概率相乘 64 | qsa_list.append(self.pi[s][a] * qsa) 65 | new_v[s] = sum(qsa_list) # 状态价值函数和动作价值函数之间的关系 66 | max_diff = max(max_diff, abs(new_v[s] - self.v[s])) 67 | self.v = new_v 68 | if max_diff < self.theta: break # 满足收敛条件,退出评估迭代 69 | cnt += 1 70 | print("策略评估进行%d轮后完成" % cnt) 71 | 72 | def policy_improvement(self): # 策略提升 73 | for s in range(self.env.nrow * self.env.ncol): 74 | qsa_list = [] 75 | for a in range(4): 76 | qsa = 0 77 | for res in self.env.P[s][a]: 78 | p, next_state, r, done = res 79 | qsa += p * (r + self.gamma * self.v[next_state] * 80 | (1 - done)) 81 | qsa_list.append(qsa) 82 | maxq = max(qsa_list) 83 | cntq = qsa_list.count(maxq) # 计算有几个动作得到了最大的Q值 84 | # 让这些动作均分概率 85 | self.pi[s] = [1 / cntq if q == maxq else 0 for q in qsa_list] 86 | print("策略提升完成") 87 | return self.pi 88 | 89 | def policy_iteration(self): # 策略迭代 90 | while 1: 91 | self.policy_evaluation() 92 | old_pi = copy.deepcopy(self.pi) # 将列表进行深拷贝,方便接下来进行比较 93 | new_pi = self.policy_improvement() 94 | if old_pi == new_pi: break 95 | 96 | def print_agent(agent, action_meaning, disaster=[], end=[]): 97 | print("状态价值:") 98 | for i in range(agent.env.nrow): 99 | for j in range(agent.env.ncol): 100 | # 为了输出美观,保持输出6个字符 101 | print('%6.6s' % ('%.3f' % agent.v[i * agent.env.ncol + j]), 102 | end=' ') 103 | print() 104 | 105 | print("策略:") 106 | for i in range(agent.env.nrow): 107 | for j in range(agent.env.ncol): 108 | # 一些特殊的状态,例如悬崖漫步中的悬崖 109 | if (i * agent.env.ncol + j) in disaster: 110 | print('****', end=' ') 111 | elif (i * agent.env.ncol + j) in end: # 目标状态 112 | print('EEEE', end=' ') 113 | else: 114 | a = agent.pi[i * agent.env.ncol + j] 115 | pi_str = '' 116 | for k in range(len(action_meaning)): 
117 | pi_str += action_meaning[k] if a[k] > 0 else 'o' 118 | print(pi_str, end=' ') 119 | print() 120 | 121 | 122 | env = CliffWalkingEnv() 123 | action_meaning = ['^', 'v', '<', '>'] 124 | theta = 0.001 125 | gamma = 0.9 126 | agent = PolicyIteration(env, theta, gamma) 127 | agent.policy_iteration() 128 | print_agent(agent, action_meaning, list(range(37, 47)), [47]) 129 | 130 | 131 | class ValueIteration: 132 | """ 价值迭代算法 """ 133 | def __init__(self, env, theta, gamma): 134 | self.env = env 135 | self.v = [0] * self.env.ncol * self.env.nrow # 初始化价值为0 136 | self.theta = theta # 价值收敛阈值 137 | self.gamma = gamma 138 | # 价值迭代结束后得到的策略 139 | self.pi = [None for i in range(self.env.ncol * self.env.nrow)] 140 | 141 | def value_iteration(self): 142 | cnt = 0 143 | while 1: 144 | max_diff = 0 145 | new_v = [0] * self.env.ncol * self.env.nrow 146 | for s in range(self.env.ncol * self.env.nrow): 147 | qsa_list = [] # 开始计算状态s下的所有Q(s,a)价值 148 | for a in range(4): 149 | qsa = 0 150 | for res in self.env.P[s][a]: 151 | p, next_state, r, done = res 152 | qsa += p * (r + self.gamma * self.v[next_state] * 153 | (1 - done)) 154 | qsa_list.append(qsa) # 这一行和下一行代码是价值迭代和策略迭代的主要区别 155 | new_v[s] = max(qsa_list) 156 | max_diff = max(max_diff, abs(new_v[s] - self.v[s])) 157 | self.v = new_v 158 | if max_diff < self.theta: break # 满足收敛条件,退出评估迭代 159 | cnt += 1 160 | print("价值迭代一共进行%d轮" % cnt) 161 | self.get_policy() 162 | 163 | def get_policy(self): # 根据价值函数导出一个贪婪策略 164 | for s in range(self.env.nrow * self.env.ncol): 165 | qsa_list = [] 166 | for a in range(4): 167 | qsa = 0 168 | for res in self.env.P[s][a]: 169 | p, next_state, r, done = res 170 | qsa += r + p * self.gamma * self.v[next_state] * (1 - done) 171 | qsa_list.append(qsa) 172 | maxq = max(qsa_list) 173 | cntq = qsa_list.count(maxq) # 计算有几个动作得到了最大的Q值 174 | # 让这些动作均分概率 175 | self.pi[s] = [1 / cntq if q == maxq else 0 for q in qsa_list] 176 | 177 | 178 | # env = CliffWalkingEnv() 179 | # action_meaning = ['^', 'v', '<', '>'] 180 | # theta = 0.001 181 | # gamma = 0.9 182 | # agent = ValueIteration(env, theta, gamma) 183 | # agent.value_iteration() 184 | # print_agent(agent, action_meaning, list(range(37, 47)), [47]) 185 | 186 | 187 | 188 | # 价值迭代一共进行14轮 189 | # 状态价值: 190 | # -7.712 -7.458 -7.176 -6.862 -6.513 -6.126 -5.695 -5.217 -4.686 -4.095 -3.439 -2.710 191 | # -7.458 -7.176 -6.862 -6.513 -6.126 -5.695 -5.217 -4.686 -4.095 -3.439 -2.710 -1.900 192 | # -7.176 -6.862 -6.513 -6.126 -5.695 -5.217 -4.686 -4.095 -3.439 -2.710 -1.900 -1.000 193 | # -7.458 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 194 | # 策略: 195 | # ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovoo 196 | # ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovo> ovoo 197 | # ooo> ooo> ooo> ooo> ooo> ooo> ooo> ooo> ooo> ooo> ooo> ovoo 198 | # ^ooo **** **** **** **** **** **** **** **** **** **** EEEE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dsx-rl 2 | 动手学强化学习代码(pycharm打开) 3 | 代码参考自:https://github.com/boyu-ai/Hands-on-RL 4 | -------------------------------------------------------------------------------- /REINFORCE.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from tqdm import tqdm 7 | import rl_utils 8 | 9 | class PolicyNet(torch.nn.Module): 10 | def 
__init__(self, state_dim, hidden_dim, action_dim): 11 | super(PolicyNet, self).__init__() 12 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 13 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 14 | 15 | def forward(self, x): 16 | x = F.relu(self.fc1(x)) 17 | return F.softmax(self.fc2(x), dim=1) # 0是对列做归一化,1是对行做归一化 18 | 19 | class REINFORCE: 20 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 21 | device): 22 | self.policy_net = PolicyNet(state_dim, hidden_dim, 23 | action_dim).to(device) 24 | self.optimizer = torch.optim.Adam(self.policy_net.parameters(), 25 | lr=learning_rate) # 使用Adam优化器 26 | self.gamma = gamma # 折扣因子 27 | self.device = device 28 | 29 | def take_action(self, state): # 根据动作概率分布随机采样 30 | state = torch.tensor([state], dtype=torch.float).to(self.device) # 1*4 31 | probs = self.policy_net(state) # 1*2 32 | action_dist = torch.distributions.Categorical(probs) 33 | action = action_dist.sample() 34 | return action.item() 35 | 36 | def update(self, transition_dict): 37 | reward_list = transition_dict['rewards'] 38 | state_list = transition_dict['states'] 39 | action_list = transition_dict['actions'] 40 | 41 | G = 0 42 | self.optimizer.zero_grad() 43 | for i in reversed(range(len(reward_list))): # 从最后一步算起 44 | reward = reward_list[i] 45 | state = torch.tensor([state_list[i]], # 1*4 46 | dtype=torch.float).to(self.device) 47 | action = torch.tensor([action_list[i]]).view(-1, 1).to(self.device) # 1*1 48 | log_prob = torch.log(self.policy_net(state).gather(1, action)) # 1*1 49 | G = self.gamma * G + reward 50 | loss = -log_prob * G # 每一步的损失函数 51 | loss.backward() # 反向传播计算梯度 52 | self.optimizer.step() # 梯度下降 53 | 54 | 55 | learning_rate = 1e-3 56 | num_episodes = 1000 57 | hidden_dim = 128 58 | gamma = 0.98 59 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 60 | "cpu") 61 | 62 | env_name = "CartPole-v0" 63 | env = gym.make(env_name) 64 | env.seed(0) 65 | torch.manual_seed(0) 66 | state_dim = env.observation_space.shape[0] 67 | action_dim = env.action_space.n 68 | agent = REINFORCE(state_dim, hidden_dim, action_dim, learning_rate, gamma, 69 | device) 70 | 71 | return_list = [] 72 | for i in range(10): 73 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 74 | for i_episode in range(int(num_episodes / 10)): 75 | episode_return = 0 76 | transition_dict = { 77 | 'states': [], 78 | 'actions': [], 79 | 'next_states': [], 80 | 'rewards': [], 81 | 'dones': [] 82 | } 83 | state = env.reset() 84 | done = False 85 | while not done: 86 | action = agent.take_action(state) 87 | next_state, reward, done, _ = env.step(action) 88 | transition_dict['states'].append(state) 89 | transition_dict['actions'].append(action) 90 | transition_dict['next_states'].append(next_state) 91 | transition_dict['rewards'].append(reward) 92 | transition_dict['dones'].append(done) 93 | state = next_state 94 | episode_return += reward 95 | return_list.append(episode_return) 96 | agent.update(transition_dict) 97 | if (i_episode + 1) % 10 == 0: 98 | pbar.set_postfix({ 99 | 'episode': 100 | '%d' % (num_episodes / 10 * i + i_episode + 1), 101 | 'return': 102 | '%.3f' % np.mean(return_list[-10:]) 103 | }) 104 | pbar.update(1) 105 | 106 | episodes_list = list(range(len(return_list))) 107 | plt.plot(episodes_list, return_list) 108 | plt.xlabel('Episodes') 109 | plt.ylabel('Returns') 110 | plt.title('REINFORCE on {}'.format(env_name)) 111 | plt.show() 112 | 113 | mv_return = rl_utils.moving_average(return_list, 9) 114 | 
plt.plot(episodes_list, mv_return) 115 | plt.xlabel('Episodes') 116 | plt.ylabel('Returns') 117 | plt.title('REINFORCE on {}'.format(env_name)) 118 | plt.show() -------------------------------------------------------------------------------- /SAC-continue.py: -------------------------------------------------------------------------------- 1 | import random 2 | import gym 3 | import numpy as np 4 | from tqdm import tqdm 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.distributions import Normal 8 | import matplotlib.pyplot as plt 9 | import rl_utils 10 | 11 | class PolicyNetContinuous(torch.nn.Module): 12 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound): 13 | super(PolicyNetContinuous, self).__init__() 14 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 15 | self.fc_mu = torch.nn.Linear(hidden_dim, action_dim) 16 | self.fc_std = torch.nn.Linear(hidden_dim, action_dim) 17 | self.action_bound = action_bound 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | mu = self.fc_mu(x) 22 | std = F.softplus(self.fc_std(x)) 23 | dist = Normal(mu, std) 24 | normal_sample = dist.rsample() # rsample()是重参数化采样 25 | log_prob = dist.log_prob(normal_sample) 26 | action = torch.tanh(normal_sample) 27 | # 计算tanh_normal分布的对数概率密度 28 | log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7) 29 | action = action * self.action_bound 30 | return action, log_prob 31 | 32 | 33 | class QValueNetContinuous(torch.nn.Module): 34 | def __init__(self, state_dim, hidden_dim, action_dim): 35 | super(QValueNetContinuous, self).__init__() 36 | self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim) 37 | self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim) 38 | self.fc_out = torch.nn.Linear(hidden_dim, 1) 39 | 40 | def forward(self, x, a): 41 | cat = torch.cat([x, a], dim=1) 42 | x = F.relu(self.fc1(cat)) 43 | x = F.relu(self.fc2(x)) 44 | return self.fc_out(x) 45 | 46 | 47 | class SACContinuous: 48 | ''' 处理连续动作的SAC算法 ''' 49 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound, 50 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma, 51 | device): 52 | self.actor = PolicyNetContinuous(state_dim, hidden_dim, action_dim, 53 | action_bound).to(device) # 策略网络 54 | self.critic_1 = QValueNetContinuous(state_dim, hidden_dim, 55 | action_dim).to(device) # 第一个Q网络 56 | self.critic_2 = QValueNetContinuous(state_dim, hidden_dim, 57 | action_dim).to(device) # 第二个Q网络 58 | self.target_critic_1 = QValueNetContinuous(state_dim, 59 | hidden_dim, action_dim).to( 60 | device) # 第一个目标Q网络 61 | self.target_critic_2 = QValueNetContinuous(state_dim, 62 | hidden_dim, action_dim).to( 63 | device) # 第二个目标Q网络 64 | # 令目标Q网络的初始参数和Q网络一样 65 | self.target_critic_1.load_state_dict(self.critic_1.state_dict()) 66 | self.target_critic_2.load_state_dict(self.critic_2.state_dict()) 67 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 68 | lr=actor_lr) 69 | self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(), 70 | lr=critic_lr) 71 | self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(), 72 | lr=critic_lr) 73 | # 使用alpha的log值,可以使训练结果比较稳定 74 | self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float) 75 | self.log_alpha.requires_grad = True # 可以对alpha求梯度 76 | self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], 77 | lr=alpha_lr) 78 | self.target_entropy = target_entropy # 目标熵的大小 79 | self.gamma = gamma 80 | self.tau = tau 81 | self.device = device 82 | 83 | def take_action(self, state): 84 | state = 
torch.tensor([state], dtype=torch.float).to(self.device) 85 | action = self.actor(state)[0] 86 | return [action.item()] 87 | 88 | def calc_target(self, rewards, next_states, dones): # 计算目标Q值 89 | next_actions, log_prob = self.actor(next_states) 90 | entropy = -log_prob 91 | q1_value = self.target_critic_1(next_states, next_actions) 92 | q2_value = self.target_critic_2(next_states, next_actions) 93 | next_value = torch.min(q1_value, 94 | q2_value) + self.log_alpha.exp() * entropy 95 | td_target = rewards + self.gamma * next_value * (1 - dones) 96 | return td_target 97 | 98 | def soft_update(self, net, target_net): 99 | for param_target, param in zip(target_net.parameters(), 100 | net.parameters()): 101 | param_target.data.copy_(param_target.data * (1.0 - self.tau) + 102 | param.data * self.tau) 103 | 104 | def update(self, transition_dict): 105 | states = torch.tensor(transition_dict['states'], 106 | dtype=torch.float).to(self.device) 107 | actions = torch.tensor(transition_dict['actions'], 108 | dtype=torch.float).view(-1, 1).to(self.device) 109 | rewards = torch.tensor(transition_dict['rewards'], 110 | dtype=torch.float).view(-1, 1).to(self.device) 111 | next_states = torch.tensor(transition_dict['next_states'], 112 | dtype=torch.float).to(self.device) 113 | dones = torch.tensor(transition_dict['dones'], 114 | dtype=torch.float).view(-1, 1).to(self.device) 115 | # 和之前章节一样,对倒立摆环境的奖励进行重塑以便训练 116 | rewards = (rewards + 8.0) / 8.0 117 | 118 | # 更新两个Q网络 119 | td_target = self.calc_target(rewards, next_states, dones) 120 | critic_1_loss = torch.mean( 121 | F.mse_loss(self.critic_1(states, actions), td_target.detach())) 122 | critic_2_loss = torch.mean( 123 | F.mse_loss(self.critic_2(states, actions), td_target.detach())) 124 | self.critic_1_optimizer.zero_grad() 125 | critic_1_loss.backward() 126 | self.critic_1_optimizer.step() 127 | self.critic_2_optimizer.zero_grad() 128 | critic_2_loss.backward() 129 | self.critic_2_optimizer.step() 130 | 131 | # 更新策略网络 132 | new_actions, log_prob = self.actor(states) 133 | entropy = -log_prob 134 | q1_value = self.critic_1(states, new_actions) 135 | q2_value = self.critic_2(states, new_actions) 136 | actor_loss = torch.mean(-self.log_alpha.exp() * entropy - 137 | torch.min(q1_value, q2_value)) 138 | self.actor_optimizer.zero_grad() 139 | actor_loss.backward() 140 | self.actor_optimizer.step() 141 | 142 | # 更新alpha值 143 | alpha_loss = torch.mean( 144 | (entropy - self.target_entropy).detach() * self.log_alpha.exp()) 145 | self.log_alpha_optimizer.zero_grad() 146 | alpha_loss.backward() 147 | self.log_alpha_optimizer.step() 148 | 149 | self.soft_update(self.critic_1, self.target_critic_1) 150 | self.soft_update(self.critic_2, self.target_critic_2) 151 | 152 | env_name = 'Pendulum-v0' 153 | env = gym.make(env_name) 154 | state_dim = env.observation_space.shape[0] 155 | action_dim = env.action_space.shape[0] 156 | action_bound = env.action_space.high[0] # 动作最大值 157 | random.seed(0) 158 | np.random.seed(0) 159 | env.seed(0) 160 | torch.manual_seed(0) 161 | 162 | actor_lr = 3e-4 163 | critic_lr = 3e-3 164 | alpha_lr = 3e-4 165 | num_episodes = 100 166 | hidden_dim = 128 167 | gamma = 0.99 168 | tau = 0.005 # 软更新参数 169 | buffer_size = 100000 170 | minimal_size = 1000 171 | batch_size = 64 172 | target_entropy = -env.action_space.shape[0] 173 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 174 | "cpu") 175 | 176 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 177 | agent = SACContinuous(state_dim, hidden_dim, action_dim, 
action_bound, 178 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, 179 | gamma, device) 180 | 181 | return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes, 182 | replay_buffer, minimal_size, 183 | batch_size) 184 | 185 | episodes_list = list(range(len(return_list))) 186 | plt.plot(episodes_list, return_list) 187 | plt.xlabel('Episodes') 188 | plt.ylabel('Returns') 189 | plt.title('SAC on {}'.format(env_name)) 190 | plt.show() 191 | 192 | mv_return = rl_utils.moving_average(return_list, 9) 193 | plt.plot(episodes_list, mv_return) 194 | plt.xlabel('Episodes') 195 | plt.ylabel('Returns') 196 | plt.title('SAC on {}'.format(env_name)) 197 | plt.show() -------------------------------------------------------------------------------- /SAC.py: -------------------------------------------------------------------------------- 1 | import random 2 | import gym 3 | import numpy as np 4 | from tqdm import tqdm 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.distributions import Normal 8 | import matplotlib.pyplot as plt 9 | import rl_utils 10 | 11 | class PolicyNetContinuous(torch.nn.Module): 12 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound): 13 | super(PolicyNetContinuous, self).__init__() 14 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 15 | self.fc_mu = torch.nn.Linear(hidden_dim, action_dim) 16 | self.fc_std = torch.nn.Linear(hidden_dim, action_dim) 17 | self.action_bound = action_bound 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | mu = self.fc_mu(x) 22 | std = F.softplus(self.fc_std(x)) 23 | dist = Normal(mu, std) 24 | normal_sample = dist.rsample() # rsample()是重参数化采样 25 | log_prob = dist.log_prob(normal_sample) 26 | action = torch.tanh(normal_sample) 27 | # 计算tanh_normal分布的对数概率密度 28 | log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7) 29 | action = action * self.action_bound 30 | return action, log_prob 31 | 32 | 33 | class QValueNetContinuous(torch.nn.Module): 34 | def __init__(self, state_dim, hidden_dim, action_dim): 35 | super(QValueNetContinuous, self).__init__() 36 | self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim) 37 | self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim) 38 | self.fc_out = torch.nn.Linear(hidden_dim, 1) 39 | 40 | def forward(self, x, a): 41 | cat = torch.cat([x, a], dim=1) 42 | x = F.relu(self.fc1(cat)) 43 | x = F.relu(self.fc2(x)) 44 | return self.fc_out(x) 45 | 46 | class SACContinuous: 47 | ''' 处理连续动作的SAC算法 ''' 48 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound, 49 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma, 50 | device): 51 | self.actor = PolicyNetContinuous(state_dim, hidden_dim, action_dim, 52 | action_bound).to(device) # 策略网络 53 | self.critic_1 = QValueNetContinuous(state_dim, hidden_dim, 54 | action_dim).to(device) # 第一个Q网络 55 | self.critic_2 = QValueNetContinuous(state_dim, hidden_dim, 56 | action_dim).to(device) # 第二个Q网络 57 | self.target_critic_1 = QValueNetContinuous(state_dim, 58 | hidden_dim, action_dim).to( 59 | device) # 第一个目标Q网络 60 | self.target_critic_2 = QValueNetContinuous(state_dim, 61 | hidden_dim, action_dim).to( 62 | device) # 第二个目标Q网络 63 | # 令目标Q网络的初始参数和Q网络一样 64 | self.target_critic_1.load_state_dict(self.critic_1.state_dict()) 65 | self.target_critic_2.load_state_dict(self.critic_2.state_dict()) 66 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 67 | lr=actor_lr) 68 | self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(), 69 | lr=critic_lr) 70 | 
self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(), 71 | lr=critic_lr) 72 | # 使用alpha的log值,可以使训练结果比较稳定 73 | self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float) 74 | self.log_alpha.requires_grad = True # 可以对alpha求梯度 75 | self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], 76 | lr=alpha_lr) 77 | self.target_entropy = target_entropy # 目标熵的大小 78 | self.gamma = gamma 79 | self.tau = tau 80 | self.device = device 81 | 82 | def take_action(self, state): 83 | state = torch.tensor([state], dtype=torch.float).to(self.device) 84 | action = self.actor(state)[0] 85 | return [action.item()] 86 | 87 | def calc_target(self, rewards, next_states, dones): # 计算目标Q值 88 | next_actions, log_prob = self.actor(next_states) 89 | entropy = -log_prob 90 | q1_value = self.target_critic_1(next_states, next_actions) 91 | q2_value = self.target_critic_2(next_states, next_actions) 92 | next_value = torch.min(q1_value, 93 | q2_value) + self.log_alpha.exp() * entropy 94 | td_target = rewards + self.gamma * next_value * (1 - dones) 95 | return td_target 96 | 97 | def soft_update(self, net, target_net): 98 | for param_target, param in zip(target_net.parameters(), 99 | net.parameters()): 100 | param_target.data.copy_(param_target.data * (1.0 - self.tau) + 101 | param.data * self.tau) 102 | 103 | def update(self, transition_dict): 104 | states = torch.tensor(transition_dict['states'], 105 | dtype=torch.float).to(self.device) 106 | actions = torch.tensor(transition_dict['actions'], 107 | dtype=torch.float).view(-1, 1).to(self.device) 108 | rewards = torch.tensor(transition_dict['rewards'], 109 | dtype=torch.float).view(-1, 1).to(self.device) 110 | next_states = torch.tensor(transition_dict['next_states'], 111 | dtype=torch.float).to(self.device) 112 | dones = torch.tensor(transition_dict['dones'], 113 | dtype=torch.float).view(-1, 1).to(self.device) 114 | # 和之前章节一样,对倒立摆环境的奖励进行重塑以便训练 115 | rewards = (rewards + 8.0) / 8.0 116 | 117 | # 更新两个Q网络 118 | td_target = self.calc_target(rewards, next_states, dones) 119 | critic_1_loss = torch.mean( 120 | F.mse_loss(self.critic_1(states, actions), td_target.detach())) 121 | critic_2_loss = torch.mean( 122 | F.mse_loss(self.critic_2(states, actions), td_target.detach())) 123 | self.critic_1_optimizer.zero_grad() 124 | critic_1_loss.backward() 125 | self.critic_1_optimizer.step() 126 | self.critic_2_optimizer.zero_grad() 127 | critic_2_loss.backward() 128 | self.critic_2_optimizer.step() 129 | 130 | # 更新策略网络 131 | new_actions, log_prob = self.actor(states) 132 | entropy = -log_prob 133 | q1_value = self.critic_1(states, new_actions) 134 | q2_value = self.critic_2(states, new_actions) 135 | actor_loss = torch.mean(-self.log_alpha.exp() * entropy - 136 | torch.min(q1_value, q2_value)) 137 | self.actor_optimizer.zero_grad() 138 | actor_loss.backward() 139 | self.actor_optimizer.step() 140 | 141 | # 更新alpha值 142 | alpha_loss = torch.mean( 143 | (entropy - self.target_entropy).detach() * self.log_alpha.exp()) 144 | self.log_alpha_optimizer.zero_grad() 145 | alpha_loss.backward() 146 | self.log_alpha_optimizer.step() 147 | 148 | self.soft_update(self.critic_1, self.target_critic_1) 149 | self.soft_update(self.critic_2, self.target_critic_2) 150 | 151 | env_name = 'Pendulum-v0' 152 | env = gym.make(env_name) 153 | state_dim = env.observation_space.shape[0] 154 | action_dim = env.action_space.shape[0] 155 | action_bound = env.action_space.high[0] # 动作最大值 156 | random.seed(0) 157 | np.random.seed(0) 158 | env.seed(0) 159 | torch.manual_seed(0) 160 | 161 | 
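# --- Editor's note (hedged sketch, not part of the original file) ------------
# In PolicyNetContinuous.forward above, `action` is already the tanh-squashed
# sample, yet the correction term is written as log(1 - tanh(action)^2 + eps),
# applying tanh a second time, and the per-dimension log-probabilities are not
# summed. For the 1-D Pendulum action this still trains, but the usual form of
# the tanh-Gaussian log-density (same layout; the helper name is the editor's)
# would be:
def tanh_gaussian_log_prob(dist, normal_sample, eps=1e-7):
    """Log-density of a tanh-squashed Gaussian sample (editorial sketch)."""
    squashed = torch.tanh(normal_sample)        # the action before rescaling
    log_prob = dist.log_prob(normal_sample) - torch.log(1 - squashed.pow(2) + eps)
    return log_prob.sum(dim=-1, keepdim=True)   # sum over action dimensions
# Rescaling by action_bound only shifts the density by a constant
# log(action_bound) per dimension, which the original code also omits.
# ------------------------------------------------------------------------------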
actor_lr = 3e-4 162 | critic_lr = 3e-3 163 | alpha_lr = 3e-4 164 | num_episodes = 100 165 | hidden_dim = 128 166 | gamma = 0.99 167 | tau = 0.005 # 软更新参数 168 | buffer_size = 100000 169 | minimal_size = 1000 170 | batch_size = 64 171 | target_entropy = -env.action_space.shape[0] 172 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 173 | "cpu") 174 | 175 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 176 | agent = SACContinuous(state_dim, hidden_dim, action_dim, action_bound, 177 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, 178 | gamma, device) 179 | 180 | return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes, 181 | replay_buffer, minimal_size, 182 | batch_size) 183 | 184 | episodes_list = list(range(len(return_list))) 185 | plt.plot(episodes_list, return_list) 186 | plt.xlabel('Episodes') 187 | plt.ylabel('Returns') 188 | plt.title('SAC on {}'.format(env_name)) 189 | plt.show() 190 | 191 | mv_return = rl_utils.moving_average(return_list, 9) 192 | plt.plot(episodes_list, mv_return) 193 | plt.xlabel('Episodes') 194 | plt.ylabel('Returns') 195 | plt.title('SAC on {}'.format(env_name)) 196 | plt.show() 197 | 198 | 199 | class PolicyNet(torch.nn.Module): 200 | def __init__(self, state_dim, hidden_dim, action_dim): 201 | super(PolicyNet, self).__init__() 202 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 203 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 204 | 205 | def forward(self, x): 206 | x = F.relu(self.fc1(x)) 207 | return F.softmax(self.fc2(x), dim=1) 208 | 209 | 210 | class QValueNet(torch.nn.Module): 211 | ''' 只有一层隐藏层的Q网络 ''' 212 | def __init__(self, state_dim, hidden_dim, action_dim): 213 | super(QValueNet, self).__init__() 214 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 215 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 216 | 217 | def forward(self, x): 218 | x = F.relu(self.fc1(x)) 219 | return self.fc2(x) 220 | 221 | class SAC: 222 | ''' 处理离散动作的SAC算法 ''' 223 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 224 | alpha_lr, target_entropy, tau, gamma, device): 225 | # 策略网络 226 | self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device) 227 | # 第一个Q网络 228 | self.critic_1 = QValueNet(state_dim, hidden_dim, action_dim).to(device) 229 | # 第二个Q网络 230 | self.critic_2 = QValueNet(state_dim, hidden_dim, action_dim).to(device) 231 | self.target_critic_1 = QValueNet(state_dim, hidden_dim, 232 | action_dim).to(device) # 第一个目标Q网络 233 | self.target_critic_2 = QValueNet(state_dim, hidden_dim, 234 | action_dim).to(device) # 第二个目标Q网络 235 | # 令目标Q网络的初始参数和Q网络一样 236 | self.target_critic_1.load_state_dict(self.critic_1.state_dict()) 237 | self.target_critic_2.load_state_dict(self.critic_2.state_dict()) 238 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 239 | lr=actor_lr) 240 | self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(), 241 | lr=critic_lr) 242 | self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(), 243 | lr=critic_lr) 244 | # 使用alpha的log值,可以使训练结果比较稳定 245 | self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float) 246 | self.log_alpha.requires_grad = True # 可以对alpha求梯度 247 | self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], 248 | lr=alpha_lr) 249 | self.target_entropy = target_entropy # 目标熵的大小 250 | self.gamma = gamma 251 | self.tau = tau 252 | self.device = device 253 | 254 | def take_action(self, state): 255 | state = torch.tensor([state], dtype=torch.float).to(self.device) 256 | probs = 
self.actor(state) 257 | action_dist = torch.distributions.Categorical(probs) 258 | action = action_dist.sample() 259 | return action.item() 260 | 261 | # 计算目标Q值,直接用策略网络的输出概率进行期望计算 262 | def calc_target(self, rewards, next_states, dones): 263 | next_probs = self.actor(next_states) 264 | next_log_probs = torch.log(next_probs + 1e-8) 265 | entropy = -torch.sum(next_probs * next_log_probs, dim=1, keepdim=True) 266 | q1_value = self.target_critic_1(next_states) 267 | q2_value = self.target_critic_2(next_states) 268 | min_qvalue = torch.sum(next_probs * torch.min(q1_value, q2_value), 269 | dim=1, 270 | keepdim=True) 271 | next_value = min_qvalue + self.log_alpha.exp() * entropy 272 | td_target = rewards + self.gamma * next_value * (1 - dones) 273 | return td_target 274 | 275 | def soft_update(self, net, target_net): 276 | for param_target, param in zip(target_net.parameters(), 277 | net.parameters()): 278 | param_target.data.copy_(param_target.data * (1.0 - self.tau) + 279 | param.data * self.tau) 280 | 281 | def update(self, transition_dict): 282 | states = torch.tensor(transition_dict['states'], 283 | dtype=torch.float).to(self.device) 284 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 285 | self.device) # 动作不再是float类型 286 | rewards = torch.tensor(transition_dict['rewards'], 287 | dtype=torch.float).view(-1, 1).to(self.device) 288 | next_states = torch.tensor(transition_dict['next_states'], 289 | dtype=torch.float).to(self.device) 290 | dones = torch.tensor(transition_dict['dones'], 291 | dtype=torch.float).view(-1, 1).to(self.device) 292 | 293 | # 更新两个Q网络 294 | td_target = self.calc_target(rewards, next_states, dones) 295 | critic_1_q_values = self.critic_1(states).gather(1, actions) 296 | critic_1_loss = torch.mean( 297 | F.mse_loss(critic_1_q_values, td_target.detach())) 298 | critic_2_q_values = self.critic_2(states).gather(1, actions) 299 | critic_2_loss = torch.mean( 300 | F.mse_loss(critic_2_q_values, td_target.detach())) 301 | self.critic_1_optimizer.zero_grad() 302 | critic_1_loss.backward() 303 | self.critic_1_optimizer.step() 304 | self.critic_2_optimizer.zero_grad() 305 | critic_2_loss.backward() 306 | self.critic_2_optimizer.step() 307 | 308 | # 更新策略网络 309 | probs = self.actor(states) 310 | log_probs = torch.log(probs + 1e-8) 311 | # 直接根据概率计算熵 312 | entropy = -torch.sum(probs * log_probs, dim=1, keepdim=True) # 313 | q1_value = self.critic_1(states) 314 | q2_value = self.critic_2(states) 315 | min_qvalue = torch.sum(probs * torch.min(q1_value, q2_value), 316 | dim=1, 317 | keepdim=True) # 直接根据概率计算期望 318 | actor_loss = torch.mean(-self.log_alpha.exp() * entropy - min_qvalue) 319 | self.actor_optimizer.zero_grad() 320 | actor_loss.backward() 321 | self.actor_optimizer.step() 322 | 323 | # 更新alpha值 324 | alpha_loss = torch.mean( 325 | (entropy - target_entropy).detach() * self.log_alpha.exp()) 326 | self.log_alpha_optimizer.zero_grad() 327 | alpha_loss.backward() 328 | self.log_alpha_optimizer.step() 329 | 330 | self.soft_update(self.critic_1, self.target_critic_1) 331 | self.soft_update(self.critic_2, self.target_critic_2) 332 | 333 | 334 | actor_lr = 1e-3 335 | critic_lr = 1e-2 336 | alpha_lr = 1e-2 337 | num_episodes = 200 338 | hidden_dim = 128 339 | gamma = 0.98 340 | tau = 0.005 # 软更新参数 341 | buffer_size = 10000 342 | minimal_size = 500 343 | batch_size = 64 344 | target_entropy = -1 345 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 346 | "cpu") 347 | 348 | env_name = 'CartPole-v0' 349 | env = gym.make(env_name) 350 | 
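# --- Editor's note (hedged, not part of the original file) --------------------
# 1) In SAC.update above, the temperature loss uses the module-level
#    `target_entropy` rather than the value stored on the agent; it works only
#    because a global of that name happens to exist. A self-contained version
#    would read:
#        alpha_loss = torch.mean(
#            (entropy - self.target_entropy).detach() * self.log_alpha.exp())
# 2) The loop below logs returns of the stochastic, exploring policy. For a
#    deterministic check after training, a small helper such as this editorial
#    sketch (old gym reset/step API, as used throughout this repo) can be
#    called with the `agent` and `env` defined below:
def evaluate_greedy(agent, env, n_episodes=10):
    """Average return of the greedy policy (argmax over action probabilities)."""
    totals = []
    for _ in range(n_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            with torch.no_grad():
                state_t = torch.tensor([state], dtype=torch.float).to(agent.device)
                probs = agent.actor(state_t)
            action = probs.argmax(dim=1).item()   # greedy action, no sampling
            state, reward, done, _ = env.step(action)
            total += reward
        totals.append(total)
    return sum(totals) / len(totals)
# ------------------------------------------------------------------------------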
random.seed(0) 351 | np.random.seed(0) 352 | env.seed(0) 353 | torch.manual_seed(0) 354 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 355 | state_dim = env.observation_space.shape[0] 356 | action_dim = env.action_space.n 357 | agent = SAC(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, alpha_lr, 358 | target_entropy, tau, gamma, device) 359 | 360 | return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes, 361 | replay_buffer, minimal_size, 362 | batch_size) 363 | 364 | episodes_list = list(range(len(return_list))) 365 | plt.plot(episodes_list, return_list) 366 | plt.xlabel('Episodes') 367 | plt.ylabel('Returns') 368 | plt.title('SAC on {}'.format(env_name)) 369 | plt.show() 370 | 371 | mv_return = rl_utils.moving_average(return_list, 9) 372 | plt.plot(episodes_list, mv_return) 373 | plt.xlabel('Episodes') 374 | plt.ylabel('Returns') 375 | plt.title('SAC on {}'.format(env_name)) 376 | plt.show() -------------------------------------------------------------------------------- /Sarsa.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from tqdm import tqdm # tqdm是显示循环进度条的库 4 | 5 | 6 | class CliffWalkingEnv: 7 | def __init__(self, ncol, nrow): 8 | self.nrow = nrow 9 | self.ncol = ncol 10 | self.x = 0 # 记录当前智能体位置的横坐标 11 | self.y = self.nrow - 1 # 记录当前智能体位置的纵坐标 12 | 13 | def step(self, action): # 外部调用这个函数来改变当前位置 14 | # 4种动作, change[0]:上, change[1]:下, change[2]:左, change[3]:右。坐标系原点(0,0) 15 | # 定义在左上角 16 | change = [[0, -1], [0, 1], [-1, 0], [1, 0]] 17 | self.x = min(self.ncol - 1, max(0, self.x + change[action][0])) 18 | self.y = min(self.nrow - 1, max(0, self.y + change[action][1])) 19 | next_state = self.y * self.ncol + self.x 20 | reward = -1 21 | done = False 22 | if self.y == self.nrow - 1 and self.x > 0: # 下一个位置在悬崖或者目标 23 | done = True 24 | if self.x != self.ncol - 1: 25 | reward = -100 26 | return next_state, reward, done 27 | 28 | def reset(self): # 回归初始状态,坐标轴原点在左上角 29 | self.x = 0 30 | self.y = self.nrow - 1 31 | return self.y * self.ncol + self.x 32 | 33 | class Sarsa: 34 | """ Sarsa算法 """ 35 | def __init__(self, ncol, nrow, epsilon, alpha, gamma, n_action=4): 36 | self.Q_table = np.zeros([nrow * ncol, n_action]) # 初始化Q(s,a)表格 37 | self.n_action = n_action # 动作个数 38 | self.alpha = alpha # 学习率 39 | self.gamma = gamma # 折扣因子 40 | self.epsilon = epsilon # epsilon-贪婪策略中的参数 41 | 42 | def take_action(self, state): # 选取下一步的操作,具体实现为epsilon-贪婪 43 | if np.random.random() < self.epsilon: 44 | action = np.random.randint(self.n_action) 45 | else: 46 | action = np.argmax(self.Q_table[state]) 47 | return action 48 | 49 | def best_action(self, state): # 用于打印策略 50 | Q_max = np.max(self.Q_table[state]) 51 | a = [0 for _ in range(self.n_action)] 52 | for i in range(self.n_action): # 若两个动作的价值一样,都会记录下来 53 | if self.Q_table[state, i] == Q_max: 54 | a[i] = 1 55 | return a 56 | 57 | def update(self, s0, a0, r, s1, a1): 58 | td_error = r + self.gamma * self.Q_table[s1, a1] - self.Q_table[s0, a0] 59 | self.Q_table[s0, a0] += self.alpha * td_error 60 | 61 | ncol = 12 62 | nrow = 4 63 | env = CliffWalkingEnv(ncol, nrow) 64 | # np.random.seed(0) 65 | # epsilon = 0.1 66 | # alpha = 0.1 67 | # gamma = 0.9 68 | # agent = Sarsa(ncol, nrow, epsilon, alpha, gamma) 69 | # num_episodes = 500 # 智能体在环境中运行的序列的数量 70 | # 71 | # return_list = [] # 记录每一条序列的回报 72 | # for i in range(10): # 显示10个进度条 73 | # # tqdm的进度条功能 74 | # with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 75 | # for 
i_episode in range(int(num_episodes / 10)): # 每个进度条的序列数 76 | # episode_return = 0 77 | # state = env.reset() 78 | # action = agent.take_action(state) 79 | # done = False 80 | # while not done: 81 | # next_state, reward, done = env.step(action) 82 | # next_action = agent.take_action(next_state) 83 | # episode_return += reward # 这里回报的计算不进行折扣因子衰减 84 | # agent.update(state, action, reward, next_state, next_action) 85 | # state = next_state 86 | # action = next_action 87 | # return_list.append(episode_return) 88 | # if (i_episode + 1) % 10 == 0: # 每10条序列打印一下这10条序列的平均回报 89 | # pbar.set_postfix({ 90 | # 'episode': 91 | # '%d' % (num_episodes / 10 * i + i_episode + 1), 92 | # 'return': 93 | # '%.3f' % np.mean(return_list[-10:]) 94 | # }) 95 | # pbar.update(1) 96 | # 97 | # episodes_list = list(range(len(return_list))) 98 | # plt.plot(episodes_list, return_list) 99 | # plt.xlabel('Episodes') 100 | # plt.ylabel('Returns') 101 | # plt.title('Sarsa on {}'.format('Cliff Walking')) 102 | # plt.show() 103 | 104 | 105 | def print_agent(agent, env, action_meaning, disaster=[], end=[]): 106 | for i in range(env.nrow): 107 | for j in range(env.ncol): 108 | if (i * env.ncol + j) in disaster: 109 | print('****', end=' ') 110 | elif (i * env.ncol + j) in end: 111 | print('EEEE', end=' ') 112 | else: 113 | a = agent.best_action(i * env.ncol + j) 114 | pi_str = '' 115 | for k in range(len(action_meaning)): 116 | pi_str += action_meaning[k] if a[k] > 0 else 'o' 117 | print(pi_str, end=' ') 118 | print() 119 | # 120 | # 121 | # action_meaning = ['^', 'v', '<', '>'] 122 | # print('Sarsa算法最终收敛得到的策略为:') 123 | # print_agent(agent, env, action_meaning, list(range(37, 47)), [47]) 124 | 125 | 126 | class nstep_Sarsa: 127 | """ n步Sarsa算法 """ 128 | def __init__(self, n, ncol, nrow, epsilon, alpha, gamma, n_action=4): 129 | self.Q_table = np.zeros([nrow * ncol, n_action]) 130 | self.n_action = n_action 131 | self.alpha = alpha 132 | self.gamma = gamma 133 | self.epsilon = epsilon 134 | self.n = n # 采用n步Sarsa算法 135 | self.state_list = [] # 保存之前的状态 136 | self.action_list = [] # 保存之前的动作 137 | self.reward_list = [] # 保存之前的奖励 138 | 139 | def take_action(self, state): 140 | if np.random.random() < self.epsilon: 141 | action = np.random.randint(self.n_action) 142 | else: 143 | action = np.argmax(self.Q_table[state]) 144 | return action 145 | 146 | def best_action(self, state): # 用于打印策略 147 | Q_max = np.max(self.Q_table[state]) 148 | a = [0 for _ in range(self.n_action)] 149 | for i in range(self.n_action): 150 | if self.Q_table[state, i] == Q_max: 151 | a[i] = 1 152 | return a 153 | 154 | def update(self, s0, a0, r, s1, a1, done): 155 | self.state_list.append(s0) 156 | self.action_list.append(a0) 157 | self.reward_list.append(r) 158 | if len(self.state_list) == self.n: # 若保存的数据可以进行n步更新 159 | G = self.Q_table[s1, a1] # 得到Q(s_{t+n}, a_{t+n}) 160 | for i in reversed(range(self.n)): 161 | G = self.gamma * G + self.reward_list[i] # 不断向前计算每一步的回报 162 | # 如果到达终止状态,最后几步虽然长度不够n步,也将其进行更新 163 | if done and i > 0: 164 | s = self.state_list[i] 165 | a = self.action_list[i] 166 | self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a]) 167 | s = self.state_list.pop(0) # 将需要更新的状态动作从列表中删除,下次不必更新 168 | a = self.action_list.pop(0) 169 | self.reward_list.pop(0) 170 | # n步Sarsa的主要更新步骤 171 | self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a]) 172 | if done: # 如果到达终止状态,即将开始下一条序列,则将列表全清空 173 | self.state_list = [] 174 | self.action_list = [] 175 | self.reward_list = [] 176 | 177 | 178 | # np.random.seed(0) 179 | # n_step = 5 # 5步Sarsa算法 180 | # 
alpha = 0.1 181 | # epsilon = 0.1 182 | # gamma = 0.9 183 | # agent = nstep_Sarsa(n_step, ncol, nrow, epsilon, alpha, gamma) 184 | # num_episodes = 500 # 智能体在环境中运行的序列的数量 185 | # 186 | # return_list = [] # 记录每一条序列的回报 187 | # for i in range(10): # 显示10个进度条 188 | # #tqdm的进度条功能 189 | # with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 190 | # for i_episode in range(int(num_episodes / 10)): # 每个进度条的序列数 191 | # episode_return = 0 192 | # state = env.reset() 193 | # action = agent.take_action(state) 194 | # done = False 195 | # while not done: 196 | # next_state, reward, done = env.step(action) 197 | # next_action = agent.take_action(next_state) 198 | # episode_return += reward # 这里回报的计算不进行折扣因子衰减 199 | # agent.update(state, action, reward, next_state, next_action, 200 | # done) 201 | # state = next_state 202 | # action = next_action 203 | # return_list.append(episode_return) 204 | # if (i_episode + 1) % 10 == 0: # 每10条序列打印一下这10条序列的平均回报 205 | # pbar.set_postfix({ 206 | # 'episode': 207 | # '%d' % (num_episodes / 10 * i + i_episode + 1), 208 | # 'return': 209 | # '%.3f' % np.mean(return_list[-10:]) 210 | # }) 211 | # pbar.update(1) 212 | # 213 | # episodes_list = list(range(len(return_list))) 214 | # plt.plot(episodes_list, return_list) 215 | # plt.xlabel('Episodes') 216 | # plt.ylabel('Returns') 217 | # plt.title('5-step Sarsa on {}'.format('Cliff Walking')) 218 | # plt.show() 219 | # 220 | # action_meaning = ['^', 'v', '<', '>'] 221 | # print('5步Sarsa算法最终收敛得到的策略为:') 222 | # print_agent(agent, env, action_meaning, list(range(37, 47)), [47]) 223 | 224 | 225 | class QLearning: 226 | """ Q-learning算法 """ 227 | def __init__(self, ncol, nrow, epsilon, alpha, gamma, n_action=4): 228 | self.Q_table = np.zeros([nrow * ncol, n_action]) # 初始化Q(s,a)表格 229 | self.n_action = n_action # 动作个数 230 | self.alpha = alpha # 学习率 231 | self.gamma = gamma # 折扣因子 232 | self.epsilon = epsilon # epsilon-贪婪策略中的参数 233 | 234 | def take_action(self, state): #选取下一步的操作 235 | if np.random.random() < self.epsilon: 236 | action = np.random.randint(self.n_action) 237 | else: 238 | action = np.argmax(self.Q_table[state]) 239 | return action 240 | 241 | def best_action(self, state): # 用于打印策略 242 | Q_max = np.max(self.Q_table[state]) 243 | a = [0 for _ in range(self.n_action)] 244 | for i in range(self.n_action): 245 | if self.Q_table[state, i] == Q_max: 246 | a[i] = 1 247 | return a 248 | 249 | def update(self, s0, a0, r, s1): 250 | td_error = r + self.gamma * self.Q_table[s1].max( 251 | ) - self.Q_table[s0, a0] 252 | self.Q_table[s0, a0] += self.alpha * td_error 253 | 254 | 255 | np.random.seed(0) 256 | epsilon = 0.1 257 | alpha = 0.1 258 | gamma = 0.9 259 | agent = QLearning(ncol, nrow, epsilon, alpha, gamma) 260 | num_episodes = 500 # 智能体在环境中运行的序列的数量 261 | 262 | return_list = [] # 记录每一条序列的回报 263 | for i in range(10): # 显示10个进度条 264 | # tqdm的进度条功能 265 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 266 | for i_episode in range(int(num_episodes / 10)): # 每个进度条的序列数 267 | episode_return = 0 268 | state = env.reset() 269 | done = False 270 | while not done: 271 | action = agent.take_action(state) 272 | next_state, reward, done = env.step(action) 273 | episode_return += reward # 这里回报的计算不进行折扣因子衰减 274 | agent.update(state, action, reward, next_state) 275 | state = next_state 276 | return_list.append(episode_return) 277 | if (i_episode + 1) % 10 == 0: # 每10条序列打印一下这10条序列的平均回报 278 | pbar.set_postfix({ 279 | 'episode': 280 | '%d' % (num_episodes / 10 * i + i_episode + 1), 281 | 'return': 282 
| '%.3f' % np.mean(return_list[-10:]) 283 | }) 284 | pbar.update(1) 285 | 286 | episodes_list = list(range(len(return_list))) 287 | plt.plot(episodes_list, return_list) 288 | plt.xlabel('Episodes') 289 | plt.ylabel('Returns') 290 | plt.title('Q-learning on {}'.format('Cliff Walking')) 291 | plt.show() 292 | 293 | action_meaning = ['^', 'v', '<', '>'] 294 | print('Q-learning算法最终收敛得到的策略为:') 295 | print_agent(agent, env, action_meaning, list(range(37, 47)), [47]) 296 | -------------------------------------------------------------------------------- /bandit.py: -------------------------------------------------------------------------------- 1 | # 导入需要使用的库,其中numpy是支持数组和矩阵运算的科学计算库,而matplotlib是绘图库 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | class BernoulliBandit: 7 | """ 伯努利多臂老虎机,输入K表示拉杆个数 """ 8 | def __init__(self, K): 9 | self.probs = np.random.uniform(size=K) # 随机生成K个0~1的数,作为拉动每根拉杆的获奖 10 | # 概率 11 | self.best_idx = np.argmax(self.probs) # 获奖概率最大的拉杆 12 | self.best_prob = self.probs[self.best_idx] # 最大的获奖概率 13 | self.K = K 14 | 15 | def step(self, k): 16 | # 当玩家选择了k号拉杆后,根据拉动该老虎机的k号拉杆获得奖励的概率返回1(获奖)或0(未 17 | # 获奖) 18 | if np.random.rand() < self.probs[k]: 19 | return 1 20 | else: 21 | return 0 22 | 23 | 24 | np.random.seed(1) # 设定随机种子,使实验具有可重复性 25 | K = 10 26 | bandit_10_arm = BernoulliBandit(K) 27 | 28 | 29 | # print("随机生成了一个%d臂伯努利老虎机" % K) 30 | # print("获奖概率最大的拉杆为%d号,其获奖概率为%.4f" % 31 | # (bandit_10_arm.best_idx, bandit_10_arm.best_prob)) 32 | 33 | # 随机生成了一个10臂伯努利老虎机 34 | # 获奖概率最大的拉杆为1号,其获奖概率为0.7203 35 | 36 | 37 | class Solver: 38 | """ 多臂老虎机算法基本框架 """ 39 | def __init__(self, bandit): 40 | self.bandit = bandit 41 | self.counts = np.zeros(self.bandit.K) # 每根拉杆的尝试次数 42 | self.regret = 0. # 当前步的累积懊悔 43 | self.actions = [] # 维护一个列表,记录每一步的动作 44 | self.regrets = [] # 维护一个列表,记录每一步的累积懊悔 45 | 46 | def update_regret(self, k): 47 | # 计算累积懊悔并保存,k为本次动作选择的拉杆的编号 48 | self.regret += self.bandit.best_prob - self.bandit.probs[k] 49 | self.regrets.append(self.regret) 50 | 51 | def run_one_step(self): 52 | # 返回当前动作选择哪一根拉杆,由每个具体的策略实现 53 | raise NotImplementedError 54 | 55 | def run(self, num_steps): 56 | # 运行一定次数,num_steps为总运行次数 57 | for _ in range(num_steps): 58 | k = self.run_one_step() 59 | self.counts[k] += 1 60 | self.actions.append(k) 61 | self.update_regret(k) 62 | 63 | class EpsilonGreedy(Solver): 64 | """ epsilon贪婪算法,继承Solver类 """ 65 | def __init__(self, bandit, epsilon=0.01, init_prob=1.0): 66 | super(EpsilonGreedy, self).__init__(bandit) 67 | self.epsilon = epsilon 68 | #初始化拉动所有拉杆的期望奖励估值 69 | self.estimates = np.array([init_prob] * self.bandit.K) 70 | 71 | def run_one_step(self): 72 | if np.random.random() < self.epsilon: 73 | k = np.random.randint(0, self.bandit.K) # 随机选择一根拉杆 74 | else: 75 | k = np.argmax(self.estimates) # 选择期望奖励估值最大的拉杆 76 | r = self.bandit.step(k) # 得到本次动作的奖励 77 | self.estimates[k] += 1. 
/ (self.counts[k] + 1) * (r - 78 | self.estimates[k]) 79 | return k 80 | 81 | def plot_results(solvers, solver_names): 82 | """生成累积懊悔随时间变化的图像。输入solvers是一个列表,列表中的每个元素是一种特定的策略。 83 | 而solver_names也是一个列表,存储每个策略的名称""" 84 | for idx, solver in enumerate(solvers): 85 | time_list = range(len(solver.regrets)) 86 | plt.plot(time_list, solver.regrets, label=solver_names[idx]) 87 | plt.xlabel('Time steps') 88 | plt.ylabel('Cumulative regrets') 89 | plt.title('%d-armed bandit' % solvers[0].bandit.K) 90 | plt.legend() 91 | plt.show() 92 | 93 | 94 | class DecayingEpsilonGreedy(Solver): 95 | """ epsilon值随时间衰减的epsilon-贪婪算法,继承Solver类 """ 96 | def __init__(self, bandit, init_prob=1.0): 97 | super(DecayingEpsilonGreedy, self).__init__(bandit) 98 | self.estimates = np.array([init_prob] * self.bandit.K) 99 | self.total_count = 0 100 | 101 | def run_one_step(self): 102 | self.total_count += 1 103 | if np.random.random() < 1 / self.total_count: # epsilon值随时间衰减 104 | k = np.random.randint(0, self.bandit.K) 105 | else: 106 | k = np.argmax(self.estimates) 107 | 108 | r = self.bandit.step(k) 109 | self.estimates[k] += 1. / (self.counts[k] + 1) * (r - 110 | self.estimates[k]) 111 | 112 | return k 113 | 114 | 115 | class UCB(Solver): 116 | """ UCB算法,继承Solver类 """ 117 | def __init__(self, bandit, coef, init_prob=1.0): 118 | super(UCB, self).__init__(bandit) 119 | self.total_count = 0 120 | self.estimates = np.array([init_prob] * self.bandit.K) 121 | self.coef = coef 122 | 123 | def run_one_step(self): 124 | self.total_count += 1 125 | ucb = self.estimates + self.coef * np.sqrt( 126 | np.log(self.total_count) / (2 * (self.counts + 1))) # 计算上置信界 127 | k = np.argmax(ucb) # 选出上置信界最大的拉杆 128 | r = self.bandit.step(k) 129 | self.estimates[k] += 1. / (self.counts[k] + 1) * (r - 130 | self.estimates[k]) 131 | return k 132 | 133 | 134 | class ThompsonSampling(Solver): 135 | """ 汤普森采样算法,继承Solver类 """ 136 | def __init__(self, bandit): 137 | super(ThompsonSampling, self).__init__(bandit) 138 | self._a = np.ones(self.bandit.K) # 列表,表示每根拉杆奖励为1的次数 139 | self._b = np.ones(self.bandit.K) # 列表,表示每根拉杆奖励为0的次数 140 | 141 | def run_one_step(self): 142 | samples = np.random.beta(self._a, self._b) # 按照Beta分布采样一组奖励样本 143 | k = np.argmax(samples) # 选出采样奖励最大的拉杆 144 | r = self.bandit.step(k) 145 | 146 | self._a[k] += r # 更新Beta分布的第一个参数 147 | self._b[k] += (1 - r) # 更新Beta分布的第二个参数 148 | return k 149 | 150 | 151 | 152 | 153 | if __name__ == "__main__": 154 | # 1.epsilon-贪婪算法, epsilon=0.01 155 | # flag = False 156 | flag = True 157 | if flag: 158 | np.random.seed(1) 159 | epsilon_greedy_solver = EpsilonGreedy(bandit_10_arm, epsilon=0.01) 160 | epsilon_greedy_solver.run(5000) 161 | print('epsilon-贪婪算法的累积懊悔为:', epsilon_greedy_solver.regret) 162 | plot_results([epsilon_greedy_solver], ["EpsilonGreedy"]) 163 | 164 | # epsilon-贪婪算法的累积懊悔为:25.526630933945313 165 | 166 | # 2.epsilon-贪婪算法, epsilon=[1e-4, 0.01, 0.1, 0.25, 0.5] 167 | flag = False 168 | # flag = True 169 | if flag: 170 | np.random.seed(0) 171 | epsilons = [1e-4, 0.01, 0.1, 0.25, 0.5] 172 | epsilon_greedy_solver_list = [ 173 | EpsilonGreedy(bandit_10_arm, epsilon=e) for e in epsilons 174 | ] 175 | epsilon_greedy_solver_names = ["epsilon={}".format(e) for e in epsilons] 176 | for solver in epsilon_greedy_solver_list: 177 | solver.run(5000) 178 | 179 | plot_results(epsilon_greedy_solver_list, epsilon_greedy_solver_names) 180 | 181 | # 3.epsilon值随时间衰减的epsilon - 贪婪算法 182 | flag = False 183 | # flag = True 184 | if flag: 185 | np.random.seed(1) 186 | decaying_epsilon_greedy_solver = 
DecayingEpsilonGreedy(bandit_10_arm) 187 | decaying_epsilon_greedy_solver.run(5000) 188 | print('epsilon值衰减的贪婪算法的累积懊悔为:', decaying_epsilon_greedy_solver.regret) 189 | plot_results([decaying_epsilon_greedy_solver], ["DecayingEpsilonGreedy"]) 190 | 191 | # epsilon值衰减的贪婪算法的累积懊悔为:10.114334931260183 192 | 193 | # 4.UCB算法(上置信界算法) 194 | flag = False 195 | # flag = True 196 | if flag: 197 | np.random.seed(1) 198 | coef = 1 # 控制不确定性比重的系数 199 | UCB_solver = UCB(bandit_10_arm, coef) 200 | UCB_solver.run(5000) 201 | print('上置信界算法的累积懊悔为:', UCB_solver.regret) 202 | plot_results([UCB_solver], ["UCB"]) 203 | 204 | # 上置信界算法的累积懊悔为: 70.45281214197854 205 | 206 | # 5.汤普森采样算法 207 | flag = False 208 | # flag = True 209 | if flag: 210 | np.random.seed(1) 211 | thompson_sampling_solver = ThompsonSampling(bandit_10_arm) 212 | thompson_sampling_solver.run(5000) 213 | print('汤普森采样算法的累积懊悔为:', thompson_sampling_solver.regret) 214 | plot_results([thompson_sampling_solver], ["ThompsonSampling"]) 215 | 216 | # 汤普森采样算法的累积懊悔为:57.19161964443925 -------------------------------------------------------------------------------- /rl_utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | import collections 5 | import random 6 | 7 | 8 | class ReplayBuffer: 9 | def __init__(self, capacity): 10 | self.buffer = collections.deque(maxlen=capacity) 11 | 12 | def add(self, state, action, reward, next_state, done): 13 | self.buffer.append((state, action, reward, next_state, done)) 14 | 15 | def sample(self, batch_size): 16 | transitions = random.sample(self.buffer, batch_size) 17 | state, action, reward, next_state, done = zip(*transitions) 18 | return np.array(state), action, reward, np.array(next_state), done 19 | 20 | def size(self): 21 | return len(self.buffer) 22 | 23 | 24 | def moving_average(a, window_size): 25 | cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 26 | middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size 27 | r = np.arange(1, window_size - 1, 2) 28 | begin = np.cumsum(a[:window_size - 1])[::2] / r 29 | end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1] 30 | return np.concatenate((begin, middle, end)) 31 | 32 | 33 | def train_on_policy_agent(env, agent, num_episodes): 34 | return_list = [] 35 | for i in range(10): 36 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 37 | for i_episode in range(int(num_episodes / 10)): 38 | episode_return = 0 39 | transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []} 40 | state = env.reset() 41 | done = False 42 | while not done: 43 | action = agent.take_action(state) 44 | next_state, reward, done, _ = env.step(action) 45 | transition_dict['states'].append(state) 46 | transition_dict['actions'].append(action) 47 | transition_dict['next_states'].append(next_state) 48 | transition_dict['rewards'].append(reward) 49 | transition_dict['dones'].append(done) 50 | state = next_state 51 | episode_return += reward 52 | return_list.append(episode_return) 53 | agent.update(transition_dict) 54 | if (i_episode + 1) % 10 == 0: 55 | pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1), 56 | 'return': '%.3f' % np.mean(return_list[-10:])}) 57 | pbar.update(1) 58 | return return_list 59 | 60 | 61 | def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size): 62 | return_list = [] 63 | for i in range(10): 64 | with tqdm(total=int(num_episodes / 
10), desc='Iteration %d' % i) as pbar: 65 | for i_episode in range(int(num_episodes / 10)): 66 | episode_return = 0 67 | state = env.reset() 68 | done = False 69 | while not done: 70 | action = agent.take_action(state) 71 | next_state, reward, done, _ = env.step(action) 72 | replay_buffer.add(state, action, reward, next_state, done) 73 | state = next_state 74 | episode_return += reward 75 | if replay_buffer.size() > minimal_size: 76 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 77 | transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 78 | 'dones': b_d} 79 | agent.update(transition_dict) 80 | return_list.append(episode_return) 81 | if (i_episode + 1) % 10 == 0: 82 | pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1), 83 | 'return': '%.3f' % np.mean(return_list[-10:])}) 84 | pbar.update(1) 85 | return return_list 86 | 87 | 88 | def compute_advantage(gamma, lmbda, td_delta): 89 | td_delta = td_delta.detach().numpy() 90 | advantage_list = [] 91 | advantage = 0.0 92 | for delta in td_delta[::-1]: 93 | advantage = gamma * lmbda * advantage + delta 94 | advantage_list.append(advantage) 95 | advantage_list.reverse() 96 | return torch.tensor(advantage_list, dtype=torch.float) 97 | --------------------------------------------------------------------------------
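# --- Editor's note (hedged sketch appended by the editor; not in the repo) ----
# The helpers in rl_utils.py rely on an implicit agent interface:
# `take_action(state)` must return an action the environment accepts, and
# `update(transition_dict)` receives a dict with 'states', 'actions',
# 'rewards', 'next_states' and 'dones'. A minimal random agent satisfying this
# contract is handy for smoke-testing the training loops (all names below are
# the editor's; the old gym reset/step API is assumed, matching the repo):
import gym
import rl_utils


class RandomAgent:
    """Random policy implementing the interface expected by rl_utils."""
    def __init__(self, action_space):
        self.action_space = action_space

    def take_action(self, state):
        return self.action_space.sample()   # ignore the state, act randomly

    def update(self, transition_dict):
        pass                                # a real agent would take a gradient step here


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    agent = RandomAgent(env.action_space)
    returns = rl_utils.train_on_policy_agent(env, agent, num_episodes=10)
    print('mean return of the random agent:', sum(returns) / len(returns))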