├── .gitignore
├── 01_mnist.py
├── 02_pi.py
├── 03_cartpole.py
├── 04_dqn.py
├── 05_sarsa.py
├── 05_sarsa_v2.py
├── 06_doubledqn.py
├── 07_a2c.py
├── 07_reinforce.py
├── 08_a2c.py
├── 08_reinforce_with_baseline.py
├── 09_trpo.py
├── 10_ddpg.py
├── 10_td3.py
├── 13_a3c.py
├── 14_mpe.py
├── 15_mac_a2c.py
├── ReadMe.md
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.vscode/
output*/
__pycache__/
.ipynb_checkpoints/
mnist_data/

*.ipynb
*.png
*.sh

tests/
test*.py
test.md

--------------------------------------------------------------------------------
/01_mnist.py:
--------------------------------------------------------------------------------
"""Fully connected network for MNIST classification (Section 1.2.1).
Adapted from pytorch/examples: https://github.com/pytorch/examples/blob/main/mnist/main.py.
"""
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Pass dim explicitly; calling log_softmax without it is deprecated and warns.
        output = F.log_softmax(self.fc3(x), dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(epoch, batch_idx * len(data), len(train_loader.dataset), 100.0 * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction="sum").item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print("\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)))
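
# Note: log_softmax followed by nll_loss is numerically the same as cross_entropy
# applied to raw logits, which is why the network above outputs log-probabilities.
# A small self-contained check (illustrative helper, defined but never called;
# it relies on the torch/F imports at the top of this file):
def _check_cross_entropy_equivalence():
    logits = torch.randn(8, 10)
    target = torch.randint(0, 10, (8,))
    loss_a = F.nll_loss(F.log_softmax(logits, dim=1), target)
    loss_b = F.cross_entropy(logits, target)
    assert torch.allclose(loss_a, loss_b)
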
metavar="M", help="Learning rate step gamma (default: 0.7)") 68 | parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training") 69 | parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass") 70 | parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") 71 | parser.add_argument("--log-interval", type=int, default=10, metavar="N", help="how many batches to wait before logging training status") 72 | parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model") 73 | args = parser.parse_args() 74 | use_cuda = not args.no_cuda and torch.cuda.is_available() 75 | 76 | torch.manual_seed(args.seed) 77 | 78 | device = torch.device("cuda" if use_cuda else "cpu") 79 | 80 | train_kwargs = {"batch_size": args.batch_size} 81 | test_kwargs = {"batch_size": args.test_batch_size} 82 | if use_cuda: 83 | cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True} 84 | train_kwargs.update(cuda_kwargs) 85 | test_kwargs.update(cuda_kwargs) 86 | 87 | transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) 88 | dataset1 = datasets.MNIST("mnist_data/", train=True, download=True, transform=transform) 89 | dataset2 = datasets.MNIST("mnist_data/", train=False, transform=transform) 90 | train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) 91 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 92 | 93 | model = Net().to(device) 94 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 95 | 96 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 97 | for epoch in range(1, args.epochs + 1): 98 | train(args, model, device, train_loader, optimizer, epoch) 99 | test(model, device, test_loader) 100 | scheduler.step() 101 | 102 | if args.save_model: 103 | torch.save(model.state_dict(), "mnist_cnn.pt") 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /02_pi.py: -------------------------------------------------------------------------------- 1 | # 2.2节,蒙特卡洛近似计算圆周率。 2 | import numpy as np 3 | 4 | 5 | def approxiate_pi(n: int): 6 | # 在[-1, 1] x [-1, 1]的空间中随机取n个点。 7 | x_lst = np.random.uniform(-1, 1, size=n) 8 | y_lst = np.random.uniform(-1, 1, size=n) 9 | # 统计距离圆心距离在1以内的点。 10 | m = 0 11 | for x, y in zip(x_lst, y_lst): 12 | if x ** 2 + y ** 2 <= 1: 13 | m += 1 14 | # 近似计算圆周率。 15 | pi = 4 * m / n 16 | return pi 17 | 18 | 19 | if __name__ == "__main__": 20 | pi = approxiate_pi(100) 21 | print("100个点近似的圆周率:", pi) 22 | 23 | pi = approxiate_pi(10000) 24 | print("10000个点近似的圆周率:", pi) 25 | 26 | pi = approxiate_pi(1000000) 27 | print("1000000个点近似的圆周率:", pi) 28 | -------------------------------------------------------------------------------- /03_cartpole.py: -------------------------------------------------------------------------------- 1 | """3.6节实验环境。 2 | """ 3 | 4 | 5 | import gym 6 | 7 | 8 | env = gym.make("CartPole-v0", render_mode="human") 9 | state = env.reset() 10 | 11 | 12 | for t in range(1000): 13 | env.render() 14 | print(state) 15 | 16 | action = env.action_space.sample() 17 | 18 | state, reward, terminated, truncated, info = env.step(action) 19 | 20 | if terminated or truncated: 21 | print("Finished") 22 | state = env.reset() 23 | 24 | env.close() 25 | -------------------------------------------------------------------------------- /04_dqn.py: 
-------------------------------------------------------------------------------- 1 | """4.3节DQN算法实现。 2 | """ 3 | import argparse 4 | from collections import defaultdict 5 | import os 6 | import random 7 | from dataclasses import dataclass, field 8 | import gym 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | class QNet(nn.Module): 17 | """QNet. 18 | Input: feature 19 | Output: num_act of values 20 | """ 21 | 22 | def __init__(self, dim_state, num_action): 23 | super().__init__() 24 | self.fc1 = nn.Linear(dim_state, 64) 25 | self.fc2 = nn.Linear(64, 32) 26 | self.fc3 = nn.Linear(32, num_action) 27 | 28 | def forward(self, state): 29 | x = F.relu(self.fc1(state)) 30 | x = F.relu(self.fc2(x)) 31 | x = self.fc3(x) 32 | return x 33 | 34 | 35 | class DQN: 36 | def __init__(self, dim_state=None, num_action=None, discount=0.9): 37 | self.discount = discount 38 | self.Q = QNet(dim_state, num_action) 39 | self.target_Q = QNet(dim_state, num_action) 40 | self.target_Q.load_state_dict(self.Q.state_dict()) 41 | 42 | def get_action(self, state): 43 | qvals = self.Q(state) 44 | return qvals.argmax() 45 | 46 | def compute_loss(self, s_batch, a_batch, r_batch, d_batch, next_s_batch): 47 | # 计算s_batch,a_batch对应的值。 48 | qvals = self.Q(s_batch).gather(1, a_batch.unsqueeze(1)).squeeze() 49 | # 使用target Q网络计算next_s_batch对应的值。 50 | next_qvals, _ = self.target_Q(next_s_batch).detach().max(dim=1) 51 | # 使用MSE计算loss。 52 | loss = F.mse_loss(r_batch + self.discount * next_qvals * (1 - d_batch), qvals) 53 | return loss 54 | 55 | 56 | def soft_update(target, source, tau=0.01): 57 | """ 58 | update target by target = tau * source + (1 - tau) * target. 59 | """ 60 | for target_param, param in zip(target.parameters(), source.parameters()): 61 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 62 | 63 | 64 | @dataclass 65 | class ReplayBuffer: 66 | maxsize: int 67 | size: int = 0 68 | state: list = field(default_factory=list) 69 | action: list = field(default_factory=list) 70 | next_state: list = field(default_factory=list) 71 | reward: list = field(default_factory=list) 72 | done: list = field(default_factory=list) 73 | 74 | def push(self, state, action, reward, done, next_state): 75 | if self.size < self.maxsize: 76 | self.state.append(state) 77 | self.action.append(action) 78 | self.reward.append(reward) 79 | self.done.append(done) 80 | self.next_state.append(next_state) 81 | else: 82 | position = self.size % self.maxsize 83 | self.state[position] = state 84 | self.action[position] = action 85 | self.reward[position] = reward 86 | self.done[position] = done 87 | self.next_state[position] = next_state 88 | self.size += 1 89 | 90 | def sample(self, n): 91 | total_number = self.size if self.size < self.maxsize else self.maxsize 92 | indices = np.random.randint(total_number, size=n) 93 | state = [self.state[i] for i in indices] 94 | action = [self.action[i] for i in indices] 95 | reward = [self.reward[i] for i in indices] 96 | done = [self.done[i] for i in indices] 97 | next_state = [self.next_state[i] for i in indices] 98 | return state, action, reward, done, next_state 99 | 100 | 101 | def set_seed(args): 102 | random.seed(args.seed) 103 | np.random.seed(args.seed) 104 | torch.manual_seed(args.seed) 105 | if not args.no_cuda: 106 | torch.cuda.manual_seed(args.seed) 107 | 108 | 109 | def train(args, env, agent): 110 | replay_buffer = ReplayBuffer(10_000) 111 | optimizer = 
torch.optim.Adam(agent.Q.parameters(), lr=args.lr) 112 | optimizer.zero_grad() 113 | 114 | epsilon = 1 115 | epsilon_max = 1 116 | epsilon_min = 0.1 117 | episode_reward = 0 118 | episode_length = 0 119 | max_episode_reward = -float("inf") 120 | log = defaultdict(list) 121 | log["loss"].append(0) 122 | 123 | agent.Q.train() 124 | state, _ = env.reset(seed=args.seed) 125 | for i in range(args.max_steps): 126 | if np.random.rand() < epsilon or i < args.warmup_steps: 127 | action = env.action_space.sample() 128 | else: 129 | action = agent.get_action(torch.from_numpy(state)) 130 | action = action.item() 131 | next_state, reward, terminated, truncated, _ = env.step(action) 132 | done = terminated or truncated 133 | episode_reward += reward 134 | episode_length += 1 135 | 136 | replay_buffer.push(state, action, reward, done, next_state) 137 | state = next_state 138 | 139 | if done is True: 140 | log["episode_reward"].append(episode_reward) 141 | log["episode_length"].append(episode_length) 142 | 143 | print(f"i={i}, reward={episode_reward:.0f}, length={episode_length}, max_reward={max_episode_reward}, loss={log['loss'][-1]:.1e}, epsilon={epsilon:.3f}") 144 | 145 | # 如果得分更高,保存模型。 146 | if episode_reward > max_episode_reward: 147 | save_path = os.path.join(args.output_dir, "model.bin") 148 | torch.save(agent.Q.state_dict(), save_path) 149 | max_episode_reward = episode_reward 150 | 151 | episode_reward = 0 152 | episode_length = 0 153 | epsilon = max(epsilon - (epsilon_max - epsilon_min) * args.epsilon_decay, 1e-1) 154 | state, _ = env.reset() 155 | 156 | if i > args.warmup_steps: 157 | bs, ba, br, bd, bns = replay_buffer.sample(n=args.batch_size) 158 | bs = torch.tensor(bs, dtype=torch.float32) 159 | ba = torch.tensor(ba, dtype=torch.long) 160 | br = torch.tensor(br, dtype=torch.float32) 161 | bd = torch.tensor(bd, dtype=torch.float32) 162 | bns = torch.tensor(bns, dtype=torch.float32) 163 | 164 | loss = agent.compute_loss(bs, ba, br, bd, bns) 165 | loss.backward() 166 | optimizer.step() 167 | optimizer.zero_grad() 168 | 169 | log["loss"].append(loss.item()) 170 | 171 | soft_update(agent.target_Q, agent.Q) 172 | 173 | # 3. 
Plot the training curves.
    plt.plot(log["loss"])
    plt.yscale("log")
    plt.savefig(f"{args.output_dir}/loss.png", bbox_inches="tight")
    plt.close()

    plt.plot(np.cumsum(log["episode_length"]), log["episode_reward"])
    plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight")
    plt.close()


def eval(args, env, agent):
    agent = DQN(args.dim_state, args.num_action)
    model_path = os.path.join(args.output_dir, "model.bin")
    agent.Q.load_state_dict(torch.load(model_path))

    episode_length = 0
    episode_reward = 0
    state, _ = env.reset()
    for i in range(5000):
        episode_length += 1
        action = agent.get_action(torch.from_numpy(state)).item()
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        env.render()
        episode_reward += reward

        state = next_state
        if done is True:
            print(f"episode reward={episode_reward}, episode length={episode_length}")
            state, _ = env.reset()
            episode_length = 0
            episode_reward = 0


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.")
    parser.add_argument("--dim_state", default=4, type=int, help="Dimension of state.")
    parser.add_argument("--num_action", default=2, type=int, help="Number of action.")
    parser.add_argument("--discount", default=0.99, type=float, help="Discount coefficient.")
    parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.")
    parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size.")
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--seed", default=42, type=int, help="Random seed.")
    parser.add_argument("--warmup_steps", default=10_000, type=int, help="Warmup steps without training.")
    parser.add_argument("--output_dir", default="output", type=str, help="Output directory.")
    parser.add_argument("--epsilon_decay", default=1 / 1000, type=float, help="Epsilon-greedy algorithm decay coefficient.")
    parser.add_argument("--do_train", action="store_true", help="Train policy.")
    parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.")
    args = parser.parse_args()

    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Make sure the output directory exists before models and figures are saved.
    os.makedirs(args.output_dir, exist_ok=True)

    env = gym.make(args.env)
    set_seed(args)
    agent = DQN(dim_state=args.dim_state, num_action=args.num_action, discount=args.discount)
    agent.Q.to(args.device)
    agent.target_Q.to(args.device)

    if args.do_train:
        train(args, env, agent)

    if args.do_eval:
        eval(args, env, agent)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/05_sarsa.py:
--------------------------------------------------------------------------------
"""Tabular SARSA implementation (Section 5.1).
"""
import argparse
import os
import random
from collections import defaultdict
import gym
import matplotlib.pyplot as plt
import numpy as np


class SARSA:
    def __init__(self, num_state: int, num_action: int, low_state: np.array, high_state: np.array,
num_discrete: int = 50): 14 | self.table = defaultdict(float) 15 | self.low_state = low_state 16 | self.high_state = high_state 17 | self.num_action = num_action 18 | self.num_discrete = num_discrete 19 | 20 | def get(self, state: np.array, action: int): 21 | index = self.get_index(state, action) 22 | return self.table[index] 23 | 24 | def set(self, state: np.array, action: int, value: float): 25 | index = self.get_index(state, action) 26 | self.table[index] = value 27 | 28 | def get_index(self, state: np.array, action: int): 29 | index_list = [] 30 | state = state[2:] 31 | for i in range(len(state)): 32 | index = (state[i] - self.low_state[i]) // ((self.high_state[i] - self.low_state[i]) / self.num_discrete) 33 | index = int(index) 34 | index_list.append(index) 35 | index_list.append(action) 36 | return tuple(index_list) 37 | 38 | def get_action(self, state: np.array): 39 | # 选取分值最高的action。 40 | max_value = -float("inf") 41 | action = None 42 | for i in range(self.num_action): 43 | index = self.get_index(state, i) 44 | value = self.table[index] 45 | if value > max_value: 46 | max_value = value 47 | action = i 48 | return action 49 | 50 | 51 | def set_seed(args): 52 | random.seed(args.seed) 53 | np.random.seed(args.seed) 54 | 55 | 56 | def train(args, env, agent: SARSA): 57 | epsilon = 0.2 58 | max_episode_reward = -float("inf") 59 | episode_reward = 0 60 | episode_length = 0 61 | log = defaultdict(list) 62 | 63 | state, _ = env.reset(seed=args.seed) 64 | for step in range(args.max_steps): 65 | if np.random.rand() < epsilon: 66 | action = env.action_space.sample() 67 | else: 68 | action = agent.get_action(state) 69 | 70 | next_state, reward, terminated, truncated, _ = env.step(action) 71 | done = terminated or truncated 72 | 73 | episode_reward += reward 74 | episode_length += 1 75 | 76 | # 6. 计算 TD 目标和 TD 误差。 77 | if np.random.rand() < epsilon: 78 | new_action = env.action_space.sample() 79 | else: 80 | new_action = agent.get_action(next_state) 81 | 82 | target = reward + args.gamma * agent.get(next_state, new_action) 83 | td_delta = agent.get(state, action) - target 84 | 85 | # 7. 更新表格中 (st, at) 位置上的元素。 86 | value = agent.get(state, action) - args.lr * td_delta 87 | agent.set(state, action, value) 88 | 89 | state = next_state 90 | 91 | # 一个episode结束后,进行训练。 92 | if done is True: 93 | log["episode_reward"].append(episode_reward) 94 | log["episode_length"].append(episode_length) 95 | 96 | max_episode_reward = max(log["episode_reward"]) 97 | print(f"step={step}, reward={episode_reward}, length={episode_length}, max_reward={max_episode_reward}, epsilon={epsilon:.2f}") 98 | 99 | # epsilon = max(epsilon - (epsilon_max - epsilon_min) * args.epsilon_decay, 1e-1) 100 | episode_reward = 0 101 | episode_length = 0 102 | state, _ = env.reset() 103 | 104 | # 3. 
画图。 105 | os.makedirs(args.output_dir, exist_ok=True) 106 | plt.plot(np.cumsum(log["episode_length"]), log["episode_reward"]) 107 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 108 | plt.close() 109 | 110 | 111 | def main(): 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 114 | parser.add_argument("--num_state", default=4, type=int, help="Dimension of observation.") 115 | parser.add_argument("--num_action", default=2, type=int, help="Number of actions.") 116 | 117 | parser.add_argument("--gamma", default=0.95, type=float, help="Discount coefficient.") 118 | parser.add_argument("--max_steps", default=200_000, type=int, help="Maximum steps for interaction.") 119 | parser.add_argument("--lr", default=0.2, type=float, help="Learning rate.") 120 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 121 | 122 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 123 | 124 | args = parser.parse_args() 125 | 126 | env = gym.make(args.env) 127 | set_seed(args) 128 | 129 | # 角度是这个问题的主要特征。为了减少搜索空间,这里只分析角度。 130 | low_state = np.array([env.observation_space.low[2], -1.0]) 131 | high_state = np.array([env.observation_space.high[2], 1.0]) 132 | agent = SARSA(num_state=args.num_state, num_action=args.num_action, low_state=low_state, high_state=high_state) 133 | 134 | train(args, env, agent) 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /05_sarsa_v2.py: -------------------------------------------------------------------------------- 1 | """5.3节多步SARSA算法实现。 2 | """ 3 | import argparse 4 | import os 5 | import random 6 | from collections import defaultdict 7 | import gym 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | 15 | class QNet(nn.Module): 16 | def __init__(self, dim_state, num_action): 17 | super().__init__() 18 | self.fc1 = nn.Linear(dim_state, 64) 19 | self.fc2 = nn.Linear(64, 32) 20 | self.fc3 = nn.Linear(32, num_action) 21 | 22 | def forward(self, state): 23 | x = F.relu(self.fc1(state)) 24 | x = F.relu(self.fc2(x)) 25 | x = self.fc3(x) 26 | return x 27 | 28 | 29 | class SARSA: 30 | def __init__(self, dim_state, num_action, gamma=0.99): 31 | self.gamma = gamma 32 | self.Q = QNet(dim_state, num_action) 33 | self.target_Q = QNet(dim_state, num_action) 34 | self.target_Q.load_state_dict(self.Q.state_dict()) 35 | 36 | def get_action(self, state): 37 | qvals = self.Q(state) 38 | return qvals.argmax(dim=-1) 39 | 40 | def compute_loss(self, args, s_list, a_list, r_list): 41 | batch_s = np.array(s_list) 42 | batch_s = torch.tensor(batch_s, dtype=torch.float32) 43 | batch_a = np.array(a_list) 44 | batch_a = torch.tensor(batch_a, dtype=torch.long) 45 | 46 | # 2. 对于所有的t=1,...,n-m,计算$\hat{q}_t = q(s_t, a_t; w_{now}$。 47 | num = len(r_list) 48 | state = batch_s[: num - args.m + 1, :] 49 | action = batch_a[: num - args.m + 1] 50 | qvals = self.Q(state).gather(1, action.unsqueeze(1)).squeeze() 51 | 52 | # s0, a0, r0, s1, a1, r1, s2, a2, r2, s3, a3, r3, ..., s6, a6, r6 53 | 54 | # s0, s1, s2 55 | 56 | # s5, s6, 57 | 58 | # 3. 
对于所有的 t = 1, ..., n − m,计算多步 TD 目标和 TD 误差。 59 | R = 0 60 | for i in reversed(range(num - args.m, num)): 61 | R = args.gamma * R + r_list[i] 62 | 63 | rewards = [R] 64 | for i in reversed(range(num - args.m)): 65 | R -= args.gamma ** (args.m - 1) * r_list[i + args.m] 66 | R = args.gamma * R + r_list[i] 67 | rewards.append(R) 68 | rewards.reverse() 69 | rewards = torch.tensor(rewards, dtype=torch.float32) 70 | 71 | with torch.no_grad(): 72 | state = batch_s[args.m :, :] 73 | action = batch_a[args.m :] 74 | m_step_qvals = self.target_Q(state).gather(1, action.unsqueeze(1)).squeeze() 75 | 76 | target_values = args.gamma**args.m * m_step_qvals 77 | target_values = torch.cat([target_values, torch.tensor([0.0])]) 78 | target_values = target_values + rewards 79 | 80 | td_delta = qvals - target_values 81 | 82 | # 计算loss。 83 | value_loss = td_delta.square().mean() 84 | return value_loss 85 | 86 | def soft_update(self, tau=0.01): 87 | for target_param, param in zip(self.target_Q.parameters(), self.Q.parameters()): 88 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 89 | 90 | 91 | def set_seed(args): 92 | random.seed(args.seed) 93 | np.random.seed(args.seed) 94 | torch.manual_seed(args.seed) 95 | if not args.no_cuda: 96 | torch.cuda.manual_seed(args.seed) 97 | 98 | 99 | def train(args, env, agent): 100 | optimizer = torch.optim.Adam(agent.Q.parameters(), lr=args.lr) 101 | 102 | epsilon = 1 103 | epsilon_max = 1 104 | epsilon_min = 0.1 105 | max_episode_reward = -float("inf") 106 | episode_reward = 0 107 | episode_length = 0 108 | log = defaultdict(list) 109 | s_list = [] 110 | a_list = [] 111 | r_list = [] 112 | 113 | state, _ = env.reset(seed=args.seed) 114 | for step in range(args.max_steps): 115 | if np.random.rand() < epsilon: 116 | action = env.action_space.sample() 117 | else: 118 | action = agent.get_action(torch.from_numpy(state)) 119 | action = action.item() 120 | next_state, reward, terminated, truncated, _ = env.step(action) 121 | done = terminated or truncated 122 | 123 | s_list.append(state) 124 | a_list.append(action) 125 | r_list.append(reward) 126 | 127 | episode_reward += reward 128 | episode_length += 1 129 | 130 | state = next_state 131 | 132 | # 一个episode结束后,进行训练。 133 | if done is True: 134 | log["episode_reward"].append(episode_reward) 135 | log["episode_length"].append(episode_length) 136 | 137 | if episode_reward > max_episode_reward: 138 | save_path = os.path.join(args.output_dir, "model.bin") 139 | torch.save(agent.Q.state_dict(), save_path) 140 | max_episode_reward = episode_reward 141 | 142 | # 训练模型。 143 | for _ in range(2): 144 | loss = agent.compute_loss(args, s_list, a_list, r_list) 145 | optimizer.zero_grad() 146 | loss.backward() 147 | optimizer.step() 148 | 149 | agent.soft_update() 150 | 151 | print(f"step={step}, reward={episode_reward}, length={episode_length}, max_reward={max_episode_reward}, epsilon={epsilon:.2f}, loss={loss.item():.5f}") 152 | 153 | log["loss"].append(loss.item()) 154 | 155 | epsilon = max(epsilon - (epsilon_max - epsilon_min) * args.epsilon_decay, 1e-1) 156 | episode_reward = 0 157 | episode_length = 0 158 | state, _ = env.reset() 159 | 160 | # 3. 
Plot the training curves.
    plt.plot(log["loss"])
    plt.yscale("log")
    plt.savefig(f"{args.output_dir}/loss.png", bbox_inches="tight")
    plt.close()

    plt.plot(np.cumsum(log["episode_length"]), log["episode_reward"])
    plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight")
    plt.close()


def eval(args, env, agent):
    agent = SARSA(args.dim_state, args.num_action)
    model_path = os.path.join(args.output_dir, "model.bin")
    agent.Q.load_state_dict(torch.load(model_path))

    episode_length = 0
    episode_reward = 0
    # Use the gym >= 0.26 API, matching train(): reset() returns (obs, info) and
    # step() returns (obs, reward, terminated, truncated, info).
    state, _ = env.reset()
    for i in range(5000):
        episode_length += 1
        action = agent.get_action(torch.from_numpy(state)).item()
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        env.render()
        episode_reward += reward

        state = next_state
        if done is True:
            print(f"episode reward={episode_reward}, episode length={episode_length}")
            state, _ = env.reset()
            episode_length = 0
            episode_reward = 0


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.")
    parser.add_argument("--dim_state", default=4, type=int, help="Dimension of observation.")
    parser.add_argument("--num_action", default=2, type=int, help="Number of actions.")
    parser.add_argument("--m", default=5, type=int, help="Multi-step TD target.")

    parser.add_argument("--gamma", default=0.99, type=float, help="Discount coefficient.")
    parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.")
    parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.")
    parser.add_argument("--batch_size", default=64, type=int, help="Batch size.")
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--seed", default=42, type=int, help="Random seed.")

    parser.add_argument("--epsilon_decay", default=1 / 2000, type=float, help="Epsilon-greedy algorithm decay coefficient.")
    parser.add_argument("--output_dir", default="output", type=str, help="Output directory.")
    parser.add_argument("--do_train", action="store_true", help="Train policy.")
    parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.")
    args = parser.parse_args()

    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Make sure the output directory exists before models and figures are saved.
    os.makedirs(args.output_dir, exist_ok=True)

    env = gym.make(args.env)
    set_seed(args)
    agent = SARSA(dim_state=args.dim_state, num_action=args.num_action, gamma=args.gamma)
    agent.Q.to(args.device)

    if args.do_train:
        train(args, env, agent)

    if args.do_eval:
        eval(args, env, agent)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/06_doubledqn.py:
--------------------------------------------------------------------------------
import argparse
import os
import random
from dataclasses import dataclass, field
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

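
# Double DQN note: the vanilla DQN target r + discount * max_a' Q_target(s', a')
# lets one network both select and evaluate the next action, which tends to
# overestimate Q-values. Double DQN decouples the two roles: the online network
# picks the action, the target network scores it. A minimal sketch of that
# target (helper name is illustrative; compute_loss below does the same inline):
def double_dqn_target(model, target_model, r_batch, d_batch, next_s_batch, discount):
    with torch.no_grad():
        # Action selection with the online network.
        next_actions = model(next_s_batch).argmax(dim=1, keepdim=True)
        # Action evaluation with the target network.
        next_qvals = target_model(next_s_batch).gather(1, next_actions).squeeze(1)
    return r_batch + discount * next_qvals * (1 - d_batch)
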

class QNet(nn.Module):
    """QNet.
    Input: feature
    Output: num_act of values
    """

    def __init__(self, dim_obs, num_act):
        super().__init__()
        self.fc1 = nn.Linear(dim_obs, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_act)

    def forward(self, obs):
        x = F.relu(self.fc1(obs))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class DoubleDQN:
    def __init__(self, dim_obs=None, num_act=None, discount=0.9):
        self.discount = discount
        self.model = QNet(dim_obs, num_act)
        self.target_model = QNet(dim_obs, num_act)
        self.target_model.load_state_dict(self.model.state_dict())

    def get_action(self, obs):
        qvals = self.model(obs)
        return qvals.argmax()

    def compute_loss(self, s_batch, a_batch, r_batch, d_batch, next_s_batch):
        # Compute current Q value based on current states and actions.
        qvals = self.model(s_batch).gather(1, a_batch.unsqueeze(1)).squeeze()
        # Double DQN target: the online network selects the next action, the
        # target network evaluates it. Keep the target out of the gradient
        # computation to avoid divergence.
        with torch.no_grad():
            next_actions = self.model(next_s_batch).argmax(dim=1, keepdim=True)
            next_qvals = self.target_model(next_s_batch).gather(1, next_actions).squeeze(1)
        loss = F.mse_loss(r_batch + self.discount * next_qvals * (1 - d_batch), qvals)
        return loss


@dataclass
class ReplayBuffer:
    maxsize: int
    size: int = 0
    state: list = field(default_factory=list)
    action: list = field(default_factory=list)
    next_state: list = field(default_factory=list)
    reward: list = field(default_factory=list)
    done: list = field(default_factory=list)

    def push(self, state, action, reward, done, next_state):
        if self.size < self.maxsize:
            self.state.append(state)
            self.action.append(action)
            self.reward.append(reward)
            self.done.append(done)
            self.next_state.append(next_state)
        else:
            position = self.size % self.maxsize
            self.state[position] = state
            self.action[position] = action
            self.reward[position] = reward
            self.done[position] = done
            self.next_state[position] = next_state
        self.size += 1

    def sample(self, n):
        total_number = self.size if self.size < self.maxsize else self.maxsize
        indices = np.random.randint(total_number, size=n)
        state = [self.state[i] for i in indices]
        action = [self.action[i] for i in indices]
        reward = [self.reward[i] for i in indices]
        done = [self.done[i] for i in indices]
        next_state = [self.next_state[i] for i in indices]
        return state, action, reward, done, next_state


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if not args.no_cuda:
        torch.cuda.manual_seed(args.seed)


def train(args, env, agent):
    replay_buffer = ReplayBuffer(100_000)
    optimizer = torch.optim.Adam(agent.model.parameters(), lr=args.lr)
    optimizer.zero_grad()

    epsilon = 1
    episode_reward = 0
    episode_length = 0
    max_episode_reward = -float("inf")
    log_ep_length = []
    log_ep_rewards = []
    log_losses = [0]

    agent.model.train()
    agent.target_model.train()
    agent.model.zero_grad()
    agent.target_model.zero_grad()
    state, _ = env.reset()
    for i in range(args.max_steps):
        if np.random.rand() < epsilon or i < args.warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.get_action(torch.from_numpy(state))
            action = action.item()
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or
truncated 123 | episode_reward += reward 124 | 125 | # 修改奖励,加速训练。 126 | episode_length += 1 127 | if done is True and episode_length < 200: 128 | reward = 250 + episode_reward 129 | else: 130 | reward = 5 * abs(next_state[0] - state[0]) + 3 * abs(state[1]) 131 | replay_buffer.push(state, action, reward, done, next_state) 132 | state = next_state 133 | 134 | if done is True: 135 | log_ep_rewards.append(episode_reward) 136 | log_ep_length.append(episode_length) 137 | 138 | epsilon = max(epsilon * args.epsilon_decay, 1e-3) 139 | 140 | print(f"i={i}, reward={episode_reward:.0f}, length={episode_length}, max_reward={max_episode_reward}, loss={log_losses[-1]:.1e}, epsilon={epsilon:.3f}") 141 | 142 | if episode_length < 180 and episode_reward > max_episode_reward: 143 | save_path = os.path.join(args.output_dir, "model.bin") 144 | torch.save(agent.model.state_dict(), save_path) 145 | max_episode_reward = episode_reward 146 | 147 | episode_reward = 0 148 | episode_length = 0 149 | state, _ = env.reset() 150 | 151 | if i > args.warmup_steps: 152 | bs, ba, br, bd, bns = replay_buffer.sample(n=args.batch_size) 153 | bs = torch.tensor(bs, dtype=torch.float32) 154 | ba = torch.tensor(ba, dtype=torch.long) 155 | br = torch.tensor(br, dtype=torch.float32) 156 | bd = torch.tensor(bd, dtype=torch.float32) 157 | bns = torch.tensor(bns, dtype=torch.float32) 158 | 159 | loss = agent.compute_loss(bs, ba, br, bd, bns) 160 | loss.backward() 161 | optimizer.step() 162 | optimizer.zero_grad() 163 | 164 | log_losses.append(loss.item()) 165 | 166 | # 更新目标网络。 167 | for target_param, param in zip(agent.target_model.parameters(), agent.model.parameters()): 168 | target_param.data.copy_(args.lr_target * param.data + (1 - args.lr_target) * target_param.data) 169 | 170 | plt.plot(log_losses) 171 | plt.yscale("log") 172 | plt.savefig(f"{args.output_dir}/loss.png", bbox_inches="tight") 173 | plt.close() 174 | 175 | plt.plot(np.cumsum(log_ep_length), log_ep_rewards) 176 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 177 | plt.close() 178 | 179 | 180 | def eval(args, env, agent): 181 | model_path = os.path.join(args.output_dir, "model.bin") 182 | agent.model.load_state_dict(torch.load(model_path)) 183 | 184 | episode_length = 0 185 | episode_reward = 0 186 | 187 | agent.model.eval() 188 | state, _ = env.reset() 189 | for i in range(5000): 190 | episode_length += 1 191 | action = agent.get_action(torch.from_numpy(state)).item() 192 | next_state, reward, terminated, truncated, _ = env.step(action) 193 | done = terminated or truncated 194 | episode_reward += reward 195 | 196 | state = next_state 197 | if done is True: 198 | print(f"episode reward={episode_reward}, episode length={episode_length}") 199 | state, _ = env.reset() 200 | episode_length = 0 201 | episode_reward = 0 202 | 203 | 204 | def main(): 205 | parser = argparse.ArgumentParser() 206 | parser.add_argument("--env", default="MountainCar-v0", type=str, help="Environment name.") 207 | parser.add_argument("--dim_obs", default=2, type=int, help="Dimension of observation.") 208 | parser.add_argument("--num_act", default=3, type=int, help="Number of actions.") 209 | parser.add_argument("--discount", default=0.95, type=float, help="Discount coefficient.") 210 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 211 | parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.") 212 | parser.add_argument("--lr_target", default=1e-3, type=float, help="Update target net.") 213 | 
parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 214 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 215 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 216 | parser.add_argument("--warmup_steps", default=10_000, type=int, help="Warmup steps without training.") 217 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 218 | parser.add_argument("--epsilon_decay", default=0.99, type=float, help="Epsilon-greedy algorithm decay coefficient.") 219 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 220 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 221 | args = parser.parse_args() 222 | 223 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 224 | 225 | env = gym.make(args.env) 226 | set_seed(args) 227 | agent = DoubleDQN(dim_obs=args.dim_obs, num_act=args.num_act, discount=args.discount) 228 | agent.model.to(args.device) 229 | 230 | if args.do_train: 231 | train(args, env, agent) 232 | 233 | if args.do_eval: 234 | eval(args, env, agent) 235 | 236 | 237 | if __name__ == "__main__": 238 | main() 239 | -------------------------------------------------------------------------------- /07_a2c.py: -------------------------------------------------------------------------------- 1 | """8.3节A2C算法实现。""" 2 | import argparse 3 | import os 4 | from collections import defaultdict 5 | import gym 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.distributions import Categorical 12 | 13 | 14 | class ValueNet(nn.Module): 15 | def __init__(self, dim_state): 16 | super().__init__() 17 | self.fc1 = nn.Linear(dim_state, 64) 18 | self.fc2 = nn.Linear(64, 32) 19 | self.fc3 = nn.Linear(32, 1) 20 | 21 | def forward(self, state): 22 | x = F.relu(self.fc1(state)) 23 | x = F.relu(self.fc2(x)) 24 | x = self.fc3(x) 25 | return x 26 | 27 | 28 | class PolicyNet(nn.Module): 29 | def __init__(self, dim_state, num_action): 30 | super().__init__() 31 | self.fc1 = nn.Linear(dim_state, 64) 32 | self.fc2 = nn.Linear(64, 32) 33 | self.fc3 = nn.Linear(32, num_action) 34 | 35 | def forward(self, state): 36 | x = F.relu(self.fc1(state)) 37 | x = F.relu(self.fc2(x)) 38 | x = self.fc3(x) 39 | prob = F.softmax(x, dim=-1) 40 | return prob 41 | 42 | 43 | class A2C: 44 | def __init__(self, args): 45 | self.args = args 46 | self.V = ValueNet(args.dim_state) 47 | self.V_target = ValueNet(args.dim_state) 48 | self.pi = PolicyNet(args.dim_state, args.num_action) 49 | self.V_target.load_state_dict(self.V.state_dict()) 50 | 51 | def get_action(self, state): 52 | probs = self.pi(state) 53 | m = Categorical(probs) 54 | action = m.sample() 55 | logp_action = m.log_prob(action) 56 | return action, logp_action 57 | 58 | def compute_value_loss(self, bs, blogp_a, br, bd, bns): 59 | # 目标价值。 60 | with torch.no_grad(): 61 | target_value = br + self.args.discount * torch.logical_not(bd) * self.V_target(bns).squeeze() 62 | 63 | # 计算value loss。 64 | value_loss = F.mse_loss(self.V(bs).squeeze(), target_value) 65 | return value_loss 66 | 67 | def compute_policy_loss(self, bs, blogp_a, br, bd, bns): 68 | # 建议对比08_a2c.py,比较二者的差异。 69 | with torch.no_grad(): 70 | value = self.V(bs).squeeze() 71 | 72 | policy_loss = 0 73 | for i, logp_a in enumerate(blogp_a): 74 | policy_loss += -logp_a * value[i] 75 | policy_loss = 
policy_loss.mean() 76 | return policy_loss 77 | 78 | def soft_update(self, tau=0.01): 79 | def soft_update_(target, source, tau_=0.01): 80 | for target_param, param in zip(target.parameters(), source.parameters()): 81 | target_param.data.copy_(target_param.data * (1.0 - tau_) + param.data * tau_) 82 | 83 | soft_update_(self.V_target, self.V, tau) 84 | 85 | 86 | class Rollout: 87 | def __init__(self): 88 | self.state_lst = [] 89 | self.action_lst = [] 90 | self.logp_action_lst = [] 91 | self.reward_lst = [] 92 | self.done_lst = [] 93 | self.next_state_lst = [] 94 | 95 | def put(self, state, action, logp_action, reward, done, next_state): 96 | self.state_lst.append(state) 97 | self.action_lst.append(action) 98 | self.logp_action_lst.append(logp_action) 99 | self.reward_lst.append(reward) 100 | self.done_lst.append(done) 101 | self.next_state_lst.append(next_state) 102 | 103 | def tensor(self): 104 | bs = torch.as_tensor(self.state_lst).float() 105 | ba = torch.as_tensor(self.action_lst).float() 106 | blogp_a = self.logp_action_lst 107 | br = torch.as_tensor(self.reward_lst).float() 108 | bd = torch.as_tensor(self.done_lst) 109 | bns = torch.as_tensor(self.next_state_lst).float() 110 | return bs, ba, blogp_a, br, bd, bns 111 | 112 | 113 | class INFO: 114 | def __init__(self): 115 | self.log = defaultdict(list) 116 | self.episode_length = 0 117 | self.episode_reward = 0 118 | self.max_episode_reward = -float("inf") 119 | 120 | def put(self, done, reward): 121 | if done is True: 122 | self.episode_length += 1 123 | self.episode_reward += reward 124 | self.log["episode_length"].append(self.episode_length) 125 | self.log["episode_reward"].append(self.episode_reward) 126 | 127 | if self.episode_reward > self.max_episode_reward: 128 | self.max_episode_reward = self.episode_reward 129 | 130 | self.episode_length = 0 131 | self.episode_reward = 0 132 | 133 | else: 134 | self.episode_length += 1 135 | self.episode_reward += reward 136 | 137 | 138 | def train(args, env, agent: A2C): 139 | V_optimizer = torch.optim.Adam(agent.V.parameters(), lr=3e-3) 140 | pi_optimizer = torch.optim.Adam(agent.pi.parameters(), lr=3e-3) 141 | info = INFO() 142 | 143 | rollout = Rollout() 144 | state, _ = env.reset() 145 | for step in range(args.max_steps): 146 | action, logp_action = agent.get_action(torch.tensor(state).float()) 147 | next_state, reward, terminated, truncated, _ = env.step(action.item()) 148 | done = terminated or truncated 149 | info.put(done, reward) 150 | 151 | rollout.put( 152 | state, 153 | action, 154 | logp_action, 155 | reward, 156 | done, 157 | next_state, 158 | ) 159 | state = next_state 160 | 161 | if done is True: 162 | # 模型训练。 163 | bs, ba, blogp_a, br, bd, bns = rollout.tensor() 164 | 165 | value_loss = agent.compute_value_loss(bs, blogp_a, br, bd, bns) 166 | V_optimizer.zero_grad() 167 | value_loss.backward(retain_graph=True) 168 | V_optimizer.step() 169 | 170 | policy_loss = agent.compute_policy_loss(bs, blogp_a, br, bd, bns) 171 | pi_optimizer.zero_grad() 172 | policy_loss.backward() 173 | pi_optimizer.step() 174 | 175 | agent.soft_update() 176 | 177 | # 打印信息。 178 | info.log["value_loss"].append(value_loss.item()) 179 | info.log["policy_loss"].append(policy_loss.item()) 180 | 181 | episode_reward = info.log["episode_reward"][-1] 182 | episode_length = info.log["episode_length"][-1] 183 | value_loss = info.log["value_loss"][-1] 184 | print(f"step={step}, reward={episode_reward:.0f}, length={episode_length}, max_reward={info.max_episode_reward}, value_loss={value_loss:.1e}") 185 | 186 | # 
重置环境。 187 | state, _ = env.reset() 188 | rollout = Rollout() 189 | 190 | # 保存模型。 191 | if episode_reward == info.max_episode_reward: 192 | save_path = os.path.join(args.output_dir, "model.bin") 193 | torch.save(agent.pi.state_dict(), save_path) 194 | 195 | if step % 10000 == 0: 196 | plt.plot(info.log["value_loss"], label="value loss") 197 | plt.legend() 198 | plt.savefig(f"{args.output_dir}/value_loss.png", bbox_inches="tight") 199 | plt.close() 200 | 201 | plt.plot(info.log["episode_reward"]) 202 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 203 | plt.close() 204 | 205 | 206 | def eval(args, env, agent): 207 | agent = A2C(args) 208 | model_path = os.path.join(args.output_dir, "model.bin") 209 | agent.pi.load_state_dict(torch.load(model_path)) 210 | 211 | episode_length = 0 212 | episode_reward = 0 213 | state, _ = env.reset() 214 | for i in range(5000): 215 | episode_length += 1 216 | action, _ = agent.get_action(torch.from_numpy(state)) 217 | next_state, reward, terminated, truncated, info = env.step(action.item()) 218 | done = terminated or truncated 219 | episode_reward += reward 220 | 221 | state = next_state 222 | if done is True: 223 | print(f"episode reward={episode_reward}, length={episode_length}") 224 | state, _ = env.reset() 225 | episode_length = 0 226 | episode_reward = 0 227 | 228 | 229 | if __name__ == "__main__": 230 | parser = argparse.ArgumentParser() 231 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 232 | parser.add_argument("--dim_state", default=4, type=int, help="Dimension of state.") 233 | parser.add_argument("--num_action", default=2, type=int, help="Number of action.") 234 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 235 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 236 | 237 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 238 | parser.add_argument("--discount", default=0.99, type=float, help="Discount coefficient.") 239 | parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.") 240 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 241 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 242 | 243 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 244 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 245 | args = parser.parse_args() 246 | 247 | env = gym.make(args.env) 248 | agent = A2C(args) 249 | 250 | if args.do_train: 251 | train(args, env, agent) 252 | 253 | if args.do_eval: 254 | eval(args, env, agent) 255 | -------------------------------------------------------------------------------- /07_reinforce.py: -------------------------------------------------------------------------------- 1 | """8.3节带基线的REINFORCE算法实现。""" 2 | import argparse 3 | import os 4 | from collections import defaultdict 5 | import gym 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.distributions import Categorical 12 | 13 | 14 | class ValueNet(nn.Module): 15 | def __init__(self, dim_state): 16 | super().__init__() 17 | self.fc1 = nn.Linear(dim_state, 64) 18 | self.fc2 = nn.Linear(64, 32) 19 | self.fc3 = nn.Linear(32, 1) 20 | 21 | def forward(self, state): 22 | x = F.relu(self.fc1(state)) 23 | x = F.relu(self.fc2(x)) 24 | x = 
self.fc3(x) 25 | return x 26 | 27 | 28 | class PolicyNet(nn.Module): 29 | def __init__(self, dim_state, num_action): 30 | super().__init__() 31 | self.fc1 = nn.Linear(dim_state, 64) 32 | self.fc2 = nn.Linear(64, 32) 33 | self.fc3 = nn.Linear(32, num_action) 34 | 35 | def forward(self, state): 36 | x = F.relu(self.fc1(state)) 37 | x = F.relu(self.fc2(x)) 38 | x = self.fc3(x) 39 | prob = F.softmax(x, dim=-1) 40 | return prob 41 | 42 | 43 | class REINFORCE_with_Baseline: 44 | def __init__(self, args): 45 | self.args = args 46 | self.V = ValueNet(args.dim_state) 47 | self.V_target = ValueNet(args.dim_state) 48 | self.pi = PolicyNet(args.dim_state, args.num_action) 49 | self.V_target.load_state_dict(self.V.state_dict()) 50 | 51 | def get_action(self, state): 52 | probs = self.pi(state) 53 | m = Categorical(probs) 54 | action = m.sample() 55 | logp_action = m.log_prob(action) 56 | return action, logp_action 57 | 58 | def compute_value_loss(self, bs, blogp_a, br, bd, bns): 59 | # 累积奖励。 60 | r_lst = [] 61 | R = 0 62 | for i in reversed(range(len(br))): 63 | R = self.args.discount * R + br[i] 64 | r_lst.append(R) 65 | r_lst.reverse() 66 | batch_r = torch.tensor(r_lst) 67 | 68 | # 计算value loss。 69 | value_loss = F.mse_loss(self.V(bs).squeeze(), batch_r) 70 | return value_loss 71 | 72 | def compute_policy_loss(self, bs, blogp_a, br, bd, bns): 73 | # 累积奖励。 74 | r_lst = [] 75 | R = 0 76 | for i in reversed(range(len(br))): 77 | R = self.args.discount * R + br[i] 78 | r_lst.append(R) 79 | r_lst.reverse() 80 | batch_r = torch.tensor(r_lst) 81 | 82 | policy_loss = 0 83 | for i, logp_a in enumerate(blogp_a): 84 | policy_loss += -logp_a * batch_r[i] 85 | policy_loss = policy_loss.mean() 86 | return policy_loss 87 | 88 | def soft_update(self, tau=0.01): 89 | def soft_update_(target, source, tau_=0.01): 90 | for target_param, param in zip(target.parameters(), source.parameters()): 91 | target_param.data.copy_(target_param.data * (1.0 - tau_) + param.data * tau_) 92 | 93 | soft_update_(self.V_target, self.V, tau) 94 | 95 | 96 | class Rollout: 97 | def __init__(self): 98 | self.state_lst = [] 99 | self.action_lst = [] 100 | self.logp_action_lst = [] 101 | self.reward_lst = [] 102 | self.done_lst = [] 103 | self.next_state_lst = [] 104 | 105 | def put(self, state, action, logp_action, reward, done, next_state): 106 | self.state_lst.append(state) 107 | self.action_lst.append(action) 108 | self.logp_action_lst.append(logp_action) 109 | self.reward_lst.append(reward) 110 | self.done_lst.append(done) 111 | self.next_state_lst.append(next_state) 112 | 113 | def tensor(self): 114 | bs = torch.as_tensor(self.state_lst).float() 115 | ba = torch.as_tensor(self.action_lst).float() 116 | blogp_a = self.logp_action_lst 117 | br = self.reward_lst 118 | bd = torch.as_tensor(self.done_lst) 119 | bns = torch.as_tensor(self.next_state_lst).float() 120 | return bs, ba, blogp_a, br, bd, bns 121 | 122 | 123 | class INFO: 124 | def __init__(self): 125 | self.log = defaultdict(list) 126 | self.episode_length = 0 127 | self.episode_reward = 0 128 | self.max_episode_reward = -float("inf") 129 | 130 | def put(self, done, reward): 131 | if done is True: 132 | self.episode_length += 1 133 | self.episode_reward += reward 134 | self.log["episode_length"].append(self.episode_length) 135 | self.log["episode_reward"].append(self.episode_reward) 136 | 137 | if self.episode_reward > self.max_episode_reward: 138 | self.max_episode_reward = self.episode_reward 139 | 140 | self.episode_length = 0 141 | self.episode_reward = 0 142 | 143 | else: 144 
| self.episode_length += 1 145 | self.episode_reward += reward 146 | 147 | 148 | def train(args, env, agent: REINFORCE_with_Baseline): 149 | V_optimizer = torch.optim.Adam(agent.V.parameters(), lr=args.lr) 150 | pi_optimizer = torch.optim.Adam(agent.pi.parameters(), lr=args.lr) 151 | info = INFO() 152 | 153 | rollout = Rollout() 154 | state, _ = env.reset() 155 | for step in range(args.max_steps): 156 | action, logp_action = agent.get_action(torch.tensor(state).float()) 157 | next_state, reward, terminated, truncated, _ = env.step(action.item()) 158 | done = terminated or truncated 159 | info.put(done, reward) 160 | 161 | rollout.put( 162 | state, 163 | action, 164 | logp_action, 165 | reward, 166 | done, 167 | next_state, 168 | ) 169 | state = next_state 170 | 171 | if done is True: 172 | # 模型训练。 173 | bs, ba, blogp_a, br, bd, bns = rollout.tensor() 174 | 175 | value_loss = agent.compute_value_loss(bs, blogp_a, br, bd, bns) 176 | V_optimizer.zero_grad() 177 | value_loss.backward(retain_graph=True) 178 | V_optimizer.step() 179 | 180 | policy_loss = agent.compute_policy_loss(bs, blogp_a, br, bd, bns) 181 | pi_optimizer.zero_grad() 182 | policy_loss.backward() 183 | pi_optimizer.step() 184 | 185 | agent.soft_update() 186 | 187 | # 打印信息。 188 | info.log["value_loss"].append(value_loss.item()) 189 | info.log["policy_loss"].append(policy_loss.item()) 190 | 191 | episode_reward = info.log["episode_reward"][-1] 192 | episode_length = info.log["episode_length"][-1] 193 | value_loss = info.log["value_loss"][-1] 194 | print(f"step={step}, reward={episode_reward:.0f}, length={episode_length}, max_reward={info.max_episode_reward}, value_loss={value_loss:.1e}") 195 | 196 | # 重置环境。 197 | state, _ = env.reset() 198 | rollout = Rollout() 199 | 200 | # 保存模型。 201 | if episode_reward == info.max_episode_reward: 202 | save_path = os.path.join(args.output_dir, "model.bin") 203 | torch.save(agent.pi.state_dict(), save_path) 204 | 205 | if step % 10000 == 0: 206 | plt.plot(info.log["value_loss"], label="value loss") 207 | plt.legend() 208 | plt.savefig(f"{args.output_dir}/value_loss.png", bbox_inches="tight") 209 | plt.close() 210 | 211 | plt.plot(info.log["episode_reward"]) 212 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 213 | plt.close() 214 | 215 | 216 | def eval(args, env, agent): 217 | agent = REINFORCE_with_Baseline(args) 218 | model_path = os.path.join(args.output_dir, "model.bin") 219 | agent.pi.load_state_dict(torch.load(model_path)) 220 | 221 | episode_length = 0 222 | episode_reward = 0 223 | state, _ = env.reset() 224 | for i in range(5000): 225 | episode_length += 1 226 | action, _ = agent.get_action(torch.from_numpy(state)) 227 | next_state, reward, terminated, truncated, info = env.step(action.item()) 228 | done = terminated or truncated 229 | episode_reward += reward 230 | 231 | state = next_state 232 | if done is True: 233 | print(f"episode reward={episode_reward}, episode length={episode_length}") 234 | state, _ = env.reset() 235 | episode_length = 0 236 | episode_reward = 0 237 | 238 | 239 | if __name__ == "__main__": 240 | parser = argparse.ArgumentParser() 241 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 242 | parser.add_argument("--dim_state", default=4, type=int, help="Dimension of state.") 243 | parser.add_argument("--num_action", default=2, type=int, help="Number of action.") 244 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 245 | parser.add_argument("--seed", 
default=42, type=int, help="Random seed.") 246 | 247 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 248 | parser.add_argument("--discount", default=0.99, type=float, help="Discount coefficient.") 249 | parser.add_argument("--lr", default=3e-3, type=float, help="Learning rate.") 250 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 251 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 252 | 253 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 254 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 255 | args = parser.parse_args() 256 | 257 | env = gym.make(args.env) 258 | agent = REINFORCE_with_Baseline(args) 259 | 260 | if args.do_train: 261 | train(args, env, agent) 262 | 263 | if args.do_eval: 264 | eval(args, env, agent) 265 | -------------------------------------------------------------------------------- /08_a2c.py: -------------------------------------------------------------------------------- 1 | """8.3节A2C算法实现。""" 2 | import argparse 3 | import os 4 | from collections import defaultdict 5 | import gym 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.distributions import Categorical 12 | 13 | 14 | class ValueNet(nn.Module): 15 | def __init__(self, dim_state): 16 | super().__init__() 17 | self.fc1 = nn.Linear(dim_state, 64) 18 | self.fc2 = nn.Linear(64, 32) 19 | self.fc3 = nn.Linear(32, 1) 20 | 21 | def forward(self, state): 22 | x = F.relu(self.fc1(state)) 23 | x = F.relu(self.fc2(x)) 24 | x = self.fc3(x) 25 | return x 26 | 27 | 28 | class PolicyNet(nn.Module): 29 | def __init__(self, dim_state, num_action): 30 | super().__init__() 31 | self.fc1 = nn.Linear(dim_state, 64) 32 | self.fc2 = nn.Linear(64, 32) 33 | self.fc3 = nn.Linear(32, num_action) 34 | 35 | def forward(self, state): 36 | x = F.relu(self.fc1(state)) 37 | x = F.relu(self.fc2(x)) 38 | x = self.fc3(x) 39 | prob = F.softmax(x, dim=-1) 40 | return prob 41 | 42 | 43 | class A2C: 44 | def __init__(self, args): 45 | self.args = args 46 | self.V = ValueNet(args.dim_state) 47 | self.V_target = ValueNet(args.dim_state) 48 | self.pi = PolicyNet(args.dim_state, args.num_action) 49 | self.V_target.load_state_dict(self.V.state_dict()) 50 | 51 | def get_action(self, state): 52 | probs = self.pi(state) 53 | m = Categorical(probs) 54 | action = m.sample() 55 | logp_action = m.log_prob(action) 56 | return action, logp_action 57 | 58 | def compute_value_loss(self, bs, blogp_a, br, bd, bns): 59 | # 目标价值。 60 | with torch.no_grad(): 61 | target_value = br + self.args.discount * torch.logical_not(bd) * self.V_target(bns).squeeze() 62 | 63 | # 计算value loss。 64 | value_loss = F.mse_loss(self.V(bs).squeeze(), target_value) 65 | return value_loss 66 | 67 | def compute_policy_loss(self, bs, blogp_a, br, bd, bns): 68 | # 目标价值。 69 | with torch.no_grad(): 70 | target_value = br + self.args.discount * torch.logical_not(bd) * self.V_target(bns).squeeze() 71 | 72 | # 计算policy loss。 73 | with torch.no_grad(): 74 | advantage = target_value - self.V(bs).squeeze() 75 | policy_loss = 0 76 | for i, logp_a in enumerate(blogp_a): 77 | policy_loss += -logp_a * advantage[i] 78 | policy_loss = policy_loss.mean() 79 | return policy_loss 80 | 81 | def soft_update(self, tau=0.01): 82 | def soft_update_(target, source, tau_=0.01): 83 | for target_param, param in 
zip(target.parameters(), source.parameters()): 84 | target_param.data.copy_(target_param.data * (1.0 - tau_) + param.data * tau_) 85 | 86 | soft_update_(self.V_target, self.V, tau) 87 | 88 | 89 | class Rollout: 90 | def __init__(self): 91 | self.state_lst = [] 92 | self.action_lst = [] 93 | self.logp_action_lst = [] 94 | self.reward_lst = [] 95 | self.done_lst = [] 96 | self.next_state_lst = [] 97 | 98 | def put(self, state, action, logp_action, reward, done, next_state): 99 | self.state_lst.append(state) 100 | self.action_lst.append(action) 101 | self.logp_action_lst.append(logp_action) 102 | self.reward_lst.append(reward) 103 | self.done_lst.append(done) 104 | self.next_state_lst.append(next_state) 105 | 106 | def tensor(self): 107 | bs = torch.as_tensor(self.state_lst).float() 108 | ba = torch.as_tensor(self.action_lst).float() 109 | blogp_a = self.logp_action_lst 110 | br = torch.as_tensor(self.reward_lst).float() 111 | bd = torch.as_tensor(self.done_lst) 112 | bns = torch.as_tensor(self.next_state_lst).float() 113 | return bs, ba, blogp_a, br, bd, bns 114 | 115 | 116 | class INFO: 117 | def __init__(self): 118 | self.log = defaultdict(list) 119 | self.episode_length = 0 120 | self.episode_reward = 0 121 | self.max_episode_reward = -float("inf") 122 | 123 | def put(self, done, reward): 124 | if done is True: 125 | self.episode_length += 1 126 | self.episode_reward += reward 127 | self.log["episode_length"].append(self.episode_length) 128 | self.log["episode_reward"].append(self.episode_reward) 129 | 130 | if self.episode_reward > self.max_episode_reward: 131 | self.max_episode_reward = self.episode_reward 132 | 133 | self.episode_length = 0 134 | self.episode_reward = 0 135 | 136 | else: 137 | self.episode_length += 1 138 | self.episode_reward += reward 139 | 140 | 141 | def train(args, env, agent: A2C): 142 | V_optimizer = torch.optim.Adam(agent.V.parameters(), lr=3e-3) 143 | pi_optimizer = torch.optim.Adam(agent.pi.parameters(), lr=3e-3) 144 | info = INFO() 145 | 146 | rollout = Rollout() 147 | state, _ = env.reset() 148 | for step in range(args.max_steps): 149 | action, logp_action = agent.get_action(torch.tensor(state).float()) 150 | next_state, reward, terminated, truncated, _ = env.step(action.item()) 151 | done = terminated or truncated 152 | info.put(done, reward) 153 | 154 | rollout.put( 155 | state, 156 | action, 157 | logp_action, 158 | reward, 159 | done, 160 | next_state, 161 | ) 162 | state = next_state 163 | 164 | if done is True: 165 | # 模型训练。 166 | bs, ba, blogp_a, br, bd, bns = rollout.tensor() 167 | 168 | value_loss = agent.compute_value_loss(bs, blogp_a, br, bd, bns) 169 | V_optimizer.zero_grad() 170 | value_loss.backward(retain_graph=True) 171 | V_optimizer.step() 172 | 173 | policy_loss = agent.compute_policy_loss(bs, blogp_a, br, bd, bns) 174 | pi_optimizer.zero_grad() 175 | policy_loss.backward() 176 | pi_optimizer.step() 177 | 178 | agent.soft_update() 179 | 180 | # 打印信息。 181 | info.log["value_loss"].append(value_loss.item()) 182 | info.log["policy_loss"].append(policy_loss.item()) 183 | 184 | episode_reward = info.log["episode_reward"][-1] 185 | episode_length = info.log["episode_length"][-1] 186 | value_loss = info.log["value_loss"][-1] 187 | print(f"step={step}, reward={episode_reward:.0f}, length={episode_length}, max_reward={info.max_episode_reward}, value_loss={value_loss:.1e}") 188 | 189 | # 重置环境。 190 | state, _ = env.reset() 191 | rollout = Rollout() 192 | 193 | # 保存模型。 194 | if episode_reward == info.max_episode_reward: 195 | save_path = 
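# Illustrative sketch (throwaway nn.Linear modules, not from this file): the Polyak
# ("soft") target update performed by soft_update() above,
# target <- (1 - tau) * target + tau * source, applied parameter by parameter.
import torch
import torch.nn as nn

source = nn.Linear(4, 2)
target = nn.Linear(4, 2)
target.load_state_dict(source.state_dict())   # start identical, as in A2C.__init__

tau = 0.01
with torch.no_grad():
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)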
os.path.join(args.output_dir, "model.bin") 196 | torch.save(agent.pi.state_dict(), save_path) 197 | 198 | if step % 10000 == 0: 199 | plt.plot(info.log["value_loss"], label="value loss") 200 | plt.legend() 201 | plt.savefig(f"{args.output_dir}/value_loss.png", bbox_inches="tight") 202 | plt.close() 203 | 204 | plt.plot(info.log["episode_reward"]) 205 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 206 | plt.close() 207 | 208 | 209 | def eval(args, env, agent): 210 | agent = A2C(args) 211 | model_path = os.path.join(args.output_dir, "model.bin") 212 | agent.pi.load_state_dict(torch.load(model_path)) 213 | 214 | episode_length = 0 215 | episode_reward = 0 216 | state, _ = env.reset() 217 | for i in range(5000): 218 | episode_length += 1 219 | action, _ = agent.get_action(torch.from_numpy(state)) 220 | next_state, reward, terminated, truncated, info = env.step(action.item()) 221 | done = terminated or truncated 222 | episode_reward += reward 223 | 224 | state = next_state 225 | if done is True: 226 | print(f"episode reward={episode_reward}, length={episode_length}") 227 | state, _ = env.reset() 228 | episode_length = 0 229 | episode_reward = 0 230 | 231 | 232 | if __name__ == "__main__": 233 | parser = argparse.ArgumentParser() 234 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 235 | parser.add_argument("--dim_state", default=4, type=int, help="Dimension of state.") 236 | parser.add_argument("--num_action", default=2, type=int, help="Number of action.") 237 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 238 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 239 | 240 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 241 | parser.add_argument("--discount", default=0.99, type=float, help="Discount coefficient.") 242 | parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.") 243 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 244 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 245 | 246 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 247 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 248 | args = parser.parse_args() 249 | 250 | env = gym.make(args.env) 251 | agent = A2C(args) 252 | 253 | if args.do_train: 254 | train(args, env, agent) 255 | 256 | if args.do_eval: 257 | eval(args, env, agent) 258 | -------------------------------------------------------------------------------- /08_reinforce_with_baseline.py: -------------------------------------------------------------------------------- 1 | """8.3节带基线的REINFORCE算法实现。""" 2 | import argparse 3 | import os 4 | from collections import defaultdict 5 | import gym 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.distributions import Categorical 12 | 13 | 14 | class ValueNet(nn.Module): 15 | def __init__(self, dim_state): 16 | super().__init__() 17 | self.fc1 = nn.Linear(dim_state, 64) 18 | self.fc2 = nn.Linear(64, 32) 19 | self.fc3 = nn.Linear(32, 1) 20 | 21 | def forward(self, state): 22 | x = F.relu(self.fc1(state)) 23 | x = F.relu(self.fc2(x)) 24 | x = self.fc3(x) 25 | return x 26 | 27 | 28 | class PolicyNet(nn.Module): 29 | def __init__(self, dim_state, num_action): 30 | super().__init__() 31 
| self.fc1 = nn.Linear(dim_state, 64) 32 | self.fc2 = nn.Linear(64, 32) 33 | self.fc3 = nn.Linear(32, num_action) 34 | 35 | def forward(self, state): 36 | x = F.relu(self.fc1(state)) 37 | x = F.relu(self.fc2(x)) 38 | x = self.fc3(x) 39 | prob = F.softmax(x, dim=-1) 40 | return prob 41 | 42 | 43 | class REINFORCE_with_Baseline: 44 | def __init__(self, args): 45 | self.args = args 46 | self.V = ValueNet(args.dim_state) 47 | self.V_target = ValueNet(args.dim_state) 48 | self.pi = PolicyNet(args.dim_state, args.num_action) 49 | self.V_target.load_state_dict(self.V.state_dict()) 50 | 51 | def get_action(self, state): 52 | probs = self.pi(state) 53 | m = Categorical(probs) 54 | action = m.sample() 55 | logp_action = m.log_prob(action) 56 | return action, logp_action 57 | 58 | def compute_value_loss(self, bs, blogp_a, br, bd, bns): 59 | # 累积奖励。 60 | r_lst = [] 61 | R = 0 62 | for i in reversed(range(len(br))): 63 | R = self.args.discount * R + br[i] 64 | r_lst.append(R) 65 | r_lst.reverse() 66 | batch_r = torch.tensor(r_lst) 67 | 68 | # 计算value loss。 69 | value_loss = F.mse_loss(self.V(bs).squeeze(), batch_r) 70 | return value_loss 71 | 72 | def compute_policy_loss(self, bs, blogp_a, br, bd, bns): 73 | # 累积奖励。 74 | r_lst = [] 75 | R = 0 76 | for i in reversed(range(len(br))): 77 | R = self.args.discount * R + br[i] 78 | r_lst.append(R) 79 | r_lst.reverse() 80 | batch_r = torch.tensor(r_lst) 81 | 82 | # 计算policy loss。 83 | with torch.no_grad(): 84 | advantage = self.V(bs).squeeze() - batch_r 85 | 86 | policy_loss = 0 87 | for i, logp_a in enumerate(blogp_a): 88 | policy_loss += logp_a * advantage[i] 89 | policy_loss = policy_loss.mean() 90 | return policy_loss 91 | 92 | def soft_update(self, tau=0.01): 93 | def soft_update_(target, source, tau_=0.01): 94 | for target_param, param in zip(target.parameters(), source.parameters()): 95 | target_param.data.copy_(target_param.data * (1.0 - tau_) + param.data * tau_) 96 | 97 | soft_update_(self.V_target, self.V, tau) 98 | 99 | 100 | class Rollout: 101 | def __init__(self): 102 | self.state_lst = [] 103 | self.action_lst = [] 104 | self.logp_action_lst = [] 105 | self.reward_lst = [] 106 | self.done_lst = [] 107 | self.next_state_lst = [] 108 | 109 | def put(self, state, action, logp_action, reward, done, next_state): 110 | self.state_lst.append(state) 111 | self.action_lst.append(action) 112 | self.logp_action_lst.append(logp_action) 113 | self.reward_lst.append(reward) 114 | self.done_lst.append(done) 115 | self.next_state_lst.append(next_state) 116 | 117 | def tensor(self): 118 | bs = torch.as_tensor(self.state_lst).float() 119 | ba = torch.as_tensor(self.action_lst).float() 120 | blogp_a = self.logp_action_lst 121 | br = self.reward_lst 122 | bd = torch.as_tensor(self.done_lst) 123 | bns = torch.as_tensor(self.next_state_lst).float() 124 | return bs, ba, blogp_a, br, bd, bns 125 | 126 | 127 | class INFO: 128 | def __init__(self): 129 | self.log = defaultdict(list) 130 | self.episode_length = 0 131 | self.episode_reward = 0 132 | self.max_episode_reward = -float("inf") 133 | 134 | def put(self, done, reward): 135 | if done is True: 136 | self.episode_length += 1 137 | self.episode_reward += reward 138 | self.log["episode_length"].append(self.episode_length) 139 | self.log["episode_reward"].append(self.episode_reward) 140 | 141 | if self.episode_reward > self.max_episode_reward: 142 | self.max_episode_reward = self.episode_reward 143 | 144 | self.episode_length = 0 145 | self.episode_reward = 0 146 | 147 | else: 148 | self.episode_length += 1 149 | 
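# Illustrative sketch (dummy rewards, not from this file): the back-to-front
# discounted return used by compute_value_loss / compute_policy_loss above,
# G_t = r_t + discount * G_{t+1}, accumulated in reverse and then re-reversed.
import torch

discount = 0.99
rewards = [1.0, 1.0, 1.0, 1.0]

returns, G = [], 0.0
for r in reversed(rewards):
    G = r + discount * G
    returns.append(G)
returns.reverse()
batch_r = torch.tensor(returns)   # batch_r[0] is the full discounted return of the episode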
self.episode_reward += reward 150 | 151 | 152 | def train(args, env, agent: REINFORCE_with_Baseline): 153 | V_optimizer = torch.optim.Adam(agent.V.parameters(), lr=args.lr) 154 | pi_optimizer = torch.optim.Adam(agent.pi.parameters(), lr=args.lr) 155 | info = INFO() 156 | 157 | rollout = Rollout() 158 | state, _ = env.reset() 159 | for step in range(args.max_steps): 160 | action, logp_action = agent.get_action(torch.tensor(state).float()) 161 | next_state, reward, terminated, truncated, _ = env.step(action.item()) 162 | done = terminated or truncated 163 | info.put(done, reward) 164 | 165 | rollout.put( 166 | state, 167 | action, 168 | logp_action, 169 | reward, 170 | done, 171 | next_state, 172 | ) 173 | state = next_state 174 | 175 | if done is True: 176 | # 模型训练。 177 | bs, ba, blogp_a, br, bd, bns = rollout.tensor() 178 | 179 | value_loss = agent.compute_value_loss(bs, blogp_a, br, bd, bns) 180 | V_optimizer.zero_grad() 181 | value_loss.backward(retain_graph=True) 182 | V_optimizer.step() 183 | 184 | policy_loss = agent.compute_policy_loss(bs, blogp_a, br, bd, bns) 185 | pi_optimizer.zero_grad() 186 | policy_loss.backward() 187 | pi_optimizer.step() 188 | 189 | agent.soft_update() 190 | 191 | # 打印信息。 192 | info.log["value_loss"].append(value_loss.item()) 193 | info.log["policy_loss"].append(policy_loss.item()) 194 | 195 | episode_reward = info.log["episode_reward"][-1] 196 | episode_length = info.log["episode_length"][-1] 197 | value_loss = info.log["value_loss"][-1] 198 | print(f"step={step}, reward={episode_reward:.0f}, length={episode_length}, max_reward={info.max_episode_reward}, value_loss={value_loss:.1e}") 199 | 200 | # 重置环境。 201 | state, _ = env.reset() 202 | rollout = Rollout() 203 | 204 | # 保存模型。 205 | if episode_reward == info.max_episode_reward: 206 | save_path = os.path.join(args.output_dir, "model.bin") 207 | torch.save(agent.pi.state_dict(), save_path) 208 | 209 | if step % 10000 == 0: 210 | plt.plot(info.log["value_loss"], label="value loss") 211 | plt.legend() 212 | plt.savefig(f"{args.output_dir}/value_loss.png", bbox_inches="tight") 213 | plt.close() 214 | 215 | plt.plot(info.log["episode_reward"]) 216 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 217 | plt.close() 218 | 219 | 220 | def eval(args, env, agent): 221 | agent = REINFORCE_with_Baseline(args) 222 | model_path = os.path.join(args.output_dir, "model.bin") 223 | agent.pi.load_state_dict(torch.load(model_path)) 224 | 225 | episode_length = 0 226 | episode_reward = 0 227 | state, _ = env.reset() 228 | for i in range(5000): 229 | episode_length += 1 230 | action, _ = agent.get_action(torch.from_numpy(state)) 231 | next_state, reward, terminated, truncated, info = env.step(action.item()) 232 | done = terminated or truncated 233 | episode_reward += reward 234 | 235 | state = next_state 236 | if done is True: 237 | print(f"episode reward={episode_reward}, episode length={episode_length}") 238 | state, _ = env.reset() 239 | episode_length = 0 240 | episode_reward = 0 241 | 242 | 243 | if __name__ == "__main__": 244 | parser = argparse.ArgumentParser() 245 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 246 | parser.add_argument("--dim_state", default=4, type=int, help="Dimension of state.") 247 | parser.add_argument("--num_action", default=2, type=int, help="Number of action.") 248 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 249 | parser.add_argument("--seed", default=42, type=int, help="Random 
seed.") 250 | 251 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 252 | parser.add_argument("--discount", default=0.99, type=float, help="Discount coefficient.") 253 | parser.add_argument("--lr", default=3e-3, type=float, help="Learning rate.") 254 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 255 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 256 | 257 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 258 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 259 | args = parser.parse_args() 260 | 261 | env = gym.make(args.env) 262 | agent = REINFORCE_with_Baseline(args) 263 | 264 | if args.do_train: 265 | train(args, env, agent) 266 | 267 | if args.do_eval: 268 | eval(args, env, agent) 269 | -------------------------------------------------------------------------------- /09_trpo.py: -------------------------------------------------------------------------------- 1 | """参考https://github.com/ajlangley/trpo-pytorch。 2 | """ 3 | import argparse 4 | import os 5 | import random 6 | from dataclasses import dataclass, field 7 | import gym 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.distributions.categorical import Categorical 14 | from torch.autograd import grad 15 | 16 | 17 | class CategoricalLayer(nn.Module): 18 | """ 19 | Implements a layer that outputs a multinomial distribution 20 | Methods 21 | ------ 22 | __call__(log_action_probs) 23 | Takes as input log probabilities and outputs a pytorch multinomail 24 | distribution 25 | """ 26 | 27 | def __init__(self): 28 | super().__init__() 29 | 30 | def __call__(self, log_action_probs): 31 | return Categorical(logits=log_action_probs) 32 | 33 | 34 | class PolicyNet(nn.Module): 35 | def __init__(self, dim_obs, num_act): 36 | super().__init__() 37 | self.fc1 = nn.Linear(dim_obs, 64) 38 | self.fc2 = nn.Linear(64, 32) 39 | self.fc3 = nn.Linear(32, num_act) 40 | self.log_softmax = nn.LogSoftmax(dim=-1) 41 | self.categorical = CategoricalLayer() 42 | 43 | def forward(self, obs): 44 | x = F.relu(self.fc1(obs)) 45 | x = F.relu(self.fc2(x)) 46 | x = self.fc3(x) # logits 47 | x = self.log_softmax(x) 48 | x = self.categorical(x) 49 | return x 50 | 51 | 52 | class ValueNet(nn.Module): 53 | """QNet. 
54 | Input: feature 55 | Output: num_act of values 56 | """ 57 | 58 | def __init__(self, dim_obs): 59 | super().__init__() 60 | self.fc1 = nn.Linear(dim_obs, 64) 61 | self.fc2 = nn.Linear(64, 32) 62 | self.fc3 = nn.Linear(32, 1) 63 | 64 | def forward(self, obs): 65 | x = F.relu(self.fc1(obs)) 66 | x = F.relu(self.fc2(x)) 67 | x = self.fc3(x) 68 | return x 69 | 70 | 71 | class TRPO: 72 | def __init__(self, args): 73 | self.discount = args.discount 74 | self.policy_net = PolicyNet(args.dim_obs, args.num_act) 75 | self.value_net = ValueNet(args.dim_obs) 76 | self.value_optimizer = torch.optim.AdamW(self.value_net.parameters(), lr=args.lr_value_net) 77 | 78 | self.max_kl_div = 0.01 79 | self.cg_max_iters = 10 80 | self.line_search_accept_ratio = 0.1 81 | 82 | def get_action(self, obs): 83 | action_dist = self.policy_net(obs) 84 | act = action_dist.sample() 85 | return act 86 | 87 | def surrogate_loss(self, log_action_probs, imp_sample_probs, advantages): 88 | return torch.mean(torch.exp(log_action_probs - imp_sample_probs) * advantages) 89 | 90 | def get_max_step_len(self, search_dir, Hvp_fun, max_step, retain_graph=False): 91 | num = 2 * max_step 92 | denom = torch.matmul(search_dir, Hvp_fun(search_dir, retain_graph)) 93 | max_step_len = torch.sqrt(num / denom) 94 | return max_step_len 95 | 96 | def update_policy_net(self, s_batch, a_batch, r_batch, d_batch, next_s_batch): 97 | cumsum_rewards = [0] # 加上0,方便计算。 98 | for i in reversed(range(len(r_batch))): 99 | cumsum_current = cumsum_rewards[-1] * self.discount * (1 - d_batch[i]) + r_batch[i] 100 | cumsum_rewards.append(cumsum_current) 101 | cumsum_rewards.pop(0) 102 | cumsum_rewards = list(reversed(cumsum_rewards)) 103 | cumsum_rewards = torch.tensor(cumsum_rewards, dtype=torch.float32) 104 | 105 | action_dists = self.policy_net(s_batch) 106 | log_action_probs = action_dists.log_prob(a_batch) 107 | 108 | loss = self.surrogate_loss(log_action_probs, log_action_probs.detach(), cumsum_rewards) 109 | loss_grad = flat_grad(loss, self.policy_net.parameters(), retain_graph=True) 110 | 111 | mean_kl = mean_kl_first_fixed(action_dists, action_dists) 112 | 113 | Fvp_fun = get_Hvp_fun(mean_kl, self.policy_net.parameters()) 114 | search_dir = cg_solver(Fvp_fun, loss_grad, self.cg_max_iters) 115 | 116 | expected_improvement = torch.matmul(loss_grad, search_dir) 117 | 118 | def constraints_satisfied(step, beta): 119 | apply_update(self.policy_net, step) 120 | 121 | with torch.no_grad(): 122 | new_action_dists = self.policy_net(s_batch) 123 | new_log_action_probs = new_action_dists.log_prob(a_batch) 124 | 125 | new_loss = self.surrogate_loss(new_log_action_probs, log_action_probs, cumsum_rewards) 126 | 127 | mean_kl = mean_kl_first_fixed(action_dists, new_action_dists) 128 | 129 | actual_improvement = new_loss - loss 130 | improvement_ratio = actual_improvement / (expected_improvement * beta) 131 | 132 | apply_update(self.policy_net, -step) 133 | 134 | surrogate_cond = improvement_ratio >= self.line_search_accept_ratio and actual_improvement > 0.0 135 | kl_cond = mean_kl <= self.max_kl_div 136 | 137 | # print(f"kl contidion = {kl_cond}, mean_kl = {mean_kl}") 138 | 139 | return surrogate_cond and kl_cond 140 | 141 | max_step_len = self.get_max_step_len(search_dir, Fvp_fun, self.max_kl_div, retain_graph=True) 142 | step_len = line_search(search_dir, max_step_len, constraints_satisfied) 143 | 144 | opt_step = step_len * search_dir 145 | apply_update(self.policy_net, opt_step) 146 | 147 | def update_value_net(self, args, states, r_batch, d_batch): 148 | 
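# Illustrative sketch (dummy numbers, not from this file): the importance-sampled
# surrogate objective that surrogate_loss() above computes,
# L = mean( exp(log pi_new(a|s) - log pi_old(a|s)) * A ).
# When the new and old policies coincide, the ratio is 1 and L is the mean advantage.
import torch

logp_new = torch.tensor([-0.5, -1.2, -0.7])
logp_old = torch.tensor([-0.6, -1.2, -0.9])
advantages = torch.tensor([1.0, -0.5, 2.0])

ratio = torch.exp(logp_new - logp_old)      # pi_new / pi_old
surrogate = torch.mean(ratio * advantages)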
cumsum_rewards = [0] # 加上0,方便计算。 149 | for i in reversed(range(len(r_batch))): 150 | cumsum_current = cumsum_rewards[-1] * self.discount * (1 - d_batch[i]) + r_batch[i] 151 | cumsum_rewards.append(cumsum_current) 152 | cumsum_rewards.pop(0) 153 | cumsum_rewards = list(reversed(cumsum_rewards)) 154 | cumsum_rewards = torch.tensor(cumsum_rewards, dtype=torch.float32) 155 | 156 | for i in range(args.num_update_value): 157 | 158 | def mse(): 159 | self.value_optimizer.zero_grad() 160 | state_values = self.value_net(states).view(-1) 161 | loss = F.mse_loss(state_values, cumsum_rewards) 162 | loss.backward(retain_graph=True) 163 | return loss 164 | 165 | self.value_optimizer.step(mse) 166 | 167 | 168 | def flat_grad(functional_output, inputs, retain_graph=False, create_graph=False): 169 | """ 170 | Return a flattened view of the gradients of functional_output w.r.t. inputs 171 | Parameters 172 | ---------- 173 | functional_output : torch.FloatTensor 174 | The output of the function for which the gradient is to be calculated 175 | inputs : torch.FloatTensor (with requires_grad=True) 176 | the variables w.r.t. which the gradient will be computed 177 | retain_graph : bool 178 | whether to keep the computational graph in memory after computing the 179 | gradient (not required if create_graph is True) 180 | create_graph : bool 181 | whether to create a computational graph of the gradient computation 182 | itself 183 | Return 184 | ------ 185 | flat_grads : torch.FloatTensor 186 | a flattened view of the gradients of functional_output w.r.t. inputs 187 | """ 188 | 189 | if create_graph == True: 190 | retain_graph = True 191 | 192 | grads = grad(functional_output, inputs, retain_graph=retain_graph, create_graph=create_graph) 193 | flat_grads = torch.cat([v.view(-1) for v in grads]) 194 | return flat_grads 195 | 196 | 197 | def detach_dist(dist): 198 | detached_dist = Categorical(logits=dist.logits.detach()) 199 | return detached_dist 200 | 201 | 202 | def mean_kl_first_fixed(dist_1, dist_2): 203 | """ 204 | Calculate the kl-divergence between dist_1 and dist_2 after detaching dist_1 205 | from the computational graph 206 | Parameters 207 | ---------- 208 | dist_1 : torch.distributions.distribution.Distribution 209 | the first argument to the kl-divergence function (will be fixed) 210 | dist_2 : torch.distributions.distribution.Distribution 211 | the second argument to the kl-divergence function (will not be fixed) 212 | Returns 213 | ------- 214 | mean_kl : torch.float 215 | the kl-divergence between dist_1 and dist_2 216 | """ 217 | dist_1_detached = detach_dist(dist_1) 218 | mean_kl = torch.mean(torch.distributions.kl.kl_divergence(dist_1_detached, dist_2)) 219 | return mean_kl 220 | 221 | 222 | def get_Hvp_fun(functional_output, inputs, damping_coef=0.0): 223 | """ 224 | Returns a function that calculates a Hessian-vector product with the Hessian 225 | of functional_output w.r.t. inputs 226 | Parameters 227 | ---------- 228 | functional_output : torch.FloatTensor (with requires_grad=True) 229 | the output of the function of which the Hessian is calculated 230 | inputs : torch.FloatTensor 231 | the inputs w.r.t. 
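# Illustrative sketch (small quadratic, not from this file): the double-backward
# Hessian-vector product that get_Hvp_fun() relies on, Hv = d/dx((df/dx)^T v),
# checked against the analytic Hessian of f(x) = 0.5 * x^T A x (which is A).
import torch

A = torch.tensor([[2.0, 0.5], [0.5, 1.0]])   # symmetric positive definite
x = torch.tensor([1.0, -1.0], requires_grad=True)
v = torch.tensor([0.3, 0.7])

f = 0.5 * x @ A @ x
(grad_f,) = torch.autograd.grad(f, x, create_graph=True)   # df/dx = A x
(hvp,) = torch.autograd.grad(grad_f @ v, x)                 # H v
# hvp matches A @ v up to floating point error.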
which the Hessian is calculated 232 | damping_coef : float 233 | the multiple of the identity matrix to be added to the Hessian 234 | """ 235 | 236 | inputs = list(inputs) 237 | grad_f = flat_grad(functional_output, inputs, create_graph=True) 238 | 239 | def Hvp_fun(v, retain_graph=True): 240 | gvp = torch.matmul(grad_f, v) 241 | Hvp = flat_grad(gvp, inputs, retain_graph=retain_graph) 242 | Hvp += damping_coef * v 243 | return Hvp 244 | 245 | return Hvp_fun 246 | 247 | 248 | def cg_solver(Avp_fun, b, max_iter=10): 249 | """ 250 | Finds an approximate solution to a set of linear equations Ax = b 251 | Parameters 252 | ---------- 253 | Avp_fun : callable 254 | a function that right multiplies a matrix A by a vector 255 | b : torch.FloatTensor 256 | the right hand term in the set of linear equations Ax = b 257 | max_iter : int 258 | the maximum number of iterations (default is 10) 259 | Returns 260 | ------- 261 | x : torch.FloatTensor 262 | the approximate solution to the system of equations defined by Avp_fun 263 | and b 264 | """ 265 | 266 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 267 | x = torch.zeros_like(b).to(device) 268 | r = b.clone() 269 | p = b.clone() 270 | 271 | for i in range(max_iter): 272 | Avp = Avp_fun(p, retain_graph=True) 273 | 274 | alpha = torch.matmul(r, r) / torch.matmul(p, Avp) 275 | x += alpha * p 276 | 277 | if i == max_iter - 1: 278 | return x 279 | 280 | r_new = r - alpha * Avp 281 | beta = torch.matmul(r_new, r_new) / torch.matmul(r, r) 282 | r = r_new 283 | p = r + beta * p 284 | 285 | 286 | def apply_update(parameterized_fun, update): 287 | """ 288 | Add update to the weights of parameterized_fun 289 | Parameters 290 | ---------- 291 | parameterized_fun : torch.nn.Sequential 292 | the function approximator to be updated 293 | update : torch.FloatTensor 294 | a flattened version of the update to be applied 295 | """ 296 | 297 | n = 0 298 | for param in parameterized_fun.parameters(): 299 | numel = param.numel() 300 | param_update = update[n : n + numel].view(param.size()) 301 | param.data += param_update 302 | n += numel 303 | 304 | 305 | def line_search(search_dir, max_step_len, constraints_satisfied, line_search_coef=0.9, max_iter=10): 306 | """ 307 | Perform a backtracking line search that terminates when constraints_satisfied 308 | return True and return the calculated step length. 
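# Illustrative sketch (tiny 2x2 system, hypothetical helper avp, not from this file):
# what cg_solver() above computes, an approximate solution of A x = b using only
# matrix-vector products, compared against a direct solve.
import torch

A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
b = torch.tensor([1.0, 2.0])

def avp(p):   # stands in for the Fisher/Hessian-vector product
    return A @ p

x = torch.zeros_like(b)
r = b.clone()
p = b.clone()
for _ in range(10):
    Ap = avp(p)
    alpha = (r @ r) / (p @ Ap)
    x = x + alpha * p
    r_new = r - alpha * Ap
    beta = (r_new @ r_new) / (r @ r)
    r, p = r_new, r_new + beta * p
# x now agrees closely with torch.linalg.solve(A, b).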
Return 0.0 if no step 309 | length can be found for which constraints_satisfied returns True 310 | Parameters 311 | ---------- 312 | search_dir : torch.FloatTensor 313 | the search direction along which the line search is done 314 | max_step_len : torch.FloatTensor 315 | the maximum step length to consider in the line search 316 | constraints_satisfied : callable 317 | a function that returns a boolean indicating whether the constraints 318 | are met by the current step length 319 | line_search_coef : float 320 | the proportion by which to reduce the step length after each iteration 321 | max_iter : int 322 | the maximum number of backtracks to do before return 0.0 323 | Returns 324 | ------- 325 | the maximum step length coefficient for which constraints_satisfied evaluates 326 | to True 327 | """ 328 | 329 | step_len = max_step_len / line_search_coef 330 | 331 | for i in range(max_iter): 332 | step_len *= line_search_coef 333 | 334 | if constraints_satisfied(step_len * search_dir, step_len): 335 | return step_len 336 | 337 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 338 | return torch.tensor(0.0).to(device) 339 | 340 | 341 | @dataclass 342 | class Trajectory: 343 | state: list = field(default_factory=list) 344 | action: list = field(default_factory=list) 345 | next_state: list = field(default_factory=list) 346 | reward: list = field(default_factory=list) 347 | done: list = field(default_factory=list) 348 | 349 | def push(self, state, action, reward, done, next_state): 350 | self.state.append(state) 351 | self.action.append(action) 352 | self.reward.append(reward) 353 | self.done.append(done) 354 | self.next_state.append(next_state) 355 | 356 | 357 | def set_seed(args): 358 | random.seed(args.seed) 359 | np.random.seed(args.seed) 360 | torch.manual_seed(args.seed) 361 | if not args.no_cuda: 362 | torch.cuda.manual_seed(args.seed) 363 | 364 | 365 | def train(args, env, agent): 366 | trajectory = Trajectory() 367 | 368 | max_episode_reward = -float("inf") 369 | episode_reward = 0 370 | episode_length = 0 371 | log_ep_rewards = [] 372 | log_ep_length = [] 373 | 374 | agent.policy_net.train() 375 | agent.policy_net.zero_grad() 376 | agent.value_net.train() 377 | agent.value_net.zero_grad() 378 | state = env.reset() 379 | for i in range(args.max_steps): 380 | action = agent.get_action(torch.from_numpy(state)).item() 381 | next_state, reward, done, info = env.step(action) 382 | episode_reward += reward 383 | episode_length += 1 384 | 385 | trajectory.push(state, action, reward, done, next_state) 386 | state = next_state 387 | 388 | if done is True: 389 | print(f"{i=}, reward={episode_reward:.0f}, length={episode_length}, max_reward={max_episode_reward}") 390 | log_ep_rewards.append(episode_reward) 391 | log_ep_length.append(episode_length) 392 | 393 | if episode_length < 150 and episode_reward > max_episode_reward: 394 | save_path = os.path.join(args.output_dir, "model.bin") 395 | torch.save(agent.policy_net.state_dict(), save_path) 396 | max_episode_reward = episode_reward 397 | 398 | episode_reward = 0 399 | episode_length = 0 400 | state = env.reset() 401 | 402 | # Update policy and value nets. 
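# Illustrative sketch (hypothetical backtracking helper, not from this file): the
# geometric backtracking that line_search() above performs: start from the maximum
# step length and shrink by line_search_coef until the acceptance test passes.
def backtracking(max_step, accept, coef=0.9, max_iter=10):
    step = max_step / coef
    for _ in range(max_iter):
        step *= coef
        if accept(step):
            return step
    return 0.0

# Toy acceptance rule: a step is acceptable once it is below 0.5.
backtracking(1.0, lambda s: s < 0.5)   # returns 0.9 ** 7, about 0.478, the first trial below 0.5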
403 | s_batch = torch.tensor(trajectory.state, dtype=torch.float32) 404 | a_batch = torch.tensor(trajectory.action, dtype=torch.int64) 405 | r_batch = torch.tensor(trajectory.reward, dtype=torch.float32) 406 | d_batch = torch.tensor(trajectory.done, dtype=torch.float32) 407 | ns_batch = torch.tensor(trajectory.next_state, dtype=torch.float32) 408 | 409 | agent.update_policy_net(s_batch, a_batch, r_batch, d_batch, ns_batch) 410 | agent.update_value_net(args, s_batch, r_batch, d_batch) 411 | 412 | trajectory = Trajectory() 413 | 414 | # 3. 画图。 415 | plt.plot(np.cumsum(log_ep_length), log_ep_rewards, label="length") 416 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 417 | plt.close() 418 | 419 | 420 | def eval(args, env, agent): 421 | model_path = os.path.join(args.output_dir, "model.bin") 422 | agent.model.load_state_dict(torch.load(model_path)) 423 | 424 | episode_length = 0 425 | episode_reward = 0 426 | state = env.reset() 427 | for i in range(5000): 428 | episode_length += 1 429 | action = agent.get_action(torch.from_numpy(state)).item() 430 | next_state, reward, done, info = env.step(action) 431 | env.render() 432 | episode_reward += reward 433 | 434 | state = next_state 435 | if done is True: 436 | print(f"{episode_reward=}, {episode_length=}") 437 | state = env.reset() 438 | episode_length = 0 439 | episode_reward = 0 440 | 441 | 442 | def main(): 443 | parser = argparse.ArgumentParser() 444 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 445 | parser.add_argument("--dim_obs", default=4, type=int, help="Dimension of observation.") 446 | parser.add_argument("--num_act", default=2, type=int, help="Number of actions.") 447 | parser.add_argument("--discount", default=0.95, type=float, help="Discount coefficient.") 448 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 449 | parser.add_argument("--lr_value_net", default=1e-3, type=float, help="Learning rate of value net.") 450 | parser.add_argument("--num_update_value", default=10, type=int, help="Number of updating value net per episode.") 451 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 452 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 453 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 454 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 455 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 456 | args = parser.parse_args() 457 | 458 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 459 | 460 | env = gym.make(args.env) 461 | env.seed(args.seed) 462 | set_seed(args) 463 | agent = TRPO(args) 464 | agent.policy_net.to(args.device) 465 | agent.value_net.to(args.device) 466 | 467 | if args.do_train: 468 | train(args, env, agent) 469 | 470 | if args.do_eval: 471 | eval(args, env, agent) 472 | 473 | 474 | if __name__ == "__main__": 475 | main() 476 | -------------------------------------------------------------------------------- /10_ddpg.py: -------------------------------------------------------------------------------- 1 | """10.2节DDPG算法实现。""" 2 | import argparse 3 | import gym 4 | from torch import nn 5 | import torch 6 | import numpy as np 7 | import random 8 | import torch.nn.functional as F 9 | from collections import defaultdict, deque 10 | from torch.optim import Adam 11 | import 
matplotlib.pyplot as plt 12 | import os 13 | 14 | 15 | def fanin_init(size, fanin=None): 16 | """weight initializer known from https://arxiv.org/abs/1502.01852""" 17 | fanin = fanin or size[0] 18 | v = 1.0 / np.sqrt(fanin) 19 | return torch.Tensor(size).uniform_(-v, v) 20 | 21 | 22 | class Actor(nn.Module): 23 | def __init__(self, state_dim, action_dim, h1=400, h2=300, eps=0.03): 24 | """ 25 | :param action_lim: Used to limit action space in [-action_lim,action_lim] 26 | :return: 27 | """ 28 | super(Actor, self).__init__() 29 | 30 | self.state_dim = state_dim 31 | self.action_dim = action_dim 32 | 33 | self.fc1 = nn.Linear(state_dim, h1) 34 | self.fc2 = nn.Linear(h1, h2) 35 | self.fc3 = nn.Linear(h2, action_dim) 36 | 37 | def forward(self, state): 38 | """ 39 | return actor policy function Pi(s) 40 | :param state: state [n, state_dim] 41 | :return: action [n, action_dim] 42 | """ 43 | x = F.relu(self.fc1(state)) 44 | x = F.relu(self.fc2(x)) 45 | action = F.tanh(self.fc3(x)) 46 | return action 47 | 48 | 49 | class Critic(nn.Module): 50 | def __init__(self, state_dim, action_dim, h1=200, h2=300, eps=0.03): 51 | super(Critic, self).__init__() 52 | self.state_dim = state_dim 53 | self.action_dim = action_dim 54 | 55 | self.fc1 = nn.Linear(state_dim, h1) 56 | self.fc2 = nn.Linear(action_dim, h1) 57 | self.fc3 = nn.Linear(h1 + h1, h2) 58 | self.fc4 = nn.Linear(h2, 1) 59 | 60 | def forward(self, state, action): 61 | """return critic Q(s,a) 62 | :param state: state [n, state_dim] (n is batch_size) 63 | :param action: action [n, action_dim] 64 | :return: Q(s,a) [n, 1] 65 | """ 66 | x = F.relu(self.fc1(state)) 67 | y = F.relu(self.fc2(action)) 68 | h = torch.cat((x, y), dim=1) 69 | h = F.relu(self.fc3(h)) 70 | h = self.fc4(h) 71 | return h 72 | 73 | 74 | class ReplayBuffer(object): 75 | def __init__(self, buffer_size, random_seed=123): 76 | self.buffer_size = buffer_size 77 | self.count = 0 78 | self.buffer = [] 79 | random.seed(random_seed) 80 | 81 | def add(self, s, a, r, t, s2): 82 | experience = (s, a, r, t, s2) 83 | if self.count < self.buffer_size: 84 | self.buffer.append(experience) 85 | self.count += 1 86 | else: 87 | self.buffer.pop(0) 88 | self.buffer.append(experience) 89 | 90 | def size(self): 91 | return self.count 92 | 93 | def sample_batch(self, batch_size): 94 | if self.count < batch_size: 95 | batch = random.sample(self.buffer, self.count) 96 | else: 97 | batch = random.sample(self.buffer, batch_size) 98 | 99 | s_batch = np.array([_[0] for _ in batch]) 100 | a_batch = np.array([_[1] for _ in batch]) 101 | r_batch = np.array([_[2] for _ in batch]) 102 | t_batch = np.array([_[3] for _ in batch]) 103 | s2_batch = np.array([_[4] for _ in batch]) 104 | 105 | s_batch = torch.tensor(s_batch, dtype=torch.float32) 106 | a_batch = torch.tensor(a_batch, dtype=torch.float32) 107 | r_batch = torch.tensor(r_batch, dtype=torch.float32) 108 | t_batch = torch.tensor(t_batch, dtype=torch.float32) 109 | s2_batch = torch.tensor(s2_batch, dtype=torch.float32) 110 | return s_batch, a_batch, r_batch, t_batch, s2_batch 111 | 112 | 113 | class OUNoise: 114 | """docstring for OUNoise""" 115 | 116 | def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.2): 117 | self.action_dimension = action_dimension 118 | self.mu = mu 119 | self.theta = theta 120 | self.sigma = sigma 121 | self.state = np.ones(self.action_dimension) * self.mu 122 | self.reset() 123 | 124 | def reset(self): 125 | self.state = np.ones(self.action_dimension) * self.mu 126 | 127 | def noise(self): 128 | x = self.state 129 | dx = 
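# Illustrative sketch (hypothetical DequeReplayBuffer, not from this file): ReplayBuffer
# above evicts the oldest experience with list.pop(0), which is O(n). A minimal
# alternative uses collections.deque(maxlen=...), which drops the oldest item in O(1);
# converting the sampled batch to tensors is left to the caller.
import random
from collections import deque

class DequeReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, s, a, r, done, s2):
        self.buffer.append((s, a, r, done, s2))

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s, a, r, done, s2 = map(list, zip(*batch))
        return s, a, r, done, s2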
self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x)) 130 | self.state = x + dx 131 | return self.state 132 | 133 | 134 | class DDPG: 135 | def __init__( 136 | self, 137 | state_dim: int, 138 | action_dim: int, 139 | gamma: float, 140 | ): 141 | self.policy = Actor(state_dim, action_dim) 142 | self.value = Critic(state_dim, action_dim) 143 | self.target_policy = Actor(state_dim, action_dim) 144 | self.target_value = Critic(state_dim, action_dim) 145 | self.target_policy.load_state_dict(self.policy.state_dict()) 146 | self.target_value.load_state_dict(self.value.state_dict()) 147 | self.gamma = gamma 148 | 149 | def get_action(self, state): 150 | action = self.policy(state) 151 | return action * 2 152 | 153 | def compute_policy_loss(self, bs, ba, br, bd, bns): 154 | predicted_action = self.get_action(bs) 155 | loss = -self.value(bs, predicted_action).mean() 156 | return loss 157 | 158 | def compute_value_loss(self, bs, ba, br, bd, bns): 159 | with torch.no_grad(): 160 | predicted_bna = self.target_policy(bns) * 2 161 | target_value = self.gamma * self.target_value(bns, predicted_bna).squeeze() * (1 - bd) + br 162 | 163 | value = self.value(bs, ba).squeeze() 164 | loss = F.mse_loss(value, target_value) 165 | return loss 166 | 167 | def soft_update(self, tau=0.01): 168 | """ 169 | update target by target = tau * source + (1 - tau) * target. 170 | """ 171 | for target_param, param in zip(self.target_value.parameters(), self.value.parameters()): 172 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 173 | 174 | for target_param, param in zip(self.target_policy.parameters(), self.policy.parameters()): 175 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 176 | 177 | 178 | def train(args, env, agent: DDPG): 179 | policy_optimizer = Adam(agent.policy.parameters(), lr=args.lr_policy) 180 | value_optimizer = Adam(agent.value.parameters(), lr=args.lr_value) 181 | 182 | replay_buffer = ReplayBuffer(buffer_size=args.buffer_size) 183 | 184 | log = defaultdict(list) 185 | 186 | episode_reward = 0 187 | episode_length = 0 188 | max_episode_reward = -float("inf") 189 | value_loss_list = [0] 190 | policy_loss_list = [0] 191 | 192 | state, _ = env.reset() 193 | for i in range(args.max_steps): 194 | 195 | action = agent.get_action(torch.tensor(state)) 196 | action = action.detach().numpy() 197 | action = (action + np.random.normal(0, 0.1, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high) 198 | 199 | next_state, reward, terminated, truncated, _ = env.step(action) 200 | done = terminated or truncated 201 | 202 | replay_buffer.add(state, action, reward, done, next_state) 203 | 204 | state = next_state 205 | 206 | episode_reward += reward 207 | episode_length += 1 208 | 209 | if done: 210 | if episode_reward > max_episode_reward: 211 | max_episode_reward = episode_reward 212 | torch.save({"policy": agent.policy.state_dict(), "value": agent.value.state_dict()}, os.path.join(args.output_dir, "model.bin")) 213 | 214 | print(f"i={i}, episode reward={episode_reward:.2f}, max episode reward={max_episode_reward:.2f}, value loss={np.mean(value_loss_list):.2f}, policy loss={np.mean(policy_loss_list):.2f}") 215 | 216 | log["episode_reward"].append(episode_reward) 217 | 218 | state, _ = env.reset() 219 | episode_reward = 0 220 | episode_length = 0 221 | value_loss_list = [0] 222 | policy_loss_list = [0] 223 | 224 | if i > args.warm_steps: 225 | 226 | for _ in range(20): 227 | bs, ba, br, bd, bns = 
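# Illustrative sketch (standalone simulation, not from this file): the
# Ornstein-Uhlenbeck recursion implemented by OUNoise above,
# x_{k+1} = x_k + theta * (mu - x_k) + sigma * N(0, 1),
# which produces temporally correlated, mean-reverting noise. Note that the training
# loop above explores with independent Gaussian noise (np.random.normal) rather than
# with OUNoise.
import numpy as np

theta, sigma, mu = 0.15, 0.2, 0.0
x = np.zeros(1)
samples = []
for _ in range(5):
    x = x + theta * (mu - x) + sigma * np.random.randn(1)
    samples.append(float(x[0]))
# Consecutive samples drift smoothly instead of jumping independently.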
replay_buffer.sample_batch(args.batch_size) 228 | 229 | value_loss = agent.compute_value_loss(bs, ba, br, bd, bns) 230 | value_optimizer.zero_grad() 231 | value_loss.backward() 232 | value_optimizer.step() 233 | 234 | value_loss_list.append(value_loss.item()) 235 | 236 | policy_loss = agent.compute_policy_loss(bs, ba, br, bd, bns) 237 | policy_optimizer.zero_grad() 238 | policy_loss.backward() 239 | policy_optimizer.step() 240 | 241 | policy_loss_list.append(policy_loss.item()) 242 | 243 | agent.soft_update() 244 | 245 | # 画图。 246 | moving_avg = [] 247 | d = deque(maxlen=10) 248 | for x in log["episode_reward"]: 249 | d.append(x) 250 | moving_avg.append(np.mean(d)) 251 | 252 | plt.plot(moving_avg) 253 | plt.xlabel("episode") 254 | plt.ylabel("episode reward") 255 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 256 | 257 | 258 | def eval(args, agent: DDPG): 259 | state_dict = torch.load(os.path.join(args.output_dir, "model.bin")) 260 | agent.policy.load_state_dict(state_dict["policy"]) 261 | agent.value.load_state_dict(state_dict["value"]) 262 | 263 | episode_reward = 0 264 | 265 | env = gym.make(args.env, render_mode="human") 266 | state, _ = env.reset() 267 | for i in range(1000): 268 | 269 | action = agent.get_action(torch.tensor(state)) 270 | action = action.detach().numpy() 271 | state, reward, terminated, truncated, _ = env.step(action) 272 | done = terminated or truncated 273 | 274 | episode_reward += reward 275 | 276 | if done: 277 | print(f"episode reward={episode_reward}") 278 | state, _ = env.reset() 279 | episode_reward = 0 280 | 281 | 282 | if __name__ == "__main__": 283 | parser = argparse.ArgumentParser() 284 | parser.add_argument("--env", default="Pendulum-v1", type=str, help="gym environment") 285 | parser.add_argument("--max_steps", default=100_000, type=int) 286 | parser.add_argument("--warm_steps", default=1_000, type=int) 287 | 288 | parser.add_argument("--gamma", default=0.95, type=float, help="discount") 289 | parser.add_argument("--batch_size", default=128, type=int) 290 | parser.add_argument("--lr_policy", default=1e-4, type=float) 291 | parser.add_argument("--lr_value", default=1e-3, type=float) 292 | 293 | parser.add_argument("--tau", default=0.001, type=float, help="target network update coefficient") 294 | parser.add_argument("--buffer_size", default=100_000, type=int, help="replay buff size") 295 | parser.add_argument("--output_dir", default="output", type=str, help="result output dir") 296 | parser.add_argument("--model_path", type=str, help="if test mode, import the model") 297 | 298 | parser.add_argument("--do_train", action="store_true") 299 | parser.add_argument("--do_eval", action="store_true") 300 | 301 | args = parser.parse_args() 302 | os.makedirs(args.output_dir, exist_ok=True) 303 | 304 | env = gym.make(args.env) 305 | agent = DDPG(env.observation_space.shape[0], env.action_space.shape[0], args.gamma) 306 | 307 | if args.do_train: 308 | train(args, env, agent) 309 | 310 | if args.do_eval: 311 | eval(args, agent) 312 | -------------------------------------------------------------------------------- /10_td3.py: -------------------------------------------------------------------------------- 1 | """10.4节TD3算法实现。 2 | """ 3 | import argparse 4 | from collections import defaultdict 5 | import os 6 | import random 7 | from dataclasses import dataclass, field 8 | import gym 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import 
itertools 15 | 16 | import matplotlib.pyplot as plt 17 | 18 | 19 | class QNet(nn.Module): 20 | """QNet. 21 | Input: feature 22 | Output: num_act of values 23 | """ 24 | 25 | def __init__(self, dim_state, dim_action): 26 | super().__init__() 27 | self.fc1 = nn.Linear(dim_state + dim_action, 64) 28 | self.fc2 = nn.Linear(64, 32) 29 | self.fc3 = nn.Linear(32, 1) 30 | 31 | def forward(self, state, action): 32 | sa = torch.cat([state, action], -1) 33 | x = F.relu(self.fc1(sa)) 34 | x = F.relu(self.fc2(x)) 35 | x = self.fc3(x) 36 | return x 37 | 38 | 39 | class PolicyNet(nn.Module): 40 | def __init__(self, dim_state, dim_action, max_action=2.0): 41 | super().__init__() 42 | self.max_action = max_action 43 | self.fc1 = nn.Linear(dim_state, 64) 44 | self.fc2 = nn.Linear(64, 32) 45 | self.fc3 = nn.Linear(32, dim_action) 46 | 47 | def forward(self, state): 48 | x = F.relu(self.fc1(state)) 49 | x = F.relu(self.fc2(x)) 50 | x = self.max_action * torch.tanh(self.fc3(x)) 51 | return x 52 | 53 | 54 | class TD3: 55 | def __init__(self, dim_state, dim_action, max_action): 56 | super().__init__() 57 | 58 | self.max_action = max_action 59 | self.Q1 = QNet(dim_state, dim_action) 60 | self.Q2 = QNet(dim_state, dim_action) 61 | self.Mu = PolicyNet(dim_state, dim_action, max_action) 62 | self.target_Q1 = QNet(dim_state, dim_action) 63 | self.target_Q2 = QNet(dim_state, dim_action) 64 | self.target_Mu = PolicyNet(dim_state, dim_action, max_action) 65 | self.target_Q1.load_state_dict(self.Q1.state_dict()) 66 | self.target_Q2.load_state_dict(self.Q2.state_dict()) 67 | self.target_Mu.load_state_dict(self.Mu.state_dict()) 68 | 69 | def get_action(self, state): 70 | action = self.Mu(state) 71 | return action 72 | 73 | def compute_value_loss(self, args, s_batch, a_batch, r_batch, d_batch, next_s_batch): 74 | with torch.no_grad(): 75 | # 让目标策略网络做预测。 76 | a = self.target_Mu(next_s_batch) 77 | noise = torch.clamp( 78 | torch.randn_like(a) * args.policy_noise, 79 | -args.noise_clip, 80 | args.noise_clip, 81 | ) 82 | a = torch.clamp(a + noise, min=-self.max_action, max=self.max_action) 83 | 84 | # 让两个目标价值网络做预测。 85 | q1 = self.target_Q1(next_s_batch, a).squeeze() 86 | q2 = self.target_Q2(next_s_batch, a).squeeze() 87 | 88 | # 计算 TD 目标。 89 | y = r_batch + args.gamma * torch.min(q1, q2) * (1 - d_batch) 90 | 91 | # 让两个价值网络做预测。 92 | qvals1 = self.Q1(s_batch, a_batch).squeeze() 93 | qvals2 = self.Q2(s_batch, a_batch).squeeze() 94 | value_loss1 = F.mse_loss(y, qvals1) 95 | value_loss2 = F.mse_loss(y, qvals2) 96 | return value_loss1, value_loss2 97 | 98 | def compute_policy_loss(self, s_batch): 99 | a = self.Mu(s_batch) 100 | policy_loss = -self.Q1(s_batch, a).mean() 101 | return policy_loss 102 | 103 | def soft_update(self, tau=0.01): 104 | def soft_update_(target, source, tau_=0.01): 105 | for target_param, param in zip(target.parameters(), source.parameters()): 106 | target_param.data.copy_(target_param.data * (1.0 - tau_) + param.data * tau_) 107 | 108 | soft_update_(self.target_Q1, self.Q1, tau) 109 | soft_update_(self.target_Q2, self.Q2, tau) 110 | soft_update_(self.target_Mu, self.Mu, tau) 111 | 112 | 113 | @dataclass 114 | class ReplayBuffer: 115 | maxsize: int 116 | size: int = 0 117 | state: list = field(default_factory=list) 118 | action: list = field(default_factory=list) 119 | reward: list = field(default_factory=list) 120 | done: list = field(default_factory=list) 121 | next_state: list = field(default_factory=list) 122 | 123 | def push(self, state, action, reward, done, next_state): 124 | if self.size < self.maxsize: 
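# Illustrative sketch (dummy tensors, not from this file): the TD3 target built in
# compute_value_loss() above. Clipped Gaussian noise is added to the target policy's
# action (target policy smoothing), and the bootstrap uses the smaller of the two
# target Q-values (clipped double Q-learning).
import torch

gamma, policy_noise, noise_clip, max_action = 0.99, 0.2, 0.5, 2.0
r = torch.tensor([1.0, 0.5])
d = torch.tensor([0.0, 1.0])               # 1.0 where the episode ended
a_target = torch.tensor([[1.5], [-1.9]])   # target_Mu(next_state)
noise = torch.clamp(torch.randn_like(a_target) * policy_noise, -noise_clip, noise_clip)
a_smoothed = torch.clamp(a_target + noise, min=-max_action, max=max_action)

q1 = torch.tensor([10.0, 8.0])             # target_Q1(next_state, a_smoothed)
q2 = torch.tensor([9.5, 8.5])              # target_Q2(next_state, a_smoothed)
y = r + gamma * torch.min(q1, q2) * (1 - d)   # shared TD target for both critics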
125 | self.state.append(state) 126 | self.action.append(action) 127 | self.reward.append(reward) 128 | self.done.append(done) 129 | self.next_state.append(next_state) 130 | else: 131 | position = self.size % self.maxsize 132 | self.state[position] = state 133 | self.action[position] = action 134 | self.reward[position] = reward 135 | self.done[position] = done 136 | self.next_state[position] = next_state 137 | self.size += 1 138 | 139 | def sample(self, n): 140 | total_number = self.size if self.size < self.maxsize else self.maxsize 141 | indices = np.random.randint(total_number, size=n) 142 | state = [self.state[i] for i in indices] 143 | action = [self.action[i] for i in indices] 144 | reward = [self.reward[i] for i in indices] 145 | done = [self.done[i] for i in indices] 146 | next_state = [self.next_state[i] for i in indices] 147 | return state, action, reward, done, next_state 148 | 149 | 150 | class INFO: 151 | def __init__(self): 152 | self.log = defaultdict(list) 153 | self.episode_length = 0 154 | self.episode_reward = 0 155 | self.max_episode_reward = -float("inf") 156 | 157 | def put(self, done, reward): 158 | if done is True: 159 | self.episode_length += 1 160 | self.episode_reward += reward 161 | self.log["episode_length"].append(self.episode_length) 162 | self.log["episode_reward"].append(self.episode_reward) 163 | 164 | if self.episode_reward > self.max_episode_reward: 165 | self.max_episode_reward = self.episode_reward 166 | 167 | self.episode_length = 0 168 | self.episode_reward = 0 169 | else: 170 | self.episode_length += 1 171 | self.episode_reward += reward 172 | 173 | 174 | def train(args, env, agent: TD3): 175 | Q1_optimizer = torch.optim.Adam(agent.Q1.parameters(), lr=args.lr) 176 | Q2_optimizer = torch.optim.Adam(agent.Q2.parameters(), lr=args.lr) 177 | Mu_optimizer = torch.optim.Adam(agent.Mu.parameters(), lr=args.lr) 178 | replay_buffer = ReplayBuffer(maxsize=100_000) 179 | info = INFO() 180 | 181 | state, _ = env.reset(seed=args.seed) 182 | for step in range(args.max_steps): 183 | if step < args.warmup_steps: 184 | action = env.action_space.sample() 185 | else: 186 | action = agent.get_action(torch.from_numpy(state)) 187 | action = action.cpu().data.numpy() 188 | action_noise = np.clip(np.random.randn(args.dim_action), -args.max_action, args.max_action) 189 | action = np.clip(action + action_noise, -args.max_action, args.max_action) 190 | 191 | next_state, reward, terminated, truncated, _ = env.step(action) 192 | done = terminated or truncated 193 | replay_buffer.push(state, action, reward, done, next_state) 194 | state = next_state 195 | info.put(done, reward) 196 | 197 | if done is True: 198 | # 打印信息。 199 | episode_reward = info.log["episode_reward"][-1] 200 | episode_length = info.log["episode_length"][-1] 201 | value_loss = info.log["value_loss1"][-1] if len(info.log["value_loss1"]) > 0 else 0 202 | print(f"step={step}, reward={episode_reward:.0f}, length={episode_length}, max_reward={info.max_episode_reward}, value_loss={value_loss:.2f}") 203 | 204 | # 如果得分更高,保存模型。 205 | if episode_reward == info.max_episode_reward: 206 | save_path = os.path.join(args.output_dir, "model.bin") 207 | torch.save(agent.Mu.state_dict(), save_path) 208 | 209 | state, _ = env.reset() 210 | 211 | if step > args.warmup_steps: 212 | s_batch, a_batch, r_batch, d_batch, ns_batch = replay_buffer.sample(n=args.batch_size) 213 | 214 | s_batch = np.array(s_batch) 215 | a_batch = np.array(a_batch) 216 | r_batch = np.array(r_batch) 217 | d_batch = np.array(d_batch) 218 | ns_batch = 
np.array(ns_batch) 219 | 220 | s_batch = torch.tensor(s_batch, dtype=torch.float32) 221 | a_batch = torch.tensor(a_batch, dtype=torch.float32) 222 | r_batch = torch.tensor(r_batch, dtype=torch.float32) 223 | d_batch = torch.tensor(d_batch, dtype=torch.float32) 224 | ns_batch = torch.tensor(ns_batch, dtype=torch.float32) 225 | 226 | value_loss1, value_loss2 = agent.compute_value_loss(args, s_batch, a_batch, r_batch, d_batch, ns_batch) 227 | 228 | Q1_optimizer.zero_grad() 229 | value_loss1.backward(retain_graph=True) 230 | Q1_optimizer.step() 231 | 232 | Q2_optimizer.zero_grad() 233 | value_loss2.backward() 234 | Q2_optimizer.step() 235 | 236 | info.log["value_loss1"].append(value_loss1.item()) 237 | info.log["value_loss2"].append(value_loss2.item()) 238 | 239 | if step % args.K == 0: 240 | policy_loss = agent.compute_policy_loss(s_batch) 241 | Mu_optimizer.zero_grad() 242 | policy_loss.backward() 243 | Mu_optimizer.step() 244 | agent.soft_update() 245 | 246 | info.log["policy_loss"].append(policy_loss.item()) 247 | 248 | if step % 10000 == 0: 249 | # 画图。 250 | plt.plot(info.log["value_loss1"], label="loss1") 251 | plt.plot(info.log["value_loss2"], label="loss2") 252 | plt.legend() 253 | plt.savefig(f"{args.output_dir}/value_loss.png", bbox_inches="tight") 254 | plt.close() 255 | 256 | plt.plot(info.log["episode_reward"]) 257 | plt.savefig(f"{args.output_dir}/episode_reward.png", bbox_inches="tight") 258 | plt.close() 259 | 260 | 261 | def eval(args, env, agent): 262 | agent = TD3(args.dim_state, args.dim_action, args.max_action) 263 | model_path = os.path.join(args.output_dir, "model.bin") 264 | agent.Mu.load_state_dict(torch.load(model_path)) 265 | 266 | episode_length = 0 267 | episode_reward = 0 268 | state, _ = env.reset() 269 | for i in range(5000): 270 | episode_length += 1 271 | action = agent.get_action(torch.from_numpy(state)).cpu().data.numpy() 272 | next_state, reward, terminated, truncated, info = env.step(action) 273 | done = terminated or truncated 274 | env.render() 275 | episode_reward += reward 276 | 277 | state = next_state 278 | if done is True: 279 | print(f"episode reward={episode_reward}, length={episode_length}") 280 | state, _ = env.reset() 281 | episode_length = 0 282 | episode_reward = 0 283 | 284 | 285 | def main(): 286 | parser = argparse.ArgumentParser() 287 | parser.add_argument("--env", default="Pendulum-v1", type=str, help="Environment name.") 288 | parser.add_argument("--dim_state", default=3, type=int, help="Dimension of observation.") 289 | parser.add_argument("--dim_action", default=1, type=int, help="Number of actions.") 290 | parser.add_argument("--max_action", default=2.0, type=float, help="Action scale, [-max, max].") 291 | parser.add_argument("--gamma", default=0.99, type=float, help="Discount coefficient.") 292 | 293 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 294 | parser.add_argument("--warmup_steps", default=10_000, type=int, help="Warmup steps without training.") 295 | parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.") 296 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 297 | parser.add_argument("--K", default=2, type=int, help="Delay K steps to update policy and target network.") 298 | parser.add_argument("--policy_noise", default=0.2, type=float, help="Policy noise.") 299 | parser.add_argument("--noise_clip", default=0.5, type=float, help="Policy noise.") 300 | 301 | parser.add_argument("--no_cuda", action="store_true", 
help="Avoid using CUDA when available") 302 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 303 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 304 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 305 | parser.add_argument("--do_eval", action="store_true", help="Evaluate policy.") 306 | args = parser.parse_args() 307 | 308 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 309 | 310 | # 初始化环境。 311 | env = gym.make(args.env) 312 | 313 | agent = TD3(dim_state=args.dim_state, dim_action=args.dim_action, max_action=args.max_action) 314 | 315 | if args.do_train: 316 | train(args, env, agent) 317 | 318 | if args.do_eval: 319 | eval(args, env, agent) 320 | 321 | 322 | if __name__ == "__main__": 323 | main() 324 | -------------------------------------------------------------------------------- /13_a3c.py: -------------------------------------------------------------------------------- 1 | """13.3节A3C算法实现。""" 2 | import argparse 3 | import os 4 | import gym 5 | import numpy as np 6 | import ray 7 | import torch 8 | import torch.nn.functional as F 9 | from torch import nn 10 | from torch.distributions import Categorical 11 | 12 | 13 | class ValueNet(nn.Module): 14 | def __init__(self, dim_state): 15 | super().__init__() 16 | self.fc1 = nn.Linear(dim_state, 64) 17 | self.fc2 = nn.Linear(64, 32) 18 | self.fc3 = nn.Linear(32, 1) 19 | 20 | def forward(self, state): 21 | x = F.relu(self.fc1(state)) 22 | x = F.relu(self.fc2(x)) 23 | x = self.fc3(x) 24 | return x 25 | 26 | 27 | class PolicyNet(nn.Module): 28 | def __init__(self, dim_state, num_action): 29 | super().__init__() 30 | self.fc1 = nn.Linear(dim_state, 64) 31 | self.fc2 = nn.Linear(64, 32) 32 | self.fc3 = nn.Linear(32, num_action) 33 | 34 | def forward(self, state): 35 | x = F.relu(self.fc1(state)) 36 | x = F.relu(self.fc2(x)) 37 | x = self.fc3(x) 38 | prob = F.softmax(x, dim=-1) 39 | return prob 40 | 41 | 42 | @ray.remote(num_cpus=2) 43 | class A3C(nn.Module): 44 | def __init__(self, args, id): 45 | super().__init__() 46 | self.args = args 47 | self.id = id 48 | 49 | self.V = ValueNet(args.dim_state) 50 | self.V_target = ValueNet(args.dim_state) 51 | self.pi = PolicyNet(args.dim_state, args.num_action) 52 | self.V_target.load_state_dict(self.V.state_dict()) 53 | self.env = gym.make(args.env) 54 | 55 | self.ep_reward = 0 56 | 57 | def get_action(self, state): 58 | probs = self.pi(state) 59 | m = Categorical(probs) 60 | action = m.sample() 61 | logp_action = m.log_prob(action) 62 | return action, logp_action 63 | 64 | def play_one_rollout(self): 65 | self.ep_reward = 0 66 | rollout = Rollout() 67 | state = self.env.reset() 68 | while True: 69 | action, logp_action = self.get_action(torch.tensor(state).float()) 70 | next_state, reward, done, _ = self.env.step(action.item()) 71 | 72 | rollout.put( 73 | state, 74 | action, 75 | logp_action, 76 | reward, 77 | done, 78 | next_state, 79 | ) 80 | state = next_state 81 | self.ep_reward += reward 82 | 83 | if done is True: 84 | break 85 | return rollout 86 | 87 | def compute_gradient(self, pi_state_dict, V_state_dict): 88 | """计算网络梯度,送回给Master节点。""" 89 | # 更新策略网络,值网络,目标值网络参数。 90 | self.zero_grad() 91 | self.pi.load_state_dict(pi_state_dict) 92 | self.V.load_state_dict(V_state_dict) 93 | self.soft_update() 94 | 95 | # 与环境进行一个完整回合的游戏。 96 | rollout = self.play_one_rollout() 97 | 98 | # 计算网络参数梯度。 99 | bs, ba, blogp_a, br, bd, bns = rollout.torch() 100 | 101 | 
value_loss = self.compute_value_loss(bs, blogp_a, br, bd, bns) 102 | policy_loss = self.compute_policy_loss(bs, blogp_a, br, bd, bns) 103 | 104 | loss = value_loss + policy_loss 105 | loss.backward() 106 | 107 | grad_lst = [] 108 | for param in self.parameters(): 109 | grad_lst.append(param.grad) 110 | 111 | return (self.id, self.ep_reward, grad_lst) 112 | 113 | def compute_value_loss(self, bs, blogp_a, br, bd, bns): 114 | # 累积奖励。 115 | r_lst = [] 116 | R = 0 117 | for i in reversed(range(len(br))): 118 | R = self.args.discount * R + br[i] 119 | r_lst.append(R) 120 | r_lst.reverse() 121 | batch_r = torch.tensor(r_lst) 122 | 123 | # 目标价值。 124 | with torch.no_grad(): 125 | target_value = batch_r + self.args.discount * torch.logical_not(bd) * self.V_target(bns).squeeze() 126 | 127 | # 计算value loss。 128 | value_loss = F.mse_loss(self.V(bs).squeeze(), target_value) 129 | return value_loss 130 | 131 | def compute_policy_loss(self, bs, blogp_a, br, bd, bns): 132 | # 累积奖励。 133 | r_lst = [] 134 | R = 0 135 | for i in reversed(range(len(br))): 136 | R = self.args.discount * R + br[i] 137 | r_lst.append(R) 138 | r_lst.reverse() 139 | batch_r = torch.tensor(r_lst) 140 | 141 | # 目标价值。 142 | with torch.no_grad(): 143 | target_value = batch_r + self.args.discount * torch.logical_not(bd) * self.V_target(bns).squeeze() 144 | 145 | # 计算policy loss。 146 | with torch.no_grad(): 147 | advantage = target_value - self.V(bs).squeeze() 148 | policy_loss = 0 149 | for i, logp_a in enumerate(blogp_a): 150 | policy_loss += -logp_a * advantage[i] 151 | policy_loss = policy_loss.mean() 152 | return policy_loss 153 | 154 | def soft_update(self, tau=0.01): 155 | def soft_update_(target, source, tau_=0.01): 156 | for target_param, param in zip(target.parameters(), source.parameters()): 157 | target_param.data.copy_(target_param.data * (1.0 - tau_) + param.data * tau_) 158 | 159 | soft_update_(self.V_target, self.V, tau) 160 | 161 | 162 | class Rollout: 163 | def __init__(self): 164 | self.state_lst = [] 165 | self.action_lst = [] 166 | self.logp_action_lst = [] 167 | self.reward_lst = [] 168 | self.done_lst = [] 169 | self.next_state_lst = [] 170 | 171 | def put(self, state, action, logp_action, reward, done, next_state): 172 | self.state_lst.append(state) 173 | self.action_lst.append(action) 174 | self.logp_action_lst.append(logp_action) 175 | self.reward_lst.append(reward) 176 | self.done_lst.append(done) 177 | self.next_state_lst.append(next_state) 178 | 179 | def torch(self): 180 | bs = torch.as_tensor(self.state_lst).float() 181 | ba = torch.as_tensor(self.action_lst).float() 182 | blogp_a = self.logp_action_lst 183 | br = torch.as_tensor(self.reward_lst).float() 184 | bd = torch.as_tensor(self.done_lst) 185 | bns = torch.as_tensor(self.next_state_lst).float() 186 | return bs, ba, blogp_a, br, bd, bns 187 | 188 | 189 | class Master(nn.Module): 190 | def __init__(self, args): 191 | super().__init__() 192 | self.V = ValueNet(args.dim_state) 193 | self.V_target = ValueNet(args.dim_state) 194 | self.pi = PolicyNet(args.dim_state, args.num_action) 195 | self.V_target.load_state_dict(self.V.state_dict()) 196 | 197 | def get_action(self, state): 198 | probs = self.pi(state) 199 | m = Categorical(probs) 200 | action = m.sample() 201 | logp_action = m.log_prob(action) 202 | return action, logp_action 203 | 204 | 205 | def train(args): 206 | master = Master(args) 207 | optimizer = torch.optim.Adam(master.parameters(), lr=1e-3) 208 | 209 | # 启动N个Workers。 210 | worker_dst = {i: A3C.remote(args, i) for i in 
range(args.num_workers)} 211 | 212 | # 每个Worker接受Master的网络权重,分别计算梯度。 213 | remaining = [worker_dst[i].compute_gradient.remote(master.pi.state_dict(), master.V.state_dict()) for i in range(args.num_workers)] 214 | 215 | max_ep_reward = {i: 0 for i in range(args.num_workers)} 216 | cnt = 0 217 | ready_id = [] 218 | for _ in range(1000): 219 | # 当有Worker完成梯度计算时,传回给Master节点。 220 | ready, remaining = ray.wait(remaining) 221 | cnt += 1 222 | 223 | id, ep_reward, grad_lst = ray.get(ready[0]) 224 | 225 | if max_ep_reward[id] < ep_reward: 226 | save_path = os.path.join(args.output_dir, "model.bin") 227 | torch.save(master.pi.state_dict(), save_path) 228 | 229 | max_ep_reward[id] = max(max_ep_reward[id], ep_reward) 230 | print("id=%d, ep_reward=%d, max ep_reward=%d" % (id, ep_reward, max_ep_reward[id])) 231 | ready_id.append(id) 232 | 233 | for master_param, grad in zip(master.parameters(), grad_lst): 234 | if master_param.grad is None: 235 | master_param.grad = grad 236 | else: 237 | master_param.grad += grad 238 | 239 | # 每次收集到两个完成的Worker,计算梯度均值,并更新Master模型权重。 240 | if cnt % args.m == 0 and cnt != 0: 241 | # print("hello") 242 | 243 | cnt = 0 244 | for param in master.parameters(): 245 | if param.grad is not None: 246 | param.grad /= 2 247 | optimizer.step() 248 | master.zero_grad() 249 | 250 | # 让完成梯度的Worker使用新的网络权重继续训练。 251 | for id in ready_id: 252 | remaining.append(worker_dst[id].compute_gradient.remote(master.pi.state_dict(), master.V.state_dict())) 253 | ready_id = [] 254 | 255 | 256 | def eval(args): 257 | env = gym.make(args.env) 258 | agent = Master(args) 259 | model_path = os.path.join(args.output_dir, "model.bin") 260 | agent.pi.load_state_dict(torch.load(model_path)) 261 | 262 | episode_length = 0 263 | episode_reward = 0 264 | state = env.reset() 265 | for i in range(5000): 266 | episode_length += 1 267 | action, _ = agent.get_action(torch.from_numpy(state)) 268 | next_state, reward, done, info = env.step(action.item()) 269 | env.render() 270 | episode_reward += reward 271 | 272 | state = next_state 273 | if done is True: 274 | print(f"{episode_reward=}, {episode_length=}") 275 | state = env.reset() 276 | episode_length = 0 277 | episode_reward = 0 278 | 279 | 280 | if __name__ == "__main__": 281 | parser = argparse.ArgumentParser() 282 | parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment name.") 283 | parser.add_argument("--dim_state", default=4, type=int, help="Dimension of state.") 284 | parser.add_argument("--num_action", default=2, type=int, help="Number of action.") 285 | parser.add_argument("--output_dir", default="output", type=str, help="Output directory.") 286 | parser.add_argument("--seed", default=42, type=int, help="Random seed.") 287 | 288 | parser.add_argument("--num_workers", default=4, type=int, help="Number of workers.") 289 | parser.add_argument("--m", default=2, type=int, help="Mean gradients when every m workers get ready.") 290 | parser.add_argument("--max_steps", default=100_000, type=int, help="Maximum steps for interaction.") 291 | parser.add_argument("--discount", default=0.99, type=float, help="Discount coefficient.") 292 | parser.add_argument("--lr", default=1e-3, type=float, help="Learning rate.") 293 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") 294 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 295 | 296 | parser.add_argument("--do_train", action="store_true", help="Train policy.") 297 | parser.add_argument("--do_eval", action="store_true", 
help="Evaluate policy.") 298 | args = parser.parse_args() 299 | 300 | if args.do_train: 301 | train(args) 302 | 303 | if args.do_eval: 304 | eval(args) 305 | -------------------------------------------------------------------------------- /14_mpe.py: -------------------------------------------------------------------------------- 1 | """14.3节MPE环境。 2 | 安装依赖环境:pip install "pettingzoo[mpe]" 3 | """ 4 | 5 | from pettingzoo.mpe import simple_spread_v2 6 | import time 7 | 8 | env = simple_spread_v2.env(N=2, local_ratio=0.5, max_cycles=25, continuous_actions=False, render_mode="human") 9 | 10 | num_agents = len(env.possible_agents) 11 | num_actions = env.action_space(env.possible_agents[0]).n 12 | observation_size = env.observation_space(env.possible_agents[0]).shape 13 | 14 | print(f"{num_agents} agents") 15 | for i in range(num_agents): 16 | num_actions = env.action_space(env.possible_agents[i]).n 17 | observation_size = env.observation_space(env.possible_agents[i]).shape 18 | print(i, env.possible_agents[i], "num_actions:", num_actions, "observation_size:", observation_size) 19 | 20 | for epoch in range(3): 21 | env.reset() 22 | for i, agent in enumerate(env.agent_iter()): 23 | observation, reward, terminated, truncated, info = env.last() 24 | done = terminated or truncated 25 | action = 0 26 | 27 | if done: 28 | break 29 | 30 | action = env.action_space(agent).sample() 31 | env.step(action) 32 | 33 | print(i, agent) 34 | print(f"action={action}, observation={observation}, reward={reward}, done={done}, info={info}") 35 | 36 | time.sleep(3) 37 | -------------------------------------------------------------------------------- /15_mac_a2c.py: -------------------------------------------------------------------------------- 1 | """15.3节MAC-A2C算法实现,采用中心化训练+中心化决策方案。""" 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import torch 6 | from torch.optim import Adam 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.distributions import Categorical 10 | from pettingzoo.mpe import simple_spread_v2 11 | import time 12 | from collections import defaultdict 13 | import os 14 | 15 | 16 | class ValueNet(nn.Module): 17 | def __init__(self, dim_state): 18 | super().__init__() 19 | self.fc1 = nn.Linear(dim_state, 64) 20 | self.fc2 = nn.Linear(64, 32) 21 | self.fc3 = nn.Linear(32, 1) 22 | 23 | def forward(self, state): 24 | x = F.relu(self.fc1(state)) 25 | x = F.relu(self.fc2(x)) 26 | x = self.fc3(x) 27 | return x 28 | 29 | 30 | class PolicyNet(nn.Module): 31 | def __init__(self, dim_state, num_action): 32 | super().__init__() 33 | self.fc1 = nn.Linear(dim_state, 64) 34 | self.fc2 = nn.Linear(64, 32) 35 | self.fc3 = nn.Linear(32, num_action) 36 | 37 | def forward(self, state): 38 | x = F.relu(self.fc1(state)) 39 | x = F.relu(self.fc2(x)) 40 | x = self.fc3(x) 41 | prob = F.softmax(x, dim=-1) 42 | return prob 43 | 44 | def policy(self, observation): 45 | x = F.relu(self.fc1(observation)) 46 | x = F.relu(self.fc2(x)) 47 | x = self.fc3(x) 48 | log_prob_action = F.log_softmax(x, dim=-1) 49 | return log_prob_action 50 | 51 | 52 | class MAC(nn.Module): 53 | def __init__( 54 | self, 55 | num_agents=1, 56 | num_states=6, 57 | num_actions=5, 58 | gamma=0.95, 59 | tau=0.01, 60 | ): 61 | super().__init__() 62 | 63 | self.num_agents = num_agents 64 | self.num_states = num_states 65 | self.num_actions = num_actions 66 | 67 | self.gamma = gamma 68 | self.tau = tau 69 | 70 | self.agent2policy = {} 71 | for i in range(num_agents): 72 | self.agent2policy[f"agent_{i}"] = 
PolicyNet(num_states, num_actions) 73 | 74 | self.value_net = ValueNet(num_states) 75 | self.target_value_net = ValueNet(num_states) 76 | self.target_value_net.load_state_dict(self.value_net.state_dict()) 77 | 78 | def policy(self, observation, agent): 79 | # 参考https://pytorch.org/docs/stable/distributions.html#score-function。 80 | log_prob_action = self.agent2policy[agent].policy(observation) 81 | m = Categorical(logits=log_prob_action) 82 | action = m.sample() 83 | log_prob_a = m.log_prob(action) 84 | return action.item(), log_prob_a 85 | 86 | def value(self, observation): 87 | value_ = self.value_net(observation) 88 | return value_ 89 | 90 | def target_value(self, observation): 91 | target_value_ = self.target_value_net(observation) 92 | return target_value_ 93 | 94 | def compute_policy_loss(self, bs, br, bd, bns, logp_action_dict): 95 | 96 | with torch.no_grad(): 97 | # td_value = self.target_value(bns).squeeze() 98 | # td_value = br + self.gamma * td_value * (1 - bd) 99 | predicted_value = self.value(bs).squeeze() 100 | # advantage = predicted_value - td_value 101 | 102 | # compute_value_loss使用br作为td目标。计算advantage时,同样使用br作为baseline。 103 | advantage = predicted_value - br 104 | 105 | policy_loss = 0 106 | for i in range(self.num_agents): 107 | policy_loss += logp_action_dict[f"agent_{i}"] * advantage 108 | policy_loss = policy_loss.mean() 109 | return policy_loss 110 | 111 | def compute_value_loss(self, bs, br, bd, bns, blopg_action_dict): 112 | # 注意到simple_spread_v2中,reward是根据当前状态到目标位置的距离而计算的奖励。因此,直接使用reward作为td目标值更合适。 113 | # with torch.no_grad(): 114 | # td_value = self.target_value(bns).squeeze() 115 | # td_value = br + self.gamma * td_value * (1 - bd) 116 | td_value = br 117 | 118 | predicted_value = self.value(bs).squeeze() 119 | value_loss = F.mse_loss(predicted_value, td_value) 120 | return value_loss 121 | 122 | def update_target_value(self): 123 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 124 | target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) 125 | 126 | 127 | class Rollout: 128 | def __init__(self): 129 | self.state_list = [] 130 | self.reward_list = [] 131 | self.done_list = [] 132 | self.next_state_list = [] 133 | self.logp_actions_dict = defaultdict(list) 134 | 135 | def put(self, state, reward, done, next_state, logp_action_dict): 136 | self.state_list.append(state) 137 | self.reward_list.append(reward) 138 | self.done_list.append(done) 139 | self.next_state_list.append(next_state) 140 | for k, v in logp_action_dict.items(): 141 | self.logp_actions_dict[k].append(v) 142 | 143 | def tensor(self): 144 | bs = torch.tensor(np.asarray(self.state_list)).float() 145 | br = torch.tensor(np.asarray(self.reward_list)).float() 146 | bd = torch.tensor(np.asarray(self.done_list)).float() 147 | bns = torch.tensor(np.asarray(self.next_state_list)).float() 148 | blogp_action_dict = {k: torch.stack(v) for k, v in self.logp_actions_dict.items()} 149 | return bs, br, bd, bns, blogp_action_dict 150 | 151 | 152 | def train(args, env, central_controller: MAC): 153 | # 训练初始化。 154 | policy_params = [] 155 | for i in range(num_agents): 156 | policy_params += list(central_controller.agent2policy[f"agent_{i}"].parameters()) 157 | policy_optimizer = Adam(policy_params, lr=args.lr_policy) 158 | value_optimizer = Adam(central_controller.value_net.parameters(), lr=args.lr_value) 159 | 160 | max_reward = 0 161 | episode_reward_lst = [] 162 | log = defaultdict(list) 163 | 164 | for episode in 
range(args.num_episode): 165 | env.reset() 166 | state = [env.observe(f"agent_{x}") for x in range(num_agents)] 167 | state = np.concatenate(state) 168 | logp_action_dict = {} 169 | episode_reward = 0 170 | rollout = Rollout() 171 | 172 | for i, agent in enumerate(env.agent_iter()): 173 | action, logp_action = central_controller.policy(torch.as_tensor(state).float(), agent) 174 | logp_action_dict[agent] = logp_action 175 | env.step(action) 176 | 177 | # 当下一个执行动作的agent变成0号agent时,表示所有agent完成了动作选择,此时重新收集所有agent的state。 178 | if env.agent_selection == "agent_0": 179 | # 收集所有agent的observation。 180 | next_state = [env.observe(f"agent_{x}") for x in range(num_agents)] 181 | next_state = np.concatenate(next_state) 182 | reward = env.rewards["agent_0"] # 所有agent的奖励是一样的。 183 | done = env.terminations["agent_0"] or env.truncations["agent_0"] 184 | 185 | rollout.put(state, reward, done, next_state, logp_action_dict) 186 | state = next_state 187 | 188 | episode_reward += reward 189 | # 如果运行到环境终点,训练模型。 190 | if done is True: 191 | episode_reward_lst.append(episode_reward) 192 | 193 | # if episode_reward >= max(episode_reward_lst): 194 | if episode % 1000 == 0: 195 | 196 | agent2policynet = {} 197 | for agent, policynet in central_controller.agent2policy.items(): 198 | agent2policynet[agent] = policynet.state_dict() 199 | torch.save(agent2policynet, os.path.join(args.output_dir, "model.pt")) 200 | 201 | if episode % 1000 == 0: 202 | x_axis = np.arange(len(episode_reward_lst)) 203 | plt.plot(x_axis, episode_reward_lst) 204 | plt.xlabel("episode") 205 | plt.ylabel("reward") 206 | plt.savefig("simple_spread.png", bbox_inches="tight") 207 | plt.close() 208 | 209 | # 检查训练素材。 210 | bs, br, bd, bns, blogp_action_dict = rollout.tensor() 211 | 212 | # 训练模型。 213 | value_loss = central_controller.compute_value_loss(bs, br, bd, bns, blogp_action_dict) 214 | value_optimizer.zero_grad() 215 | value_loss.backward() 216 | value_optimizer.step() 217 | 218 | policy_loss = central_controller.compute_policy_loss(bs, br, bd, bns, blogp_action_dict) 219 | policy_optimizer.zero_grad() 220 | policy_loss.backward() 221 | policy_optimizer.step() 222 | 223 | central_controller.update_target_value() 224 | 225 | log["value_loss"].append(value_loss.item()) 226 | log["policy_loss"].append(policy_loss.item()) 227 | if episode % 20 == 0: 228 | avg_value_loss = np.mean(log["value_loss"][-20:]) 229 | avg_policy_loss = np.mean(log["policy_loss"][-20:]) 230 | avg_reward = np.mean(episode_reward_lst[-20:]) 231 | print(f"episode={episode}, moving reward={avg_reward:.2f}, value loss={avg_value_loss:.4f}, policy loss={avg_policy_loss:.4f}") 232 | 233 | break 234 | 235 | 236 | def eval(args): 237 | env = simple_spread_v2.env(N=args.num_agents, local_ratio=0.5, max_cycles=25, continuous_actions=False, render_mode="human") 238 | central_controller = MAC(num_agents=args.num_agents, num_states=args.num_states, num_actions=args.num_actions) 239 | 240 | agent2policynet = torch.load(os.path.join(args.output_dir, "model.pt")) 241 | for agent, state_dict in agent2policynet.items(): 242 | central_controller.agent2policy[agent].load_state_dict(state_dict) 243 | 244 | central_controller.eval() 245 | 246 | episode_reward_lst = [] 247 | for episode in range(10): 248 | episode_reward = 0 249 | 250 | env.reset() 251 | for i, agent in enumerate(env.agent_iter()): 252 | state = [env.observe(f"agent_{x}") for x in range(num_agents)] 253 | state = np.concatenate(state) 254 | 255 | action, _ = central_controller.policy(torch.as_tensor(state).float(), agent) 256 | 
env.step(action) 257 | 258 | if env.agent_selection == "agent_0": 259 | next_state = [env.observe(f"agent_{x}") for x in range(num_agents)] 260 | next_state = np.concatenate(next_state) 261 | reward = env.rewards["agent_0"] 262 | done = env.terminations["agent_0"] or env.truncations["agent_0"] 263 | state = next_state 264 | 265 | episode_reward += reward 266 | 267 | time.sleep(0.1) 268 | 269 | if done is True: 270 | episode_reward_lst.append(episode_reward) 271 | avg_reward = np.mean(episode_reward_lst[-20:]) 272 | print(f"episode={episode}, episode reward={episode_reward}, moving reward={avg_reward:.2f}") 273 | break 274 | 275 | 276 | if __name__ == "__main__": 277 | parser = argparse.ArgumentParser(description="Cooperative game.") 278 | parser.add_argument("--num_agents", default=2, type=int) 279 | parser.add_argument("--num_states", default=24, type=int) 280 | parser.add_argument("--num_actions", default=5, type=int) 281 | parser.add_argument("--num_episode", default=20000, type=int) 282 | parser.add_argument("--lr_policy", default=1e-3, type=float) # 1e-3 283 | parser.add_argument("--lr_value", default=1e-3, type=float) # 1e-2 284 | parser.add_argument("--output_dir", default="output", type=str) 285 | parser.add_argument("--do_train", action="store_true") 286 | parser.add_argument("--do_eval", action="store_true") 287 | args = parser.parse_args() 288 | 289 | torch.manual_seed(0) 290 | np.random.seed(0) 291 | 292 | env = simple_spread_v2.env(N=args.num_agents, local_ratio=0.5, max_cycles=25, continuous_actions=False) 293 | central_controller = MAC(num_agents=args.num_agents, num_states=args.num_states, num_actions=args.num_actions) 294 | 295 | num_agents = len(env.possible_agents) 296 | num_actions = env.action_space(env.possible_agents[0]).n 297 | observation_size = env.observation_space(env.possible_agents[0]).shape 298 | 299 | print(f"{num_agents} agents") 300 | for i in range(num_agents): 301 | num_actions = env.action_space(env.possible_agents[i]).n 302 | observation_size = env.observation_space(env.possible_agents[i]).shape 303 | print(i, env.possible_agents[i], "num_actions:", num_actions, "observation_size:", observation_size) 304 | 305 | if args.do_train: 306 | train(args, env, central_controller) 307 | 308 | if args.do_eval: 309 | eval(args) 310 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repository contains implementations of the main algorithms from the book 《深度强化学习》 (Deep Reinforcement Learning). For ease of reading, each algorithm and its entry point live in a single file. Usage is simple: 3 | ```bash 4 | mkdir -p output 5 | python -u 04_dqn.py --do_train --output_dir output 2>&1 | tee output/log.txt 6 | ``` 7 | 8 | 9 | # Environments 10 | Depending on whether the action and state spaces are discrete or continuous, we consider two environments (see the snippet below): 11 | - Discrete control: CartPole, https://www.gymlibrary.dev/environments/classic_control/cart_pole/. 12 | - Continuous control: Pendulum, https://www.gymlibrary.dev/environments/classic_control/pendulum/.
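For reference, here is a minimal sketch (not one of the repository's files) that instantiates both environments with the gym==0.26.2 API pinned in requirements.txt and prints the spaces behind the `--dim_state`/`--dim_action`/`--num_action` defaults used in the scripts:

```python
import gym

# Discrete control: CartPole has a 4-dimensional observation and 2 discrete actions.
env = gym.make("CartPole-v1")
print(env.observation_space.shape, env.action_space.n)  # (4,) 2
env.close()

# Continuous control: Pendulum has a 3-dimensional observation and a 1-dimensional
# action bounded in [-2, 2] (hence the --max_action 2.0 default in 10_td3.py).
env = gym.make("Pendulum-v1")
print(env.observation_space.shape, env.action_space.shape, env.action_space.high)  # (3,) (1,) [2.]

# gym 0.26 API: reset() returns (observation, info) and step() returns a 5-tuple.
state, _ = env.reset(seed=42)
next_state, reward, terminated, truncated, info = env.step(env.action_space.sample())
done = terminated or truncated
env.close()
```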
13 | 14 | The code was tested with Python 3.7; install the dependencies with: 15 | ```bash 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | All of the code is intended for teaching and can be trained on a laptop CPU. 20 | 21 | # Algorithm List 22 | | Chapter | Algorithm | 23 | | ---- | ---- | 24 | | 1 Machine learning basics | MNIST | 25 | | 2 Monte Carlo | PI approximation | 26 | | 3 Basic concepts of reinforcement learning | CartPole | 27 | | 4 DQN and Q-learning | DQN | 28 | | 5 The SARSA algorithm | SARSA | 29 | | 6 Value learning and advanced techniques | Dueling DQN, Double DQN | 30 | | 7 Policy gradient methods | REINFORCE, Actor Critic | 31 | | 8 Policy gradient with baseline | REINFORCE with baseline, A2C | 32 | | 9 Advanced techniques for policy learning | TRPO | 33 | | 10 Continuous control | DDPG, TD3 | 34 | | 11 Partial observation of states | | 35 | | 12 Imitation learning | GAIL | 36 | | 13 Parallel computing | A3C | 37 | | 14 Multi-agent systems | MPE | 38 | | 15 Multi-agent RL in the cooperative setting | MAC-A2C | 39 | | 16 Multi-agent RL in the non-cooperative setting | | 40 | | 17 Attention mechanisms and multi-agent RL | | 41 | | 18 AlphaGo and Monte Carlo tree search | [AlphaZero](https://github.com/suragnair/alpha-zero-general) | 42 | | 19 Applications in the real world | [NAS](https://github.com/titu1994/neural-architecture-search) [Recommender](https://github.com/awarebayes/RecNN) | 43 | 44 | 45 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.26.2 2 | matplotlib==3.5.3 3 | numpy==1.21.6 4 | PettingZoo==1.22.2 5 | torch==1.13.0 --------------------------------------------------------------------------------
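As a quick, optional sanity check, the following sketch (assuming the pinned versions above are installed; it is not one of the repository's files) confirms that the dependencies import and that the PettingZoo environment used by 14_mpe.py and 15_mac_a2c.py is available:

```python
"""Optional sanity check for the pinned dependencies in requirements.txt (not part of the repository)."""
import gym
import matplotlib
import numpy as np
import torch
from pettingzoo.mpe import simple_spread_v2

# Expected versions per requirements.txt: gym 0.26.2, matplotlib 3.5.3, numpy 1.21.6, torch 1.13.0.
print(gym.__version__, matplotlib.__version__, np.__version__, torch.__version__)

# 14_mpe.py and 15_mac_a2c.py use the PettingZoo AEC interface of simple_spread_v2.
env = simple_spread_v2.env(N=2, max_cycles=25, continuous_actions=False)
env.reset()
print(env.possible_agents)  # expected: ['agent_0', 'agent_1']
env.close()
```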