├── .gitignore ├── Model ├── 1. DQN │ ├── RL_brain.py │ └── run_this.py ├── 2. Double-DQN │ ├── RL_brain.py │ ├── parsers.py │ └── run_this.py ├── 3. Policy-Gradients │ ├── RL_brain.py │ └── run_this.py ├── 4. Actor-Critic │ ├── RL_brain.py │ └── run_this.py ├── 5. DDPG │ ├── RL_brain.py │ ├── parsers.py │ └── run_this.py ├── 6. PPO_Discrete │ ├── RL_brain.py │ └── run_this.py ├── 7. PPO_Continuous │ ├── RL_brain.py │ └── run_this.py ├── 8. SAC_Discrete │ ├── RL_brain.py │ └── run_this.py └── 9. ippo_discrete │ ├── RL_brain.py │ └── run_this.py └── build_env └── pygame ├── 1. Hit the Plane └── run_this.py └── 2. shopping mall └── run_this.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Model/1. 
DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import collections 6 | import random 7 | 8 | # --------------------------------------- # 9 | # 经验回放池 10 | # --------------------------------------- # 11 | 12 | class ReplayBuffer(): 13 | def __init__(self, capacity): 14 | # 创建一个先进先出的队列,最大长度为capacity,保证经验池的样本量不变 15 | self.buffer = collections.deque(maxlen=capacity) 16 | # 将数据以元组形式添加进经验池 17 | def add(self, state, action, reward, next_state, done): 18 | self.buffer.append((state, action, reward, next_state, done)) 19 | # 随机采样batch_size行数据 20 | def sample(self, batch_size): 21 | transitions = random.sample(self.buffer, batch_size) # list, len=32 22 | # *transitions代表取出列表中的值,即32项 23 | state, action, reward, next_state, done = zip(*transitions) 24 | return np.array(state), action, reward, np.array(next_state), done 25 | # 目前队列长度 26 | def size(self): 27 | return len(self.buffer) 28 | 29 | # -------------------------------------- # 30 | # 构造深度学习网络,输入状态s,得到各个动作的reward 31 | # -------------------------------------- # 32 | 33 | class Net(nn.Module): 34 | # 构造只有一个隐含层的网络 35 | def __init__(self, n_states, n_hidden, n_actions): 36 | super(Net, self).__init__() 37 | # [b,n_states]-->[b,n_hidden] 38 | self.fc1 = nn.Linear(n_states, n_hidden) 39 | # [b,n_hidden]-->[b,n_actions] 40 | self.fc2 = nn.Linear(n_hidden, n_actions) 41 | # 前传 42 | def forward(self, x): # [b,n_states] 43 | x = self.fc1(x) 44 | x = self.fc2(x) 45 | return x 46 | 47 | # -------------------------------------- # 48 | # 构造深度强化学习模型 49 | # -------------------------------------- # 50 | 51 | class DQN: 52 | #(1)初始化 53 | def __init__(self, n_states, n_hidden, n_actions, 54 | learning_rate, gamma, epsilon, 55 | target_update, device): 56 | # 属性分配 57 | self.n_states = n_states # 状态的特征数 58 | self.n_hidden = n_hidden # 隐含层个数 59 | self.n_actions = n_actions # 动作数 60 | self.learning_rate = learning_rate # 训练时的学习率 61 | self.gamma = gamma # 折扣因子,对下一状态的回报的缩放 62 | self.epsilon = epsilon # 贪婪策略,有1-epsilon的概率探索 63 | self.target_update = target_update # 目标网络的参数的更新频率 64 | self.device = device # 在GPU计算 65 | # 计数器,记录迭代次数 66 | self.count = 0 67 | 68 | # 构建2个神经网络,相同的结构,不同的参数 69 | # 实例化训练网络 [b,4]-->[b,2] 输出动作对应的奖励 70 | self.q_net = Net(self.n_states, self.n_hidden, self.n_actions) 71 | # 实例化目标网络 72 | self.target_q_net = Net(self.n_states, self.n_hidden, self.n_actions) 73 | 74 | # 优化器,更新训练网络的参数 75 | self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=self.learning_rate) 76 | 77 | #(2)动作选择 78 | def take_action(self, state): 79 | # 维度扩充,给行增加一个维度,并转换为张量shape=[1,4] 80 | state = torch.Tensor(state[np.newaxis, :]) 81 | # 如果小于该值就取最大的值对应的索引 82 | if np.random.random() < self.epsilon: # 0-1 83 | # 前向传播获取该状态对应的动作的reward 84 | actions_value = self.q_net(state) 85 | # 获取reward最大值对应的动作索引 86 | action = actions_value.argmax().item() # int 87 | # 如果大于该值就随机探索 88 | else: 89 | # 随机选择一个动作 90 | action = np.random.randint(self.n_actions) 91 | return action 92 | 93 | #(3)网络训练 94 | def update(self, transition_dict): # 传入经验池中的batch个样本 95 | # 获取当前时刻的状态 array_shape=[b,4] 96 | states = torch.tensor(transition_dict['states'], dtype=torch.float) 97 | # 获取当前时刻采取的动作 tuple_shape=[b],维度扩充 [b,1] 98 | actions = torch.tensor(transition_dict['actions']).view(-1,1) 99 | # 当前状态下采取动作后得到的奖励 tuple=[b],维度扩充 [b,1] 100 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1) 101 | # 下一时刻的状态 array_shape=[b,4] 102 | next_states 
= torch.tensor(transition_dict['next_states'], dtype=torch.float) 103 | # 是否到达目标 tuple_shape=[b],维度变换[b,1] 104 | dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1) 105 | 106 | # 输入当前状态,得到采取各运动得到的奖励 [b,4]==>[b,2]==>[b,1] 107 | # 根据actions索引在训练网络的输出的第1维度上获取对应索引的q值(state_value) 108 | q_values = self.q_net(states).gather(1, actions) # [b,1] 109 | # 下一时刻的状态[b,4]-->目标网络输出下一时刻对应的动作q值[b,2]--> 110 | # 选出下个状态采取的动作中最大的q值[b]-->维度调整[b,1] 111 | max_next_q_values = self.target_q_net(next_states).max(1)[0].view(-1,1) 112 | # 目标网络输出的当前状态的q(state_value):即时奖励+折扣因子*下个时刻的最大回报 113 | q_targets = rewards + self.gamma * max_next_q_values * (1-dones) 114 | 115 | # 目标网络和训练网络之间的均方误差损失 116 | dqn_loss = torch.mean(F.mse_loss(q_values, q_targets)) 117 | # PyTorch中默认梯度会累积,这里需要显式将梯度置为0 118 | self.optimizer.zero_grad() 119 | # 反向传播参数更新 120 | dqn_loss.backward() 121 | # 对训练网络更新 122 | self.optimizer.step() 123 | 124 | # 在一段时间后更新目标网络的参数 125 | if self.count % self.target_update == 0: 126 | # 将目标网络的参数替换成训练网络的参数 127 | self.target_q_net.load_state_dict( 128 | self.q_net.state_dict()) 129 | 130 | self.count += 1 131 | -------------------------------------------------------------------------------- /Model/1. DQN/run_this.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DQN, ReplayBuffer 3 | import torch 4 | from tqdm import tqdm 5 | import matplotlib.pyplot as plt 6 | 7 | # GPU运算 8 | device = torch.device("cuda") if torch.cuda.is_available() \ 9 | else torch.device("cpu") 10 | 11 | # ------------------------------- # 12 | # 全局变量 13 | # ------------------------------- # 14 | 15 | capacity = 500 # 经验池容量 16 | lr = 2e-3 # 学习率 17 | gamma = 0.9 # 折扣因子 18 | epsilon = 0.9 # 贪心系数 19 | target_update = 200 # 目标网络的参数的更新频率 20 | batch_size = 32 21 | n_hidden = 128 # 隐含层神经元个数 22 | min_size = 200 # 经验池超过200后再训练 23 | return_list = [] # 记录每个回合的回报 24 | 25 | # 加载环境 26 | env = gym.make("CartPole-v1", render_mode="human") 27 | n_states = env.observation_space.shape[0] # 4 28 | n_actions = env.action_space.n # 2 29 | 30 | # 实例化经验池 31 | replay_buffer = ReplayBuffer(capacity) 32 | # 实例化DQN 33 | agent = DQN(n_states=n_states, 34 | n_hidden=n_hidden, 35 | n_actions=n_actions, 36 | learning_rate=lr, 37 | gamma=gamma, 38 | epsilon=epsilon, 39 | target_update=target_update, 40 | device=device, 41 | ) 42 | 43 | # 训练模型 44 | for i in range(100): # 100回合 45 | # 每个回合开始前重置环境 46 | state = env.reset()[0] # len=4 47 | # 记录每个回合的回报 48 | episode_return = 0 49 | done = False 50 | 51 | # 打印训练进度,一共10回合 52 | with tqdm(total=10, desc='Iteration %d' % i) as pbar: 53 | 54 | while True: 55 | # 获取当前状态下需要采取的动作 56 | action = agent.take_action(state) 57 | # 更新环境 58 | next_state, reward, done, _, _ = env.step(action) 59 | # 添加经验池 60 | replay_buffer.add(state, action, reward, next_state, done) 61 | # 更新当前状态 62 | state = next_state 63 | # 更新回合回报 64 | episode_return += reward 65 | 66 | # 当经验池超过一定数量后,训练网络 67 | if replay_buffer.size() > min_size: 68 | # 从经验池中随机抽样作为训练集 69 | s, a, r, ns, d = replay_buffer.sample(batch_size) 70 | # 构造训练集 71 | transition_dict = { 72 | 'states': s, 73 | 'actions': a, 74 | 'next_states': ns, 75 | 'rewards': r, 76 | 'dones': d, 77 | } 78 | # 网络更新 79 | agent.update(transition_dict) 80 | # 找到目标就结束 81 | if done: break 82 | 83 | # 记录每个回合的回报 84 | return_list.append(episode_return) 85 | 86 | # 更新进度条信息 87 | pbar.set_postfix({ 88 | 'return': '%.3f' % return_list[-1] 89 | }) 90 | pbar.update(1) 91 | 92 | # 绘图 93 | episodes_list = list(range(len(return_list))) 94 | 
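# ------------------------------------------------------------------ #
# Editor's note (hedged sketch, not part of the original script):
# after training, the learned Q-network can be evaluated greedily by
# always taking argmax_a Q(s, a), without the epsilon-greedy exploration
# used in agent.take_action(). `agent`, `env` and `torch` are the objects
# already defined above; the forward pass is wrapped in torch.no_grad()
# since no gradients are needed at evaluation time.
#
#   state = env.reset()[0]
#   done = False
#   while not done:
#       with torch.no_grad():
#           q = agent.q_net(torch.Tensor(state).unsqueeze(0))
#       state, reward, done, _, _ = env.step(q.argmax().item())
# ------------------------------------------------------------------ #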
plt.plot(episodes_list, return_list) 95 | plt.xlabel('Episodes') 96 | plt.ylabel('Returns') 97 | plt.title('DQN Returns') 98 | plt.show() 99 | -------------------------------------------------------------------------------- /Model/2. Double-DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | # 基于策略的学习方法,用于数值连续的问题 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | # ----------------------------------------------------- # 8 | #(1)构建训练网络 9 | # ----------------------------------------------------- # 10 | class Net(nn.Module): 11 | def __init__(self, n_states, n_hiddens, n_actions): 12 | super(Net, self).__init__() 13 | # 只有一层隐含层的网络 14 | self.fc1 = nn.Linear(n_states, n_hiddens) 15 | self.fc2 = nn.Linear(n_hiddens, n_actions) 16 | # 前向传播 17 | def forward(self, x): 18 | x = self.fc1(x) # [b, states]==>[b, n_hiddens] 19 | x = F.relu(x) 20 | x = self.fc2(x) # [b, n_hiddens]==>[b, n_actions] 21 | # 对batch中的每一行样本计算softmax,q值越大,概率越大 22 | x = F.softmax(x, dim=1) # [b, n_actions]==>[b, n_actions] 23 | return x 24 | 25 | # ----------------------------------------------------- # 26 | #(2)强化学习模型 27 | # ----------------------------------------------------- # 28 | class PolicyGradient: 29 | def __init__(self, n_states, n_hiddens, n_actions, 30 | learning_rate, gamma): 31 | # 属性分配 32 | self.n_states = n_states # 状态数 33 | self.n_hiddens = n_hiddens 34 | self.n_actions = n_actions # 动作数 35 | self.learning_rate = learning_rate # 衰减 36 | self.gamma = gamma # 折扣因子 37 | self._build_net() # 构建网络模型 38 | 39 | # 网络构建 40 | def _build_net(self): 41 | # 网络实例化 42 | self.policy_net = Net(self.n_states, self.n_hiddens, self.n_actions) 43 | # 优化器 44 | self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 45 | 46 | # 动作选择,根据概率分布随机采样 47 | def take_action(self, state): # 传入某个人的状态 48 | # numpy[n_states]-->[1,n_states]-->tensor 49 | state = torch.Tensor(state[np.newaxis, :]) 50 | # 获取每个人的各动作对应的概率[1,n_states]-->[1,n_actions] 51 | probs = self.policy_net(state) 52 | # 创建以probs为标准类型的数据分布 53 | action_dist = torch.distributions.Categorical(probs) 54 | # 以该概率分布随机抽样 [1,n_actions]-->[1] 每个状态取一组动作 55 | action = action_dist.sample() 56 | # 将tensor数据变成一个数 int 57 | action = action.item() 58 | return action 59 | 60 | # 获取每个状态最大的state_value 61 | def max_q_value(self, state): 62 | # 维度变换[n_states]-->[1,n_states] 63 | state = torch.tensor(state, dtype=torch.float).view(1,-1) 64 | # 获取状态对应的每个动作的reward的最大值 [1,n_states]-->[1,n_actions]-->[1]-->float 65 | max_q = self.policy_net(state).max().item() 66 | return max_q 67 | 68 | # 训练模型 69 | def learn(self, transitions_dict): # 输入batch组状态[b,n_states] 70 | # 取出该回合中所有的链信息 71 | state_list = transitions_dict['states'] 72 | action_list = transitions_dict['actions'] 73 | reward_list = transitions_dict['rewards'] 74 | 75 | G = 0 # 记录该条链的return 76 | self.optimizer.zero_grad() # 优化器清0 77 | # 梯度上升最大化目标函数 78 | for i in reversed(range(len(reward_list))): 79 | # 获取每一步的reward, float 80 | reward = reward_list[i] 81 | # 获取每一步的状态 [n_states]-->[1,n_states] 82 | state = torch.tensor(state_list[i], dtype=torch.float).view(1,-1) 83 | # 获取每一步的动作 [1]-->[1,1] 84 | action = torch.tensor(action_list[i]).view(1,-1) 85 | # 当前状态下的各个动作价值函数 [1,2] 86 | q_value = self.policy_net(state) 87 | # 获取已action对应的概率 [1,1] 88 | log_prob = torch.log(q_value.gather(1, action)) 89 | # 计算当前状态的state_value = 及时奖励 + 下一时刻的state_value 90 | G = reward + self.gamma * G 91 | # 计算每一步的损失函数 92 | loss = -log_prob * G 
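# ------------------------------------------------------------------ #
# Editor's note: the line above is the REINFORCE objective,
# loss = -log pi(a_t|s_t) * G_t, where the discounted return is built
# recursively as G_t = r_t + gamma * G_{t+1} while iterating in reverse.
# Also note that backward() and step() below run once per timestep after
# a single zero_grad(); a common variant (hedged alternative, not the
# author's code) accumulates the per-step losses and makes one optimizer
# step per episode:
#
#   total_loss = total_loss + (-log_prob * G)   # inside the loop
#   # ...after the loop:
#   total_loss.backward()
#   self.optimizer.step()
# ------------------------------------------------------------------ #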
93 | # 反向传播 94 | loss.backward() 95 | # 梯度下降 96 | self.optimizer.step() 97 | -------------------------------------------------------------------------------- /Model/2. Double-DQN/parsers.py: -------------------------------------------------------------------------------- 1 | # 参数定义 2 | import argparse # 参数设置 3 | 4 | # 创建解释器 5 | parser = argparse.ArgumentParser() 6 | 7 | # 参数定义 8 | parser.add_argument('--lr', type=float, default=2e-3, help='学习率') 9 | parser.add_argument('--gamma', type=float, default=0.9, help='折扣因子') 10 | parser.add_argument('--epsilon', type=float, default=0.9, help='贪心系数') 11 | parser.add_argument('--target_update', type=int, default=200, help='更新频率') 12 | parser.add_argument('--batch_size', type=int, default=64, help='每次训练64组数据') 13 | parser.add_argument('--capacity', type=int, default=500, help='经验池容量') 14 | parser.add_argument('--min_size', type=int, default=200, help='经验池超过200后再开始训练') 15 | parser.add_argument('--n_hiddens', type=int, default=128, help='隐含层神经元个数') 16 | 17 | # 参数解析 18 | args=parser.parse_args() 19 | -------------------------------------------------------------------------------- /Model/2. Double-DQN/run_this.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from RL_brain import PolicyGradient 5 | 6 | # ------------------------------- # 7 | # 模型参数设置 8 | # ------------------------------- # 9 | 10 | n_hiddens = 16 # 隐含层个数 11 | learning_rate = 2e-3 # 学习率 12 | gamma = 0.9 # 折扣因子 13 | return_list = [] # 保存每回合的reward 14 | max_q_value = 0 # 初始的动作价值函数 15 | max_q_value_list = [] # 保存每一step的动作价值函数 16 | 17 | # ------------------------------- # 18 | #(1)加载环境 19 | # ------------------------------- # 20 | 21 | # 连续性动作 22 | env = gym.make("CartPole-v1", render_mode="human") 23 | n_states = env.observation_space.shape[0] # 状态数 4 24 | n_actions = env.action_space.n # 动作数 2 25 | 26 | # ------------------------------- # 27 | #(2)模型实例化 28 | # ------------------------------- # 29 | 30 | agent = PolicyGradient(n_states=n_states, # 4 31 | n_hiddens=n_hiddens, # 16 32 | n_actions=n_actions, # 2 33 | learning_rate=learning_rate, # 学习率 34 | gamma=gamma) # 折扣因子 35 | 36 | # ------------------------------- # 37 | #(3)训练 38 | # ------------------------------- # 39 | 40 | for i in range(100): # 训练10回合 41 | # 记录每个回合的return 42 | episode_return = 0 43 | # 存放状态 44 | transition_dict = { 45 | 'states': [], 46 | 'actions': [], 47 | 'next_states': [], 48 | 'rewards': [], 49 | 'dones': [], 50 | } 51 | # 获取初始状态 52 | state = env.reset()[0] 53 | # 结束的标记 54 | done = False 55 | 56 | # 开始迭代 57 | while not done: 58 | # 动作选择 59 | action = agent.take_action(state) # 对某一状态采取动作 60 | # 动作价值函数,曲线平滑 61 | max_q_value = agent.max_q_value(state) * 0.005 + max_q_value * 0.995 62 | # 保存每一step的动作价值函数 63 | max_q_value_list.append(max_q_value) 64 | # 环境更新 65 | next_state, reward, done, _, _ = env.step(action) 66 | # 保存每个回合的所有信息 67 | transition_dict['states'].append(state) 68 | transition_dict['actions'].append(action) 69 | transition_dict['next_states'].append(next_state) 70 | transition_dict['rewards'].append(reward) 71 | transition_dict['dones'].append(done) 72 | # 状态更新 73 | state = next_state 74 | # 记录每个回合的return 75 | episode_return += reward 76 | 77 | # 保存每个回合的return 78 | return_list.append(episode_return) 79 | # 一整个回合走完了再训练模型 80 | agent.learn(transition_dict) 81 | 82 | # 打印回合信息 83 | print(f'iter:{i}, return:{np.mean(return_list[-10:])}') 84 | 85 | # 关闭动画 86 | env.close() 87 | 88 | # 
-------------------------------------- # 89 | # 绘图 90 | # -------------------------------------- # 91 | 92 | plt.subplot(121) 93 | plt.plot(return_list) 94 | plt.title('return') 95 | plt.subplot(122) 96 | plt.plot(max_q_value_list) 97 | plt.title('max_q_value') 98 | plt.show() 99 | -------------------------------------------------------------------------------- /Model/3. Policy-Gradients/RL_brain.py: -------------------------------------------------------------------------------- 1 | # 基于策略的学习方法,用于数值连续的问题 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | # ----------------------------------------------------- # 8 | #(1)构建训练网络 9 | # ----------------------------------------------------- # 10 | class Net(nn.Module): 11 | def __init__(self, n_states, n_hiddens, n_actions): 12 | super(Net, self).__init__() 13 | # 只有一层隐含层的网络 14 | self.fc1 = nn.Linear(n_states, n_hiddens) 15 | self.fc2 = nn.Linear(n_hiddens, n_actions) 16 | # 前向传播 17 | def forward(self, x): 18 | x = self.fc1(x) # [b, states]==>[b, n_hiddens] 19 | x = F.relu(x) 20 | x = self.fc2(x) # [b, n_hiddens]==>[b, n_actions] 21 | # 对batch中的每一行样本计算softmax,q值越大,概率越大 22 | x = F.softmax(x, dim=1) # [b, n_actions]==>[b, n_actions] 23 | return x 24 | 25 | # ----------------------------------------------------- # 26 | #(2)强化学习模型 27 | # ----------------------------------------------------- # 28 | class PolicyGradient: 29 | def __init__(self, n_states, n_hiddens, n_actions, 30 | learning_rate, gamma): 31 | # 属性分配 32 | self.n_states = n_states # 状态数 33 | self.n_hiddens = n_hiddens 34 | self.n_actions = n_actions # 动作数 35 | self.learning_rate = learning_rate # 衰减 36 | self.gamma = gamma # 折扣因子 37 | self._build_net() # 构建网络模型 38 | 39 | # 网络构建 40 | def _build_net(self): 41 | # 网络实例化 42 | self.policy_net = Net(self.n_states, self.n_hiddens, self.n_actions) 43 | # 优化器 44 | self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 45 | 46 | # 动作选择,根据概率分布随机采样 47 | def take_action(self, state): # 传入某个人的状态 48 | # numpy[n_states]-->[1,n_states]-->tensor 49 | state = torch.Tensor(state[np.newaxis, :]) 50 | # 获取每个人的各动作对应的概率[1,n_states]-->[1,n_actions] 51 | probs = self.policy_net(state) 52 | # 创建以probs为标准类型的数据分布 53 | action_dist = torch.distributions.Categorical(probs) 54 | # 以该概率分布随机抽样 [1,n_actions]-->[1] 每个状态取一组动作 55 | action = action_dist.sample() 56 | # 将tensor数据变成一个数 int 57 | action = action.item() 58 | return action 59 | 60 | # 获取每个状态最大的state_value 61 | def max_q_value(self, state): 62 | # 维度变换[n_states]-->[1,n_states] 63 | state = torch.tensor(state, dtype=torch.float).view(1,-1) 64 | # 获取状态对应的每个动作的reward的最大值 [1,n_states]-->[1,n_actions]-->[1]-->float 65 | max_q = self.policy_net(state).max().item() 66 | return max_q 67 | 68 | # 训练模型 69 | def learn(self, transitions_dict): # 输入batch组状态[b,n_states] 70 | # 取出该回合中所有的链信息 71 | state_list = transitions_dict['states'] 72 | action_list = transitions_dict['actions'] 73 | reward_list = transitions_dict['rewards'] 74 | 75 | G = 0 # 记录该条链的return 76 | self.optimizer.zero_grad() # 优化器清0 77 | # 梯度上升最大化目标函数 78 | for i in reversed(range(len(reward_list))): 79 | # 获取每一步的reward, float 80 | reward = reward_list[i] 81 | # 获取每一步的状态 [n_states]-->[1,n_states] 82 | state = torch.tensor(state_list[i], dtype=torch.float).view(1,-1) 83 | # 获取每一步的动作 [1]-->[1,1] 84 | action = torch.tensor(action_list[i]).view(1,-1) 85 | # 当前状态下的各个动作价值函数 [1,2] 86 | q_value = self.policy_net(state) 87 | # 获取已action对应的概率 [1,1] 88 | log_prob = 
torch.log(q_value.gather(1, action)) 89 | # 计算当前状态的state_value = 及时奖励 + 下一时刻的state_value 90 | G = reward + self.gamma * G 91 | # 计算每一步的损失函数 92 | loss = -log_prob * G 93 | # 反向传播 94 | loss.backward() 95 | # 梯度下降 96 | self.optimizer.step() 97 | -------------------------------------------------------------------------------- /Model/3. Policy-Gradients/run_this.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from RL_brain import PolicyGradient 5 | 6 | # ------------------------------- # 7 | # 模型参数设置 8 | # ------------------------------- # 9 | 10 | n_hiddens = 16 # 隐含层个数 11 | learning_rate = 2e-3 # 学习率 12 | gamma = 0.9 # 折扣因子 13 | return_list = [] # 保存每回合的reward 14 | max_q_value = 0 # 初始的动作价值函数 15 | max_q_value_list = [] # 保存每一step的动作价值函数 16 | 17 | # ------------------------------- # 18 | #(1)加载环境 19 | # ------------------------------- # 20 | 21 | # 连续性动作 22 | env = gym.make("CartPole-v1", render_mode="human") 23 | n_states = env.observation_space.shape[0] # 状态数 4 24 | n_actions = env.action_space.n # 动作数 2 25 | 26 | # ------------------------------- # 27 | #(2)模型实例化 28 | # ------------------------------- # 29 | 30 | agent = PolicyGradient(n_states=n_states, # 4 31 | n_hiddens=n_hiddens, # 16 32 | n_actions=n_actions, # 2 33 | learning_rate=learning_rate, # 学习率 34 | gamma=gamma) # 折扣因子 35 | 36 | # ------------------------------- # 37 | #(3)训练 38 | # ------------------------------- # 39 | 40 | for i in range(100): # 训练10回合 41 | # 记录每个回合的return 42 | episode_return = 0 43 | # 存放状态 44 | transition_dict = { 45 | 'states': [], 46 | 'actions': [], 47 | 'next_states': [], 48 | 'rewards': [], 49 | 'dones': [], 50 | } 51 | # 获取初始状态 52 | state = env.reset()[0] 53 | # 结束的标记 54 | done = False 55 | 56 | # 开始迭代 57 | while not done: 58 | # 动作选择 59 | action = agent.take_action(state) # 对某一状态采取动作 60 | # 动作价值函数,曲线平滑 61 | max_q_value = agent.max_q_value(state) * 0.005 + max_q_value * 0.995 62 | # 保存每一step的动作价值函数 63 | max_q_value_list.append(max_q_value) 64 | # 环境更新 65 | next_state, reward, done, _, _ = env.step(action) 66 | # 保存每个回合的所有信息 67 | transition_dict['states'].append(state) 68 | transition_dict['actions'].append(action) 69 | transition_dict['next_states'].append(next_state) 70 | transition_dict['rewards'].append(reward) 71 | transition_dict['dones'].append(done) 72 | # 状态更新 73 | state = next_state 74 | # 记录每个回合的return 75 | episode_return += reward 76 | 77 | # 保存每个回合的return 78 | return_list.append(episode_return) 79 | # 一整个回合走完了再训练模型 80 | agent.learn(transition_dict) 81 | 82 | # 打印回合信息 83 | print(f'iter:{i}, return:{np.mean(return_list[-10:])}') 84 | 85 | # 关闭动画 86 | env.close() 87 | 88 | # -------------------------------------- # 89 | # 绘图 90 | # -------------------------------------- # 91 | 92 | plt.subplot(121) 93 | plt.plot(return_list) 94 | plt.title('return') 95 | plt.subplot(122) 96 | plt.plot(max_q_value_list) 97 | plt.title('max_q_value') 98 | plt.show() 99 | -------------------------------------------------------------------------------- /Model/4. 
Actor-Critic/RL_brain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import numpy as np 5 | 6 | # ------------------------------------ # 7 | # 策略梯度Actor,动作选择 8 | # ------------------------------------ # 9 | 10 | class PolicyNet(nn.Module): 11 | def __init__(self, n_states, n_hiddens, n_actions): 12 | super(PolicyNet, self).__init__() 13 | self.fc1 = nn.Linear(n_states, n_hiddens) 14 | self.fc2 = nn.Linear(n_hiddens, n_actions) 15 | # 前向传播 16 | def forward(self, x): 17 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 18 | x = F.relu(x) 19 | x = self.fc2(x) # [b,n_hiddens]-->[b,n_actions] 20 | # 每个状态对应的动作的概率 21 | x = F.softmax(x, dim=1) # [b,n_actions]-->[b,n_actions] 22 | return x 23 | 24 | # ------------------------------------ # 25 | # 值函数Critic,动作评估输出 shape=[b,1] 26 | # ------------------------------------ # 27 | 28 | class ValueNet(nn.Module): 29 | def __init__(self, n_states, n_hiddens): 30 | super(ValueNet, self).__init__() 31 | self.fc1 = nn.Linear(n_states, n_hiddens) 32 | self.fc2 = nn.Linear(n_hiddens, 1) 33 | # 前向传播 34 | def forward(self, x): 35 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 36 | x = F.relu(x) 37 | x = self.fc2(x) # [b,n_hiddens]-->[b,1] 38 | return x 39 | 40 | # ------------------------------------ # 41 | # Actor-Critic 42 | # ------------------------------------ # 43 | 44 | class ActorCritic: 45 | def __init__(self, n_states, n_hiddens, n_actions, 46 | actor_lr, critic_lr, gamma): 47 | # 属性分配 48 | self.gamma = gamma 49 | 50 | # 实例化策略网络 51 | self.actor = PolicyNet(n_states, n_hiddens, n_actions) 52 | # 实例化价值网络 53 | self.critic = ValueNet(n_states, n_hiddens) 54 | # 策略网络的优化器 55 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 56 | # 价值网络的优化器 57 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr) 58 | 59 | # 动作选择 60 | def take_action(self, state): 61 | # 维度变换numpy[n_states]-->[1,n_sates]-->tensor 62 | state = torch.tensor(state[np.newaxis, :]) 63 | # 动作价值函数,当前状态下各个动作的概率 64 | probs = self.actor(state) 65 | # 创建以probs为标准类型的数据分布 66 | action_dist = torch.distributions.Categorical(probs) 67 | # 随机选择一个动作 tensor-->int 68 | action = action_dist.sample().item() 69 | return action 70 | 71 | # 模型更新 72 | def update(self, transition_dict): 73 | # 训练集 74 | states = torch.tensor(transition_dict['states'], dtype=torch.float) 75 | actions = torch.tensor(transition_dict['actions']).view(-1,1) 76 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1) 77 | next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float) 78 | dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1) 79 | 80 | # 预测的当前时刻的state_value 81 | td_value = self.critic(states) 82 | # 目标的当前时刻的state_value 83 | td_target = rewards + self.gamma * self.critic(next_states) * (1-dones) 84 | # 时序差分的误差计算,目标的state_value与预测的state_value之差 85 | td_delta = td_target - td_value 86 | 87 | # 对每个状态对应的动作价值用log函数 88 | log_probs = torch.log(self.actor(states).gather(1, actions)) 89 | # 策略梯度损失 90 | actor_loss = torch.mean(-log_probs * td_delta.detach()) 91 | # 值函数损失,预测值和目标值之间 92 | critic_loss = torch.mean(F.mse_loss(self.critic(states), td_target.detach())) 93 | 94 | # 优化器梯度清0 95 | self.actor_optimizer.zero_grad() # 策略梯度网络的优化器 96 | self.critic_optimizer.zero_grad() # 价值网络的优化器 97 | # 反向传播 98 | actor_loss.backward() 99 | critic_loss.backward() 100 | # 参数更新 101 | self.actor_optimizer.step() 
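# Editor's note: td_delta is detached in actor_loss so the policy gradient does
# not back-propagate into the critic, and td_target is detached in critic_loss so
# the critic regresses toward a fixed TD target r + gamma * V(s') * (1 - done);
# both backward() calls are therefore issued before either optimizer step runs.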
102 | self.critic_optimizer.step() 103 | -------------------------------------------------------------------------------- /Model/4. Actor-Critic/run_this.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import gym 4 | import torch 5 | from RL_brain import ActorCritic 6 | 7 | # ----------------------------------------- # 8 | # 参数设置 9 | # ----------------------------------------- # 10 | 11 | num_episodes = 100 # 总迭代次数 12 | gamma = 0.9 # 折扣因子 13 | actor_lr = 1e-3 # 策略网络的学习率 14 | critic_lr = 1e-2 # 价值网络的学习率 15 | n_hiddens = 16 # 隐含层神经元个数 16 | env_name = 'CartPole-v1' 17 | return_list = [] # 保存每个回合的return 18 | 19 | # ----------------------------------------- # 20 | # 环境加载 21 | # ----------------------------------------- # 22 | 23 | env = gym.make(env_name, render_mode="human") 24 | n_states = env.observation_space.shape[0] # 状态数 4 25 | n_actions = env.action_space.n # 动作数 2 26 | 27 | # ----------------------------------------- # 28 | # 模型构建 29 | # ----------------------------------------- # 30 | 31 | agent = ActorCritic(n_states=n_states, # 状态数 32 | n_hiddens=n_hiddens, # 隐含层数 33 | n_actions=n_actions, # 动作数 34 | actor_lr=actor_lr, # 策略网络学习率 35 | critic_lr=critic_lr, # 价值网络学习率 36 | gamma=gamma) # 折扣因子 37 | 38 | # ----------------------------------------- # 39 | # 训练--回合更新 40 | # ----------------------------------------- # 41 | 42 | for i in range(num_episodes): 43 | 44 | state = env.reset()[0] # 环境重置 45 | done = False # 任务完成的标记 46 | episode_return = 0 # 累计每回合的reward 47 | 48 | # 构造数据集,保存每个回合的状态数据 49 | transition_dict = { 50 | 'states': [], 51 | 'actions': [], 52 | 'next_states': [], 53 | 'rewards': [], 54 | 'dones': [], 55 | } 56 | 57 | while not done: 58 | action = agent.take_action(state) # 动作选择 59 | next_state, reward, done, _, _ = env.step(action) # 环境更新 60 | # 保存每个时刻的状态\动作\... 61 | transition_dict['states'].append(state) 62 | transition_dict['actions'].append(action) 63 | transition_dict['next_states'].append(next_state) 64 | transition_dict['rewards'].append(reward) 65 | transition_dict['dones'].append(done) 66 | # 更新状态 67 | state = next_state 68 | # 累计回合奖励 69 | episode_return += reward 70 | 71 | # 保存每个回合的return 72 | return_list.append(episode_return) 73 | # 模型训练 74 | agent.update(transition_dict) 75 | 76 | # 打印回合信息 77 | print(f'iter:{i}, return:{np.mean(return_list[-10:])}') 78 | 79 | # -------------------------------------- # 80 | # 绘图 81 | # -------------------------------------- # 82 | 83 | plt.plot(return_list) 84 | plt.title('return') 85 | plt.show() 86 | -------------------------------------------------------------------------------- /Model/5. 
DDPG/RL_brain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import numpy as np 5 | import collections 6 | import random 7 | 8 | # ------------------------------------- # 9 | # 经验回放池 10 | # ------------------------------------- # 11 | 12 | class ReplayBuffer: 13 | def __init__(self, capacity): # 经验池的最大容量 14 | # 创建一个队列,先进先出 15 | self.buffer = collections.deque(maxlen=capacity) 16 | # 在队列中添加数据 17 | def add(self, state, action, reward, next_state, done): 18 | # 以list类型保存 19 | self.buffer.append((state, action, reward, next_state, done)) 20 | # 在队列中随机取样batch_size组数据 21 | def sample(self, batch_size): 22 | transitions = random.sample(self.buffer, batch_size) 23 | # 将数据集拆分开来 24 | state, action, reward, next_state, done = zip(*transitions) 25 | return np.array(state), action, reward, np.array(next_state), done 26 | # 测量当前时刻的队列长度 27 | def size(self): 28 | return len(self.buffer) 29 | 30 | # ------------------------------------- # 31 | # 策略网络 32 | # ------------------------------------- # 33 | 34 | class PolicyNet(nn.Module): 35 | def __init__(self, n_states, n_hiddens, n_actions, action_bound): 36 | super(PolicyNet, self).__init__() 37 | # 环境可以接受的动作最大值 38 | self.action_bound = action_bound 39 | # 只包含一个隐含层 40 | self.fc1 = nn.Linear(n_states, n_hiddens) 41 | self.fc2 = nn.Linear(n_hiddens, n_actions) 42 | # 前向传播 43 | def forward(self, x): 44 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 45 | x = F.relu(x) 46 | x = self.fc2(x) # [b,n_hiddens]-->[b,n_actions] 47 | x= torch.tanh(x) # 将数值调整到 [-1,1] 48 | x = x * self.action_bound # 缩放到 [-action_bound, action_bound] 49 | return x 50 | 51 | # ------------------------------------- # 52 | # 价值网络 53 | # ------------------------------------- # 54 | 55 | class QValueNet(nn.Module): 56 | def __init__(self, n_states, n_hiddens, n_actions): 57 | super(QValueNet, self).__init__() 58 | # 59 | self.fc1 = nn.Linear(n_states + n_actions, n_hiddens) 60 | self.fc2 = nn.Linear(n_hiddens, n_hiddens) 61 | self.fc3 = nn.Linear(n_hiddens, 1) 62 | # 前向传播 63 | def forward(self, x, a): 64 | # 拼接状态和动作 65 | cat = torch.cat([x, a], dim=1) # [b, n_states + n_actions] 66 | x = self.fc1(cat) # -->[b, n_hiddens] 67 | x = F.relu(x) 68 | x = self.fc2(x) # -->[b, n_hiddens] 69 | x = F.relu(x) 70 | x = self.fc3(x) # -->[b, 1] 71 | return x 72 | 73 | # ------------------------------------- # 74 | # 算法主体 75 | # ------------------------------------- # 76 | 77 | class DDPG: 78 | def __init__(self, n_states, n_hiddens, n_actions, action_bound, 79 | sigma, actor_lr, critic_lr, tau, gamma, device): 80 | 81 | # 策略网络--训练 82 | self.actor = PolicyNet(n_states, n_hiddens, n_actions, action_bound).to(device) 83 | # 价值网络--训练 84 | self.critic = QValueNet(n_states, n_hiddens, n_actions).to(device) 85 | # 策略网络--目标 86 | self.target_actor = PolicyNet(n_states, n_hiddens, n_actions, action_bound).to(device) 87 | # 价值网络--目标 88 | self.target_critic = QValueNet(n_states, n_hiddens, n_actions).to(device 89 | ) 90 | # 初始化价值网络的参数,两个价值网络的参数相同 91 | self.target_critic.load_state_dict(self.critic.state_dict()) 92 | # 初始化策略网络的参数,两个策略网络的参数相同 93 | self.target_actor.load_state_dict(self.actor.state_dict()) 94 | 95 | # 策略网络的优化器 96 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 97 | # 价值网络的优化器 98 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr) 99 | 100 | # 属性分配 101 | self.gamma = gamma # 折扣因子 102 | self.sigma = sigma # 
高斯噪声的标准差,均值设为0 103 | self.tau = tau # 目标网络的软更新参数 104 | self.n_actions = n_actions 105 | self.device = device 106 | 107 | # 动作选择 108 | def take_action(self, state): 109 | # 维度变换 list[n_states]-->tensor[1,n_states]-->gpu 110 | state = torch.tensor(state, dtype=torch.float).view(1,-1).to(self.device) 111 | # 策略网络计算出当前状态下的动作价值 [1,n_states]-->[1,1]-->int 112 | action = self.actor(state).item() 113 | # 给动作添加噪声,增加搜索 114 | action = action + self.sigma * np.random.randn(self.n_actions) 115 | return action 116 | 117 | # 软更新, 意思是每次learn的时候更新部分参数 118 | def soft_update(self, net, target_net): 119 | # 获取训练网络和目标网络需要更新的参数 120 | for param_target, param in zip(target_net.parameters(), net.parameters()): 121 | # 训练网络的参数更新要综合考虑目标网络和训练网络 122 | param_target.data.copy_(param_target.data*(1-self.tau) + param.data*self.tau) 123 | 124 | # 训练 125 | def update(self, transition_dict): 126 | # 从训练集中取出数据 127 | states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device) # [b,n_states] 128 | actions = torch.tensor(transition_dict['actions'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 129 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 130 | next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device) # [b,next_states] 131 | dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 132 | 133 | # 价值目标网络获取下一时刻的每个动作价值[b,n_states]-->[b,n_actors] 134 | next_q_values = self.target_actor(next_states) 135 | # 策略目标网络获取下一时刻状态选出的动作价值 [b,n_states+n_actions]-->[b,1] 136 | next_q_values = self.target_critic(next_states, next_q_values) 137 | # 当前时刻的动作价值的目标值 [b,1] 138 | q_targets = rewards + self.gamma * next_q_values * (1-dones) 139 | 140 | # 当前时刻动作价值的预测值 [b,n_states+n_actions]-->[b,1] 141 | q_values = self.critic(states, actions) 142 | 143 | # 预测值和目标值之间的均方差损失 144 | critic_loss = torch.mean(F.mse_loss(q_values, q_targets)) 145 | # 价值网络梯度 146 | self.critic_optimizer.zero_grad() 147 | critic_loss.backward() 148 | self.critic_optimizer.step() 149 | 150 | # 当前状态的每个动作的价值 [b, n_actions] 151 | actor_q_values = self.actor(states) 152 | # 当前状态选出的动作价值 [b,1] 153 | score = self.critic(states, actor_q_values) 154 | # 计算损失 155 | actor_loss = -torch.mean(score) 156 | # 策略网络梯度 157 | self.actor_optimizer.zero_grad() 158 | actor_loss.backward() 159 | self.actor_optimizer.step() 160 | 161 | # 软更新策略网络的参数 162 | self.soft_update(self.actor, self.target_actor) 163 | # 软更新价值网络的参数 164 | self.soft_update(self.critic, self.target_critic) 165 | -------------------------------------------------------------------------------- /Model/5. 
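# ------------------------------------------------------------------ #
# Editor's note on the DDPG file above: the target actor/critic are
# updated with the Polyak (soft) rule theta_target <- tau * theta +
# (1 - tau) * theta_target after every learning step, while exploration
# comes from adding Gaussian noise sigma * N(0, 1) to the deterministic
# action. A minimal usage sketch (hypothetical numbers, CPU device):
#
#   agent = DDPG(n_states=2, n_hiddens=64, n_actions=1, action_bound=1.0,
#                sigma=0.01, actor_lr=3e-4, critic_lr=3e-3,
#                tau=0.005, gamma=0.98, device=torch.device('cpu'))
#   a = agent.take_action([0.0, 0.0])   # deterministic action + noise
# ------------------------------------------------------------------ #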
DDPG/parsers.py: -------------------------------------------------------------------------------- 1 | # 参数定义 2 | import argparse # 参数设置 3 | 4 | # 创建解释器 5 | parser = argparse.ArgumentParser() 6 | 7 | # 参数定义 8 | parser.add_argument('--actor_lr', type=float, default=3e-4, help='策略网络的学习率') 9 | parser.add_argument('--critic_lr', type=float, default=3e-3, help='价值网络的学习率') 10 | parser.add_argument('--n_hiddens', type=int, default=64, help='隐含层神经元个数') 11 | parser.add_argument('--gamma', type=float, default=0.98, help='折扣因子') 12 | parser.add_argument('--tau', type=float, default=0.005, help='软更新系数') 13 | parser.add_argument('--buffer_size', type=int, default=1000, help='经验池容量') 14 | parser.add_argument('--min_size', type=int, default=200, help='经验池超过200再训练') 15 | parser.add_argument('--batch_size', type=int, default=64, help='每次训练64组样本') 16 | parser.add_argument('--sigma', type=int, default=0.01, help='高斯噪声标准差') 17 | 18 | # 参数解析 19 | args=parser.parse_args() 20 | -------------------------------------------------------------------------------- /Model/5. DDPG/run_this.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import matplotlib.pyplot as plt 4 | import gym 5 | from parsers import args 6 | from RL_brain import ReplayBuffer, DDPG 7 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 8 | 9 | # -------------------------------------- # 10 | # 环境加载 11 | # -------------------------------------- # 12 | 13 | env_name = "MountainCarContinuous-v0" # 连续型动作 14 | env = gym.make(env_name, render_mode="human") 15 | n_states = env.observation_space.shape[0] # 状态数 2 16 | n_actions = env.action_space.shape[0] # 动作数 1 17 | action_bound = env.action_space.high[0] # 动作的最大值 1.0 18 | 19 | 20 | # -------------------------------------- # 21 | # 模型构建 22 | # -------------------------------------- # 23 | 24 | # 经验回放池实例化 25 | replay_buffer = ReplayBuffer(capacity=args.buffer_size) 26 | # 模型实例化 27 | agent = DDPG(n_states = n_states, # 状态数 28 | n_hiddens = args.n_hiddens, # 隐含层数 29 | n_actions = n_actions, # 动作数 30 | action_bound = action_bound, # 动作最大值 31 | sigma = args.sigma, # 高斯噪声 32 | actor_lr = args.actor_lr, # 策略网络学习率 33 | critic_lr = args.critic_lr, # 价值网络学习率 34 | tau = args.tau, # 软更新系数 35 | gamma = args.gamma, # 折扣因子 36 | device = device 37 | ) 38 | 39 | # -------------------------------------- # 40 | # 模型训练 41 | # -------------------------------------- # 42 | 43 | return_list = [] # 记录每个回合的return 44 | mean_return_list = [] # 记录每个回合的return均值 45 | 46 | for i in range(10): # 迭代10回合 47 | episode_return = 0 # 累计每条链上的reward 48 | state = env.reset()[0] # 初始时的状态 49 | done = False # 回合结束标记 50 | 51 | while not done: 52 | # 获取当前状态对应的动作 53 | action = agent.take_action(state) 54 | # 环境更新 55 | next_state, reward, done, _, _ = env.step(action) 56 | # 更新经验回放池 57 | replay_buffer.add(state, action, reward, next_state, done) 58 | # 状态更新 59 | state = next_state 60 | # 累计每一步的reward 61 | episode_return += reward 62 | 63 | # 如果经验池超过容量,开始训练 64 | if replay_buffer.size() > args.min_size: 65 | # 经验池随机采样batch_size组 66 | s, a, r, ns, d = replay_buffer.sample(args.batch_size) 67 | # 构造数据集 68 | transition_dict = { 69 | 'states': s, 70 | 'actions': a, 71 | 'rewards': r, 72 | 'next_states': ns, 73 | 'dones': d, 74 | } 75 | # 模型训练 76 | agent.update(transition_dict) 77 | 78 | # 保存每一个回合的回报 79 | return_list.append(episode_return) 80 | mean_return_list.append(np.mean(return_list[-10:])) # 平滑 81 | 82 | # 打印回合信息 83 | print(f'iter:{i}, 
return:{episode_return}, mean_return:{np.mean(return_list[-10:])}') 84 | 85 | # 关闭动画窗格 86 | env.close() 87 | 88 | # -------------------------------------- # 89 | # 绘图 90 | # -------------------------------------- # 91 | 92 | x_range = list(range(len(return_list))) 93 | 94 | plt.subplot(121) 95 | plt.plot(x_range, return_list) # 每个回合return 96 | plt.xlabel('episode') 97 | plt.ylabel('return') 98 | plt.subplot(122) 99 | plt.plot(x_range, mean_return_list) # 每回合return均值 100 | plt.xlabel('episode') 101 | plt.ylabel('mean_return') 102 | -------------------------------------------------------------------------------- /Model/6. PPO_Discrete/RL_brain.py: -------------------------------------------------------------------------------- 1 | # 代码用于离散环境的模型 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | # ----------------------------------- # 8 | # 构建策略网络--actor 9 | # ----------------------------------- # 10 | 11 | class PolicyNet(nn.Module): 12 | def __init__(self, n_states, n_hiddens, n_actions): 13 | super(PolicyNet, self).__init__() 14 | self.fc1 = nn.Linear(n_states, n_hiddens) 15 | self.fc2 = nn.Linear(n_hiddens, n_actions) 16 | def forward(self, x): 17 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 18 | x = F.relu(x) 19 | x = self.fc2(x) # [b, n_actions] 20 | x = F.softmax(x, dim=1) # [b, n_actions] 计算每个动作的概率 21 | return x 22 | 23 | # ----------------------------------- # 24 | # 构建价值网络--critic 25 | # ----------------------------------- # 26 | 27 | class ValueNet(nn.Module): 28 | def __init__(self, n_states, n_hiddens): 29 | super(ValueNet, self).__init__() 30 | self.fc1 = nn.Linear(n_states, n_hiddens) 31 | self.fc2 = nn.Linear(n_hiddens, 1) 32 | def forward(self, x): 33 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 34 | x = F.relu(x) 35 | x = self.fc2(x) # [b,n_hiddens]-->[b,1] 评价当前的状态价值state_value 36 | return x 37 | 38 | # ----------------------------------- # 39 | # 构建模型 40 | # ----------------------------------- # 41 | 42 | class PPO: 43 | def __init__(self, n_states, n_hiddens, n_actions, 44 | actor_lr, critic_lr, lmbda, epochs, eps, gamma, device): 45 | # 实例化策略网络 46 | self.actor = PolicyNet(n_states, n_hiddens, n_actions).to(device) 47 | # 实例化价值网络 48 | self.critic = ValueNet(n_states, n_hiddens).to(device) 49 | # 策略网络的优化器 50 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 51 | # 价值网络的优化器 52 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr = critic_lr) 53 | 54 | self.gamma = gamma # 折扣因子 55 | self.lmbda = lmbda # GAE优势函数的缩放系数 56 | self.epochs = epochs # 一条序列的数据用来训练轮数 57 | self.eps = eps # PPO中截断范围的参数 58 | self.device = device 59 | 60 | # 动作选择 61 | def take_action(self, state): 62 | # 维度变换 [n_state]-->tensor[1,n_states] 63 | state = torch.tensor(state[np.newaxis, :]).to(self.device) 64 | # 当前状态下,每个动作的概率分布 [1,n_states] 65 | probs = self.actor(state) 66 | # 创建以probs为标准的概率分布 67 | action_list = torch.distributions.Categorical(probs) 68 | # 依据其概率随机挑选一个动作 69 | action = action_list.sample().item() 70 | return action 71 | 72 | # 训练 73 | def learn(self, transition_dict): 74 | # 提取数据集 75 | states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device) 76 | actions = torch.tensor(transition_dict['actions']).to(self.device).view(-1,1) 77 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).to(self.device).view(-1,1) 78 | next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device) 79 | dones = 
torch.tensor(transition_dict['dones'], dtype=torch.float).to(self.device).view(-1,1) 80 | 81 | # 目标,下一个状态的state_value [b,1] 82 | next_q_target = self.critic(next_states) 83 | # 目标,当前状态的state_value [b,1] 84 | td_target = rewards + self.gamma * next_q_target * (1-dones) 85 | # 预测,当前状态的state_value [b,1] 86 | td_value = self.critic(states) 87 | # 目标值和预测值state_value之差 [b,1] 88 | td_delta = td_target - td_value 89 | 90 | # 时序差分值 tensor-->numpy [b,1] 91 | td_delta = td_delta.cpu().detach().numpy() 92 | advantage = 0 # 优势函数初始化 93 | advantage_list = [] 94 | 95 | # 计算优势函数 96 | for delta in td_delta[::-1]: # 逆序时序差分值 axis=1轴上倒着取 [], [], [] 97 | # 优势函数GAE的公式 98 | advantage = self.gamma * self.lmbda * advantage + delta 99 | advantage_list.append(advantage) 100 | # 正序 101 | advantage_list.reverse() 102 | # numpy --> tensor [b,1] 103 | advantage = torch.tensor(advantage_list, dtype=torch.float).to(self.device) 104 | 105 | # 策略网络给出每个动作的概率,根据action得到当前时刻下该动作的概率 106 | old_log_probs = torch.log(self.actor(states).gather(1, actions)).detach() 107 | 108 | # 一组数据训练 epochs 轮 109 | for _ in range(self.epochs): 110 | # 每一轮更新一次策略网络预测的状态 111 | log_probs = torch.log(self.actor(states).gather(1, actions)) 112 | # 新旧策略之间的比例 113 | ratio = torch.exp(log_probs - old_log_probs) 114 | # 近端策略优化裁剪目标函数公式的左侧项 115 | surr1 = ratio * advantage 116 | # 公式的右侧项,ratio小于1-eps就输出1-eps,大于1+eps就输出1+eps 117 | surr2 = torch.clamp(ratio, 1-self.eps, 1+self.eps) * advantage 118 | 119 | # 策略网络的损失函数 120 | actor_loss = torch.mean(-torch.min(surr1, surr2)) 121 | # 价值网络的损失函数,当前时刻的state_value - 下一时刻的state_value 122 | critic_loss = torch.mean(F.mse_loss(self.critic(states), td_target.detach())) 123 | 124 | # 梯度清0 125 | self.actor_optimizer.zero_grad() 126 | self.critic_optimizer.zero_grad() 127 | # 反向传播 128 | actor_loss.backward() 129 | critic_loss.backward() 130 | # 梯度更新 131 | self.actor_optimizer.step() 132 | self.critic_optimizer.step() 133 | -------------------------------------------------------------------------------- /Model/6. 
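# ------------------------------------------------------------------ #
# Editor's note on the PPO file above: the update implements the
# clipped surrogate objective
#   L(theta) = E[ min( r_t(theta) * A_t,
#                      clip(r_t(theta), 1 - eps, 1 + eps) * A_t ) ],
# where r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t) and the
# advantage A_t is the GAE estimate accumulated from the TD errors
# delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t).
# ------------------------------------------------------------------ #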
PPO_Discrete/run_this.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import gym 4 | import torch 5 | from RL_brain import PPO 6 | 7 | device = torch.device('cuda') if torch.cuda.is_available() \ 8 | else torch.device('cpu') 9 | 10 | # ----------------------------------------- # 11 | # 参数设置 12 | # ----------------------------------------- # 13 | 14 | num_episodes = 100 # 总迭代次数 15 | gamma = 0.9 # 折扣因子 16 | actor_lr = 1e-3 # 策略网络的学习率 17 | critic_lr = 1e-2 # 价值网络的学习率 18 | n_hiddens = 16 # 隐含层神经元个数 19 | env_name = 'CartPole-v1' 20 | return_list = [] # 保存每个回合的return 21 | 22 | # ----------------------------------------- # 23 | # 环境加载 24 | # ----------------------------------------- # 25 | 26 | env = gym.make(env_name, render_mode="human") 27 | n_states = env.observation_space.shape[0] # 状态数 4 28 | n_actions = env.action_space.n # 动作数 2 29 | 30 | # ----------------------------------------- # 31 | # 模型构建 32 | # ----------------------------------------- # 33 | 34 | agent = PPO(n_states=n_states, # 状态数 35 | n_hiddens=n_hiddens, # 隐含层数 36 | n_actions=n_actions, # 动作数 37 | actor_lr=actor_lr, # 策略网络学习率 38 | critic_lr=critic_lr, # 价值网络学习率 39 | lmbda = 0.95, # 优势函数的缩放因子 40 | epochs = 10, # 一组序列训练的轮次 41 | eps = 0.2, # PPO中截断范围的参数 42 | gamma=gamma, # 折扣因子 43 | device = device 44 | ) 45 | 46 | # ----------------------------------------- # 47 | # 训练--回合更新 on_policy 48 | # ----------------------------------------- # 49 | 50 | for i in range(num_episodes): 51 | 52 | state = env.reset()[0] # 环境重置 53 | done = False # 任务完成的标记 54 | episode_return = 0 # 累计每回合的reward 55 | 56 | # 构造数据集,保存每个回合的状态数据 57 | transition_dict = { 58 | 'states': [], 59 | 'actions': [], 60 | 'next_states': [], 61 | 'rewards': [], 62 | 'dones': [], 63 | } 64 | 65 | while not done: 66 | action = agent.take_action(state) # 动作选择 67 | next_state, reward, done, _, _ = env.step(action) # 环境更新 68 | # 保存每个时刻的状态\动作\... 69 | transition_dict['states'].append(state) 70 | transition_dict['actions'].append(action) 71 | transition_dict['next_states'].append(next_state) 72 | transition_dict['rewards'].append(reward) 73 | transition_dict['dones'].append(done) 74 | # 更新状态 75 | state = next_state 76 | # 累计回合奖励 77 | episode_return += reward 78 | 79 | # 保存每个回合的return 80 | return_list.append(episode_return) 81 | # 模型训练 82 | agent.learn(transition_dict) 83 | 84 | # 打印回合信息 85 | print(f'iter:{i}, return:{np.mean(return_list[-10:])}') 86 | 87 | # -------------------------------------- # 88 | # 绘图 89 | # -------------------------------------- # 90 | 91 | plt.plot(return_list) 92 | plt.title('return') 93 | plt.show() 94 | -------------------------------------------------------------------------------- /Model/7. 
PPO_Continuous/RL_brain.py: -------------------------------------------------------------------------------- 1 | # 用于连续动作的PPO 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | # ------------------------------------- # 8 | # 策略网络--输出连续动作的高斯分布的均值和标准差 9 | # ------------------------------------- # 10 | 11 | class PolicyNet(nn.Module): 12 | def __init__(self, n_states, n_hiddens, n_actions): 13 | super(PolicyNet, self).__init__() 14 | self.fc1 = nn.Linear(n_states, n_hiddens) 15 | self.fc_mu = nn.Linear(n_hiddens, n_actions) 16 | self.fc_std = nn.Linear(n_hiddens, n_actions) 17 | # 前向传播 18 | def forward(self, x): # 19 | x = self.fc1(x) # [b, n_states] --> [b, n_hiddens] 20 | x = F.relu(x) 21 | mu = self.fc_mu(x) # [b, n_hiddens] --> [b, n_actions] 22 | mu = 2 * torch.tanh(mu) # 值域 [-2,2] 23 | std = self.fc_std(x) # [b, n_hiddens] --> [b, n_actions] 24 | std = F.softplus(std) # 值域 小于0的部分逼近0,大于0的部分几乎不变 25 | return mu, std 26 | 27 | # ------------------------------------- # 28 | # 价值网络 -- 评估当前状态的价值 29 | # ------------------------------------- # 30 | 31 | class ValueNet(nn.Module): 32 | def __init__(self, n_states, n_hiddens): 33 | super(ValueNet, self).__init__() 34 | self.fc1 = nn.Linear(n_states, n_hiddens) 35 | self.fc2 = nn.Linear(n_hiddens, 1) 36 | # 前向传播 37 | def forward(self, x): 38 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 39 | x = F.relu(x) 40 | x = self.fc2(x) # [b,n_hiddens]-->[b,1] 41 | return x 42 | 43 | # ------------------------------------- # 44 | # 模型构建--处理连续动作 45 | # ------------------------------------- # 46 | 47 | class PPO: 48 | def __init__(self, n_states, n_hiddens, n_actions, 49 | actor_lr, critic_lr, 50 | lmbda, epochs, eps, gamma, device): 51 | # 实例化策略网络 52 | self.actor = PolicyNet(n_states, n_hiddens, n_actions).to(device) 53 | # 实例化价值网络 54 | self.critic = ValueNet(n_states, n_hiddens).to(device) 55 | # 策略网络的优化器 56 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 57 | # 价值网络的优化器 58 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr) 59 | 60 | # 属性分配 61 | self.lmbda = lmbda # GAE优势函数的缩放因子 62 | self.epochs = epochs # 一条序列的数据用来训练多少轮 63 | self.eps = eps # 截断范围 64 | self.gamma = gamma # 折扣系数 65 | self.device = device 66 | 67 | # 动作选择 68 | def take_action(self, state): # 输入当前时刻的状态 69 | # [n_states]-->[1,n_states]-->tensor 70 | state = torch.tensor(state[np.newaxis, :]).to(self.device) 71 | # 预测当前状态的动作,输出动作概率的高斯分布 72 | mu, std = self.actor(state) 73 | # 构造高斯分布 74 | action_dict = torch.distributions.Normal(mu, std) 75 | # 随机选择动作 76 | action = action_dict.sample().item() 77 | return [action] # 返回动作值 78 | 79 | # 训练 80 | def update(self, transition_dict): 81 | # 提取数据集 82 | states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device) # [b,n_states] 83 | actions = torch.tensor(transition_dict['actions'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 84 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 85 | next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device) # [b,n_states] 86 | dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 87 | 88 | # 价值网络--目标,获取下一时刻的state_value [b,n_states]-->[b,1] 89 | next_states_target = self.critic(next_states) 90 | # 价值网络--目标,当前时刻的state_value [b,1] 91 | td_target = rewards + self.gamma * next_states_target * (1-dones) 92 | # 价值网络--预测,当前时刻的state_value 
[b,n_states]-->[b,1] 93 | td_value = self.critic(states) 94 | # 时序差分,目标值-预测值 # [b,1] 95 | td_delta = td_target - td_value 96 | 97 | # 对时序差分结果计算GAE优势函数 98 | td_delta = td_delta.cpu().detach().numpy() # [b,1] 99 | advantage_list = [] # 保存每个时刻的优势函数 100 | advantage = 0 # 优势函数初始值 101 | # 逆序遍历时序差分结果,把最后一时刻的放前面 102 | for delta in td_delta[::-1]: 103 | advantage = self.gamma * self.lmbda * advantage + delta 104 | advantage_list.append(advantage) 105 | # 正序排列优势函数 106 | advantage_list.reverse() 107 | # numpy --> tensor 108 | advantage = torch.tensor(advantage_list, dtype=torch.float).to(self.device) 109 | 110 | # 策略网络--预测,当前状态选择的动作的高斯分布 111 | mu, std = self.actor(states) # [b,1] 112 | # 基于均值和标准差构造正态分布 113 | action_dists = torch.distributions.Normal(mu.detach(), std.detach()) 114 | # 从正态分布中选择动作,并使用log函数 115 | old_log_prob = action_dists.log_prob(actions) 116 | 117 | # 一个序列训练epochs次 118 | for _ in range(self.epochs): 119 | # 预测当前状态下的动作 120 | mu, std = self.actor(states) 121 | # 构造正态分布 122 | action_dists = torch.distributions.Normal(mu, std) 123 | # 当前策略在 t 时刻智能体处于状态 s 所采取的行为概率 124 | log_prob = action_dists.log_prob(actions) 125 | # 计算概率的比值来控制新策略更新幅度 126 | ratio = torch.exp(log_prob - old_log_prob) 127 | 128 | # 公式的左侧项 129 | surr1 = ratio * advantage 130 | # 公式的右侧项,截断 131 | surr2 = torch.clamp(ratio, 1-self.eps, 1+self.eps) * advantage 132 | 133 | # 策略网络的损失PPO-clip 134 | actor_loss = torch.mean(-torch.min(surr1,surr2)) 135 | # 价值网络的当前时刻预测值,与目标价值网络当前时刻的state_value之差 136 | critic_loss = torch.mean(F.mse_loss(self.critic(states), td_target.detach())) 137 | 138 | # 优化器清0 139 | self.actor_optimizer.zero_grad() 140 | self.critic_optimizer.zero_grad() 141 | # 梯度反传 142 | actor_loss.backward() 143 | critic_loss.backward() 144 | # 参数更新 145 | self.actor_optimizer.step() 146 | self.critic_optimizer.step() 147 | --------------------------------------------------------------------------------
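# ------------------------------------------------------------------ #
# Editor's note on the continuous-action PPO file above: the actor
# outputs the mean (squashed to [-2, 2] with tanh, matching Pendulum's
# torque range) and the standard deviation (kept positive via softplus)
# of a Gaussian policy; actions are sampled from Normal(mu, std), and
# old_log_prob is computed with mu/std detached so the pre-update
# policy is treated as a fixed reference throughout the epochs loop.
# ------------------------------------------------------------------ #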
/Model/7. PPO_Continuous/run_this.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import gym 4 | import torch 5 | from RL_brain import PPO 6 | 7 | device = torch.device('cuda') if torch.cuda.is_available() \ 8 | else torch.device('cpu') 9 | 10 | # ----------------------------------------- # 11 | # 参数设置 12 | # ----------------------------------------- # 13 | 14 | num_episodes = 1 # 总迭代次数 15 | gamma = 0.9 # 折扣因子 16 | actor_lr = 1e-3 # 策略网络的学习率 17 | critic_lr = 1e-2 # 价值网络的学习率 18 | n_hiddens = 16 # 隐含层神经元个数 19 | env_name = 'Pendulum-v1' # 连续环境 20 | return_list = [] # 保存每个回合的return 21 | 22 | # ----------------------------------------- # 23 | # 环境加载 24 | # ----------------------------------------- # 25 | 26 | env = gym.make(env_name, render_mode="human") 27 | n_states = env.observation_space.shape[0] # 状态数 3 28 | n_actions = env.action_space.shape[0] # 动作数 1 29 | 30 | # ----------------------------------------- # 31 | # 模型构建 32 | # ----------------------------------------- # 33 | 34 | agent = PPO(n_states=n_states, # 状态数3 35 | n_hiddens=n_hiddens, # 隐含层数 36 | n_actions=n_actions, # 动作数1 37 | actor_lr=actor_lr, # 策略网络学习率 38 | critic_lr=critic_lr, # 价值网络学习率 39 | lmbda = 0.95, # 优势函数的缩放因子 40 | epochs = 10, # 一组序列训练的轮次 41 | eps = 0.2, # PPO中截断范围的参数 42 | gamma=gamma, # 折扣因子 43 | device = device 44 | ) 45 | 46 | # ----------------------------------------- # 47 | # 训练--回合更新 on_policy 48 | # ----------------------------------------- # 49 | 50 | for i in range(num_episodes): 51 | 52 | state = env.reset()[0] # 环境重置 53 | done = False # 任务完成的标记 54 | episode_return = 0 # 累计每回合的reward 55 | 56 | # 构造数据集,保存每个回合的状态数据 57 | transition_dict = { 58 | 'states': [], 59 | 'actions': [], 60 | 'next_states': [], 61 | 'rewards': [], 62 | 'dones': [], 63 | } 64 | 65 | while not done: 66 | action = agent.take_action(state) # 动作选择 67 | next_state, reward, done, truncated, _ = env.step(action); done = done or truncated # 环境更新,Pendulum-v1只会因时间截断而结束 68 | # 保存每个时刻的状态\动作\... 69 | transition_dict['states'].append(state) 70 | transition_dict['actions'].append(action) 71 | transition_dict['next_states'].append(next_state) 72 | transition_dict['rewards'].append(reward) 73 | transition_dict['dones'].append(done) 74 | # 更新状态 75 | state = next_state 76 | # 累计回合奖励 77 | episode_return += reward 78 | 79 | # 保存每个回合的return 80 | return_list.append(episode_return) 81 | # 模型训练 82 | agent.update(transition_dict) 83 | 84 | # 打印回合信息 85 | print(f'iter:{i}, return:{np.mean(return_list[-10:])}') 86 | 87 | # -------------------------------------- # 88 | # 绘图 89 | # -------------------------------------- # 90 | 91 | plt.plot(return_list) 92 | plt.title('return') 93 | plt.show() 94 | -------------------------------------------------------------------------------- /Model/8. 
SAC_Discrete/RL_brain.py: -------------------------------------------------------------------------------- 1 | # 处理离散问题的模型 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | import numpy as np 6 | import collections 7 | import random 8 | 9 | # ----------------------------------------- # 10 | # 经验回放池 11 | # ----------------------------------------- # 12 | 13 | class ReplayBuffer: 14 | def __init__(self, capacity): # 经验池容量 15 | self.buffer = collections.deque(maxlen=capacity) # 队列,先进先出 16 | # 经验池增加 17 | def add(self, state, action, reward, next_state, done): 18 | self.buffer.append((state, action, reward, next_state, done)) 19 | # 随机采样batch组 20 | def sample(self, batch_size): 21 | transitions = random.sample(self.buffer, batch_size) 22 | # 取出这batch组数据 23 | state, action, reward, next_state, done = zip(*transitions) 24 | return np.array(state), action, reward, np.array(next_state), done 25 | # 当前时刻的经验池容量 26 | def size(self): 27 | return len(self.buffer) 28 | 29 | # ----------------------------------------- # 30 | # 策略网络 31 | # ----------------------------------------- # 32 | 33 | class PolicyNet(nn.Module): 34 | def __init__(self, n_states, n_hiddens, n_actions): 35 | super(PolicyNet, self).__init__() 36 | self.fc1 = nn.Linear(n_states, n_hiddens) 37 | self.fc2 = nn.Linear(n_hiddens, n_actions) 38 | # 前向传播 39 | def forward(self, x): # 获取当前状态下的动作选择概率 40 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 41 | x = F.relu(x) 42 | x = self.fc2(x) # [b,n_hiddens]-->[b,n_actions] 43 | # 每个状态下对应的每个动作的动作概率 44 | x = F.softmax(x, dim=1) # [b,n_actions] 45 | return x 46 | 47 | # ----------------------------------------- # 48 | # 价值网络 49 | # ----------------------------------------- # 50 | 51 | class ValueNet(nn.Module): 52 | def __init__(self, n_states, n_hiddens, n_actions): 53 | super(ValueNet, self).__init__() 54 | self.fc1 = nn.Linear(n_states, n_hiddens) 55 | self.fc2 = nn.Linear(n_hiddens, n_actions) 56 | # 当前时刻的state_value 57 | def forward(self, x): 58 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 59 | x = F.relu(x) 60 | x = self.fc2(x) # [b,n_hiddens]-->[b,n_actions] 61 | return x 62 | 63 | # ----------------------------------------- # 64 | # 模型构建 65 | # ----------------------------------------- # 66 | 67 | class SAC: 68 | def __init__(self, n_states, n_hiddens, n_actions, 69 | actor_lr, critic_lr, alpha_lr, 70 | target_entropy, tau, gamma, device): 71 | 72 | # 实例化策略网络 73 | self.actor = PolicyNet(n_states, n_hiddens, n_actions).to(device) 74 | # 实例化第一个价值网络--预测 75 | self.critic_1 = ValueNet(n_states, n_hiddens, n_actions).to(device) 76 | # 实例化第二个价值网络--预测 77 | self.critic_2 = ValueNet(n_states, n_hiddens, n_actions).to(device) 78 | # 实例化价值网络1--目标 79 | self.target_critic_1 = ValueNet(n_states, n_hiddens, n_actions).to(device) 80 | # 实例化价值网络2--目标 81 | self.target_critic_2 = ValueNet(n_states, n_hiddens, n_actions).to(device) 82 | 83 | # 预测和目标的价值网络的参数初始化一样 84 | self.target_critic_1.load_state_dict(self.critic_1.state_dict()) 85 | self.target_critic_2.load_state_dict(self.critic_2.state_dict()) 86 | 87 | # 策略网络的优化器 88 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 89 | # 目标网络的优化器 90 | self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(), lr=critic_lr) 91 | self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(), lr=critic_lr) 92 | 93 | # 初始化可训练参数alpha 94 | self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float) 95 | # alpha可以训练求梯度 96 | self.log_alpha.requires_grad = True 97 | # 定义alpha的优化器 98 | 
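# Editor's note: the entropy temperature alpha is optimized through its logarithm
# (alpha = exp(log_alpha)) so that it always stays positive; alpha_loss below
# pushes the policy entropy toward target_entropy, and the two critics are
# soft-updated with coefficient tau, mirroring the DDPG-style target networks.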
self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) 99 | 100 | # 属性分配 101 | self.target_entropy = target_entropy 102 | self.gamma = gamma 103 | self.tau = tau 104 | self.device = device 105 | 106 | # 动作选择 107 | def take_action(self, state): # 输入当前状态 [n_states] 108 | # 维度变换 numpy[n_states]-->tensor[1,n_states] 109 | state = torch.tensor(state[np.newaxis,:], dtype=torch.float).to(self.device) 110 | # 预测当前状态下每个动作的概率 [1,n_actions] 111 | probs = self.actor(state) 112 | # 构造与输出动作概率相同的概率分布 113 | action_dist = torch.distributions.Categorical(probs) 114 | # 从当前概率分布中随机采样tensor-->int 115 | action = action_dist.sample().item() 116 | return action 117 | 118 | # 计算目标,当前状态下的state_value 119 | def calc_target(self, rewards, next_states, dones): 120 | # 策略网络预测下一时刻的state_value [b,n_states]-->[b,n_actions] 121 | next_probs = self.actor(next_states) 122 | # 对每个动作的概率计算ln [b,n_actions] 123 | next_log_probs = torch.log(next_probs + 1e-8) 124 | # 计算熵 [b,1] 125 | entropy = -torch.sum(next_probs * next_log_probs, dim=1, keepdims=True) 126 | # 目标价值网络,下一时刻的state_value [b,n_actions] 127 | q1_value = self.target_critic_1(next_states) 128 | q2_value = self.target_critic_2(next_states) 129 | # 取出最小的q值 [b, 1] 130 | min_qvalue = torch.sum(next_probs * torch.min(q1_value,q2_value), dim=1, keepdims=True) 131 | # 下个时刻的state_value [b, 1] 132 | next_value = min_qvalue + self.log_alpha.exp() * entropy 133 | 134 | # 时序差分,目标网络输出当前时刻的state_value [b, n_actions] 135 | td_target = rewards + self.gamma * next_value * (1-dones) 136 | return td_target 137 | 138 | # 软更新,每次训练更新部分参数 139 | def soft_update(self, net, target_net): 140 | # 遍历预测网络和目标网络的参数 141 | for param_target, param in zip(target_net.parameters(), net.parameters()): 142 | # 预测网络的参数赋给目标网络 143 | param_target.data.copy_(param_target.data*(1-self.tau) + param.data*self.tau) 144 | 145 | # 模型训练 146 | def update(self, transition_dict): 147 | # 提取数据集 148 | states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device) # [b,n_states] 149 | actions = torch.tensor(transition_dict['actions']).view(-1,1).to(self.device) # [b,1] 150 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 151 | next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device) # [b,n_states] 152 | dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 153 | 154 | # --------------------------------- # 155 | # 更新2个价值网络 156 | # --------------------------------- # 157 | 158 | # 目标网络的state_value [b, 1] 159 | td_target = self.calc_target(rewards, next_states, dones) 160 | # 价值网络1--预测,当前状态下的动作价值 [b, 1] 161 | critic_1_qvalues = self.critic_1(states).gather(1, actions) 162 | # 均方差损失 预测-目标 163 | critic_1_loss = torch.mean(F.mse_loss(critic_1_qvalues, td_target.detach())) 164 | # 价值网络2--预测 165 | critic_2_qvalues = self.critic_2(states).gather(1, actions) 166 | # 均方差损失 167 | critic_2_loss = torch.mean(F.mse_loss(critic_2_qvalues, td_target.detach())) 168 | 169 | # 梯度清0 170 | self.critic_1_optimizer.zero_grad() 171 | self.critic_2_optimizer.zero_grad() 172 | # 梯度反传 173 | critic_1_loss.backward() 174 | critic_2_loss.backward() 175 | # 梯度更新 176 | self.critic_1_optimizer.step() 177 | self.critic_2_optimizer.step() 178 | 179 | # --------------------------------- # 180 | # 更新策略网络 181 | # --------------------------------- # 182 | 183 | probs = self.actor(states) # 预测当前时刻的state_value [b,n_actions] 184 | log_probs = torch.log(probs + 1e-8) # [b,n_actions] 185 | # 
计算策略网络的熵 [b,1] 186 | entropy = -torch.sum(probs * log_probs, dim=1, keepdim=True) 187 | # 价值网络预测当前时刻的state_value 188 | q1_value = self.critic_1(states) # [b,n_actions] 189 | q2_value = self.critic_2(states) 190 | # 取出价值网络输出的最小的state_value [b,1] 191 | min_qvalue = torch.sum(probs * torch.min(q1_value, q2_value), dim=1, keepdim=True) 192 | 193 | # 策略网络的损失 194 | actor_loss = torch.mean(-self.log_alpha.exp() * entropy - min_qvalue) 195 | # 梯度更新 196 | self.actor_optimizer.zero_grad() 197 | actor_loss.backward() 198 | self.actor_optimizer.step() 199 | 200 | # --------------------------------- # 201 | # 更新可训练遍历alpha 202 | # --------------------------------- # 203 | 204 | alpha_loss = torch.mean((entropy-self.target_entropy).detach() * self.log_alpha.exp()) 205 | # 梯度更新 206 | self.log_alpha_optimizer.zero_grad() 207 | alpha_loss.backward() 208 | self.log_alpha_optimizer.step() 209 | 210 | # 软更新目标价值网络 211 | self.soft_update(self.critic_1, self.target_critic_1) 212 | self.soft_update(self.critic_2, self.target_critic_2) 213 | -------------------------------------------------------------------------------- /Model/8. SAC_Discrete/run_this.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from RL_brain import ReplayBuffer, SAC 6 | 7 | # -------------------------------------- # 8 | # 参数设置 9 | # -------------------------------------- # 10 | 11 | num_epochs = 1 # 训练回合数 12 | capacity = 500 # 经验池容量 13 | min_size = 200 # 经验池训练容量 14 | batch_size = 64 15 | n_hiddens = 64 16 | actor_lr = 1e-3 # 策略网络学习率 17 | critic_lr = 1e-2 # 价值网络学习率 18 | alpha_lr = 1e-2 # 课训练变量的学习率 19 | target_entropy = -1 20 | tau = 0.005 # 软更新参数 21 | gamma = 0.9 # 折扣因子 22 | device = torch.device('cuda') if torch.cuda.is_available() \ 23 | else torch.device('cpu') 24 | 25 | # -------------------------------------- # 26 | # 环境加载 27 | # -------------------------------------- # 28 | 29 | env_name = "CartPole-v1" 30 | env = gym.make(env_name, render_mode="human") 31 | n_states = env.observation_space.shape[0] # 状态数 4 32 | n_actions = env.action_space.n # 动作数 2 33 | 34 | # -------------------------------------- # 35 | # 模型构建 36 | # -------------------------------------- # 37 | 38 | agent = SAC(n_states = n_states, 39 | n_hiddens = n_hiddens, 40 | n_actions = n_actions, 41 | actor_lr = actor_lr, 42 | critic_lr = critic_lr, 43 | alpha_lr = alpha_lr, 44 | target_entropy = target_entropy, 45 | tau = tau, 46 | gamma = gamma, 47 | device = device, 48 | ) 49 | 50 | # -------------------------------------- # 51 | # 经验回放池 52 | # -------------------------------------- # 53 | 54 | buffer = ReplayBuffer(capacity=capacity) 55 | 56 | # -------------------------------------- # 57 | # 模型构建 58 | # -------------------------------------- # 59 | 60 | return_list = [] # 保存每回合的return 61 | 62 | for i in range(num_epochs): 63 | state = env.reset()[0] 64 | epochs_return = 0 # 累计每个时刻的reward 65 | done = False # 回合结束标志 66 | 67 | while not done: 68 | # 动作选择 69 | action = agent.take_action(state) 70 | # 环境更新 71 | next_state, reward, done, _, _ = env.step(action) 72 | # 将数据添加到经验池 73 | buffer.add(state, action, reward, next_state, done) 74 | # 状态更新 75 | state = next_state 76 | # 累计回合奖励 77 | epochs_return += reward 78 | 79 | # 经验池超过要求容量,就开始训练 80 | if buffer.size() > min_size: 81 | s, a, r, ns, d = buffer.sample(batch_size) # 每次取出batch组数据 82 | # 构造数据集 83 | transition_dict = {'states': s, 84 | 'actions': a, 85 | 'rewards': r, 86 | 'next_states': ns, 87 | 
'dones': d} 88 | # 模型训练 89 | agent.update(transition_dict) 90 | # 保存每个回合return 91 | return_list.append(epochs_return) 92 | 93 | # 打印回合信息 94 | print(f'iter:{i}, return:{np.mean(return_list[-10:])}') 95 | 96 | # -------------------------------------- # 97 | # 绘图 98 | # -------------------------------------- # 99 | 100 | plt.plot(return_list) 101 | plt.title('return') 102 | plt.show() 103 | -------------------------------------------------------------------------------- /Model/9. ippo_discrete/RL_brain.py: -------------------------------------------------------------------------------- 1 | # 和PPO离散模型基本一致 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | import numpy as np 6 | 7 | # ----------------------------------------- # 8 | # 策略网络--actor 9 | # ----------------------------------------- # 10 | 11 | class PolicyNet(nn.Module): # 输入当前状态,输出动作的概率分布 12 | def __init__(self, n_states, n_hiddens, n_actions): 13 | super(PolicyNet, self).__init__() 14 | self.fc1 = nn.Linear(n_states, n_hiddens) 15 | self.fc2 = nn.Linear(n_hiddens, n_hiddens) 16 | self.fc3 = nn.Linear(n_hiddens, n_actions) 17 | def forward(self, x): # [b,n_states] 18 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 19 | x = F.relu(x) 20 | x = self.fc2(x) # [b,n_hiddens]-->[b,n_hiddens] 21 | x = F.relu(x) 22 | x = self.fc3(x) # [b,n_hiddens]-->[b,n_actions] 23 | x = F.softmax(x, dim=1) # 每种动作选择的概率 24 | return x 25 | 26 | # ----------------------------------------- # 27 | # 价值网络--critic 28 | # ----------------------------------------- # 29 | 30 | class ValueNet(nn.Module): # 评价当前状态的价值 31 | def __init__(self, n_states, n_hiddens): 32 | super(ValueNet, self).__init__() 33 | self.fc1 = nn.Linear(n_states, n_hiddens) 34 | self.fc2 = nn.Linear(n_hiddens, n_hiddens) 35 | self.fc3 = nn.Linear(n_hiddens, 1) 36 | def forward(self, x): # [b,n_states] 37 | x = self.fc1(x) # [b,n_states]-->[b,n_hiddens] 38 | x = F.relu(x) 39 | x = self.fc2(x) # [b,n_hiddens]-->[b,n_hiddens] 40 | x = F.relu(x) 41 | x = self.fc3(x) # [b,n_hiddens]-->[b,1] 42 | return x 43 | 44 | # ----------------------------------------- # 45 | # 模型构建 46 | # ----------------------------------------- # 47 | 48 | class PPO: 49 | def __init__(self, n_states, n_hiddens, n_actions, 50 | actor_lr, critic_lr, 51 | lmbda, eps, gamma, device): 52 | # 属性分配 53 | self.n_hiddens = n_hiddens 54 | self.actor_lr = actor_lr # 策略网络的学习率 55 | self.critic_lr = critic_lr # 价值网络的学习率 56 | self.lmbda = lmbda # 优势函数的缩放因子 57 | self.eps = eps # ppo截断范围缩放因子 58 | self.gamma = gamma # 折扣因子 59 | self.device = device 60 | # 网络实例化 61 | self.actor = PolicyNet(n_states, n_hiddens, n_actions).to(device) # 策略网络 62 | self.critic = ValueNet(n_states, n_hiddens).to(device) # 价值网络 63 | # 优化器 64 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 65 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr) 66 | 67 | # 动作选择 68 | def take_action(self, state): # [n_states] 69 | state = torch.tensor([state], dtype=torch.float).to(self.device) # [1,n_states] 70 | probs = self.actor(state) # 当前状态的动作概率 [b,n_actions] 71 | action_dist = torch.distributions.Categorical(probs) # 构造概率分布 72 | action = action_dist.sample().item() # 从概率分布中随机取样 int 73 | return action 74 | 75 | # 训练 76 | def update(self, transition_dict): 77 | # 取出数据集 78 | states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device) # [b,n_states] 79 | actions = torch.tensor(transition_dict['actions']).view(-1,1).to(self.device) # [b,1] 80 | next_states = 
torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device) # [b,n_states] 81 | dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 82 | rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1).to(self.device) # [b,1] 83 | 84 | # 价值网络 85 | next_state_value = self.critic(next_states) # 下一时刻的state_value [b,1] 86 | td_target = rewards + self.gamma * next_state_value * (1-dones) # 目标--当前时刻的state_value [b,1] 87 | td_value = self.critic(states) # 预测--当前时刻的state_value [b,1] 88 | td_delta = td_value - td_target # 时序差分 # [b,1] 89 | 90 | # 计算GAE优势函数,当前状态下某动作相对于平均的优势 91 | advantage = 0 # 累计一个序列上的优势函数 92 | advantage_list = [] # 存放每个时序的优势函数值 93 | td_delta = td_delta.cpu().detach().numpy() # gpu-->numpy 94 | for delta in td_delta[::-1]: # 逆序取出时序差分值 95 | advantage = self.gamma * self.lmbda * advantage + delta 96 | advantage_list.append(advantage) # 保存每个时刻的优势函数 97 | advantage_list.reverse() # 正序 98 | advantage = torch.tensor(advantage_list, dtype=torch.float).to(self.device) 99 | 100 | # 计算当前策略下状态s的行为概率 / 在之前策略下状态s的行为概率 101 | old_log_probs = torch.log(self.actor(states).gather(1,actions)) # [b,1] 102 | log_probs = torch.log(self.actor(states).gather(1,actions)) 103 | ratio = log_probs / old_log_probs 104 | 105 | # clip截断 106 | surr1 = ratio * advantage 107 | surr2 = torch.clamp(ratio, 1-self.eps, 1+self.eps) * advantage 108 | 109 | # 损失计算 110 | actor_loss = torch.mean(-torch.min(surr1, surr2)) # clip截断 111 | critic_loss = torch.mean(F.mse_loss(td_value, td_target)) # 112 | # 梯度更新 113 | self.actor_optimizer.zero_grad() 114 | self.critic_optimizer.zero_grad() 115 | actor_loss.backward() 116 | critic_loss.backward() 117 | self.actor_optimizer.step() 118 | self.critic_optimizer.step() 119 | -------------------------------------------------------------------------------- /Model/9. 
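Note on PPO.update() above: old_log_probs and log_probs are computed from the same forward pass and the ratio is formed as a quotient of log-probabilities, while td_delta is taken as prediction minus target. The textbook PPO-clip update uses delta = td_target - td_value for GAE and the exponentiated log-probability difference, with the old log-probs frozen before the optimisation step(s). A minimal sketch under the same tensor shapes; compute_advantage and ppo_actor_loss are illustrative names, not functions from RL_brain.py:

import numpy as np
import torch

def compute_advantage(gamma, lmbda, td_delta):
    # GAE over one episode; td_delta = td_target - td_value, shape [b,1]
    td_delta = td_delta.detach().cpu().numpy()
    advantage, advantage_list = 0.0, []
    for delta in td_delta[::-1]:              # accumulate backwards in time
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)
    advantage_list.reverse()
    return torch.tensor(np.array(advantage_list), dtype=torch.float)

def ppo_actor_loss(actor, states, actions, advantage, eps):
    # clipped surrogate; old_log_probs is normally computed once, before the
    # optimisation epochs, and kept fixed (hence no_grad here)
    with torch.no_grad():
        old_log_probs = torch.log(actor(states).gather(1, actions) + 1e-8)
    log_probs = torch.log(actor(states).gather(1, actions) + 1e-8)
    ratio = torch.exp(log_probs - old_log_probs)          # pi_new / pi_old
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1 - eps, 1 + eps) * advantage
    return torch.mean(-torch.min(surr1, surr2))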
ippo_discrete/run_this.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import torch 4 | from ma_gym.envs.combat.combat import Combat 5 | from RL_brain import PPO 6 | import time 7 | 8 | # ----------------------------------------- # 9 | # 参数设置 10 | # ----------------------------------------- # 11 | 12 | n_hiddens = 64 # 隐含层数量 13 | actor_lr = 3e-4 14 | critic_lr = 1e-3 15 | gamma = 0.9 16 | lmbda = 0.97 17 | eps = 0.2 18 | device = torch.device('cuda') if torch.cuda.is_available() \ 19 | else torch.device('cpu') 20 | num_episodes = 10 # 回合数 21 | team_size = 2 # 智能体数量 22 | grid_size = (15, 15) 23 | 24 | # ----------------------------------------- # 25 | # 环境设置--onpolicy 26 | # ----------------------------------------- # 27 | 28 | # 创建Combat环境,格子世界的大小为15x15,己方智能体和敌方智能体数量都为2 29 | env = Combat(grid_shape=grid_size, n_agents=team_size, n_opponents=team_size) 30 | n_states = env.observation_space[0].shape[0] # 状态数 31 | n_actions = env.action_space[0].n # 动作数 32 | 33 | # 两个智能体共享同一个策略 34 | agent = PPO(n_states = n_states, 35 | n_hiddens = n_hiddens, 36 | n_actions = n_actions, 37 | actor_lr = actor_lr, 38 | critic_lr = critic_lr, 39 | lmbda = lmbda, 40 | eps = eps, 41 | gamma = gamma, 42 | device = device, 43 | ) 44 | 45 | # ----------------------------------------- # 46 | # 模型训练 47 | # ----------------------------------------- # 48 | 49 | for i in range(num_episodes): 50 | # 每回合开始前初始化两支队伍的数据集 51 | transition_dict_1 = { 52 | 'states': [], 53 | 'actions': [], 54 | 'next_states': [], 55 | 'rewards': [], 56 | 'dones': [], 57 | } 58 | transition_dict_2 = { 59 | 'states': [], 60 | 'actions': [], 61 | 'next_states': [], 62 | 'rewards': [], 63 | 'dones': [], 64 | } 65 | 66 | s = env.reset() # 状态初始化 67 | terminal = False # 结束标记 68 | 69 | while not terminal: 70 | 71 | env.render() 72 | 73 | # 动作选择 74 | a_1 = agent.take_action(s[0]) 75 | a_2 = agent.take_action(s[1]) 76 | 77 | # 环境更新 78 | next_s, r, done, info = env.step([a_1, a_2]) 79 | 80 | # 构造数据集 81 | transition_dict_1['states'].append(s[0]) 82 | transition_dict_1['actions'].append(a_1) 83 | transition_dict_1['next_states'].append(next_s[0]) 84 | transition_dict_1['dones'].append(False) 85 | transition_dict_1['rewards'].append(r[0]) 86 | 87 | transition_dict_2['states'].append(s[1]) 88 | transition_dict_2['actions'].append(a_2) 89 | transition_dict_2['next_states'].append(next_s[1]) 90 | transition_dict_2['dones'].append(False) 91 | transition_dict_2['rewards'].append(r[1]) 92 | 93 | s = next_s # 状态更新 94 | terminal = all(done) # 判断当前回合是否都为True,是返回True,不是返回False 95 | 96 | time.sleep(0.1) 97 | 98 | print('epoch:', i) 99 | 100 | # 回合训练 101 | agent.update(transition_dict_1) 102 | agent.update(transition_dict_2) 103 | -------------------------------------------------------------------------------- /build_env/pygame/1. 
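Note on the IPPO loop above: the two per-agent transition dictionaries have an identical structure and the per-agent bookkeeping is written out twice. A sketch of the same rollout for an arbitrary number of agents sharing one policy; new_transition_dict and buffers are illustrative names:

def new_transition_dict():
    # empty per-agent rollout buffer for one episode
    return {'states': [], 'actions': [], 'next_states': [],
            'rewards': [], 'dones': []}

# buffers = [new_transition_dict() for _ in range(team_size)]
# while not terminal:
#     actions = [agent.take_action(obs) for obs in s]
#     next_s, r, done, info = env.step(actions)
#     for k, buf in enumerate(buffers):
#         buf['states'].append(s[k]);      buf['actions'].append(actions[k])
#         buf['next_states'].append(next_s[k])
#         buf['rewards'].append(r[k]);     buf['dones'].append(done[k])
#     s, terminal = next_s, all(done)
# for buf in buffers:
#     agent.update(buf)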
Hit the Plane/run_this.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import random 3 | import math 4 | 5 | # x向右为正,y向下为正 6 | # ----------------------------------- # 7 | # 窗口初始化 8 | # ----------------------------------- # 9 | 10 | game_over = False 11 | score = 0 # 初始分数 12 | 13 | width = 800 14 | height = 600 15 | 16 | pygame.init() # 初始化 17 | screen = pygame.display.set_mode((width, height)) # 设置窗口 18 | pygame.display.set_caption('hit planes') # 窗口名称 19 | 20 | # 添加背景音效 21 | bg_music = 'D:/zhouli/其他代码语言练习/pygame/picture/music.wav' 22 | pygame.mixer_music.load(bg_music) 23 | pygame.mixer.music.play(-1) # 循环播放 24 | # 添加爆炸音效 25 | bao = 'D:/zhouli/其他代码语言练习/pygame/picture/bao.wav' 26 | bao_sound = pygame.mixer.Sound(bao) 27 | 28 | # ----------------------------------- # 29 | # 创建对象 30 | # ----------------------------------- # 31 | 32 | # 背景 33 | bg_fp = 'D:/zhouli/其他代码语言练习/pygame/picture/space.jpg' 34 | bg = pygame.image.load(bg_fp) # 图片加载 35 | bg = pygame.transform.scale(bg, (width, height)) # 调整尺寸 36 | pygame.display.set_icon(bg) # 显示图片 37 | 38 | # 玩家飞船 39 | craft_fp = 'D:/zhouli/其他代码语言练习/pygame/picture/飞船.png' 40 | icon = pygame.image.load(craft_fp) # 图片加载 41 | icon = pygame.transform.scale(icon, (width//10, height//10)) # 调整尺寸 42 | pygame.display.set_icon(icon) # 显示图片 43 | # 玩家初始位置、速度 44 | playerx = 360 45 | playery = 500 46 | playerStep = 0 47 | 48 | # 创建敌人飞船 49 | num_enemy = 6 50 | enemy_fp = 'D:/zhouli/其他代码语言练习/pygame/picture/外星飞船.png' 51 | class Enemy(): 52 | def __init__(self): 53 | self.img = pygame.image.load(enemy_fp) # 图片加载 54 | self.img = pygame.transform.scale(self.img, (width//10, height//10)) # 调整尺寸 55 | self.x = random.randint(200, 600) 56 | self.y = random.randint(50, 250) 57 | self.step = random.random() / 5 # 速度 58 | # 敌人死亡后位置恢复 59 | def reset(self): 60 | self.x = random.randint(200, 600) 61 | self.y = random.randint(50, 250) 62 | 63 | 64 | # 创建敌人飞船 65 | enemies = [] # 保存创建的飞船 66 | for i in range(num_enemy): 67 | enemies.append(Enemy()) 68 | 69 | # ----------------------------------- # 70 | # 运动 71 | # ----------------------------------- # 72 | 73 | # 敌人位置 74 | def show_enemy(): 75 | global game_over 76 | for e in enemies: # 遍历所有飞船 77 | screen.blit(e.img, (e.x, e.y)) 78 | e.x += e.step 79 | # 飞机碰到边界就改变方向 80 | if e.x > width - width//10 or e.x < 0: # 右界限, 左界限 81 | e.step *= -1 # 反方向 82 | e.y += 50 # 下沉 83 | # 如果下沉到一定边就是敌人成功 84 | if e.y > 400: 85 | game_over = True 86 | print('game_over') 87 | enemies.clear() # 清空敌人列表 88 | 89 | # 玩家运动 90 | def move_player(): 91 | # 修改全局变量 92 | global playerx 93 | # 根据方向键调整飞船位置 94 | playerx += playerStep 95 | # 防止飞机出界 96 | if playerx > width - width//10: # 右界限 97 | playerx = width - width//10 98 | if playerx < 0: # 左界限 99 | playerx = 0 100 | 101 | # ------------------------------------ # 102 | # 计算子弹和敌人之间的距离 103 | # ------------------------------------ # 104 | 105 | def distance(bx,by, ex,ey): 106 | return math.sqrt((bx-ex)**2 + (by-ey)**2) 107 | 108 | # ----------------------------------- # 109 | # 子弹 110 | # ----------------------------------- # 111 | 112 | but_fp = 'D:/zhouli/其他代码语言练习/pygame/picture/子弹.png' 113 | class Buttet(): 114 | def __init__(self): 115 | self.img = pygame.image.load(but_fp) # 图片加载 116 | self.img = pygame.transform.scale(self.img, (width//20, height//20)) # 调整尺寸 117 | # 子弹初始位置是玩家当前位置 118 | self.x = playerx + 20 119 | self.y = playery - 10 120 | self.step = 0.5 # 速度 121 | 122 | # 子弹击中敌人 123 | def hit(self): 124 | global score 125 | # 遍历所有敌人 126 | for e in 
enemies: 127 | # 计算敌人和子弹的距离 128 | if distance(self.x, self.y, e.x, e.y) < 20: 129 | # 击中--子弹消失 130 | bullets.remove(self) 131 | # 敌人位置复原 132 | e.reset() 133 | # 爆炸音效 134 | bao_sound.play() 135 | # 分数增加 136 | score += 1 137 | 138 | # 保存现有的子弹 139 | bullets = [] 140 | 141 | # 显示并移动子弹 142 | def show_buttet(): 143 | for b in bullets: # 遍历所有子弹 144 | screen.blit(b.img, (b.x, b.y)) # 显示子弹 145 | b.hit() # 攻击目标 146 | b.y -= b.step # 向上移动 147 | # 飞出界就移除子弹 148 | if b.y < 0: 149 | bullets.remove(b) 150 | 151 | # --------------------------------- # 152 | # 显示分数 153 | # --------------------------------- # 154 | 155 | # font = pygame.font.SysFont('simsunnsimsun', 40) # 字体 156 | font = pygame.font.Font(None, 40) 157 | def show_score(): 158 | text = f'scores: {score}' 159 | # 渲染字体再显示 160 | score_render = font.render(text, True, (255,255,255)) 161 | screen.blit(score_render, (10,10)) 162 | 163 | # --------------------------------- # 164 | # 游戏结束提示 165 | # --------------------------------- # 166 | over_font = pygame.font.Font(None, 72) 167 | def check_is_over(): 168 | if game_over: 169 | text = 'Game Over' 170 | # 渲染字体再显示 171 | render = font.render(text, True, (255,0,0)) 172 | screen.blit(render, (300,300)) 173 | 174 | # ----------------------------------- # 175 | # 游戏循环 176 | # ----------------------------------- # 177 | 178 | # 循环 179 | running = True 180 | while running: 181 | 182 | # 绘制背景,锚点放在左上角(0,0) 183 | screen.blit(bg, (0,0)) 184 | # 显示分数 185 | show_score() 186 | 187 | # 获取事件 188 | for event in pygame.event.get(): 189 | # QUIT常量代表退出,点击窗口的X退出 190 | if event.type == pygame.QUIT: 191 | running = False 192 | 193 | # 按下键盘触发事件 194 | if event.type == pygame.KEYDOWN: 195 | # 判断哪一个按键 196 | if event.key == pygame.K_RIGHT: # 向右的方向键 197 | playerStep = 0.5 198 | elif event.key == pygame.K_LEFT: # 向左的方向键 199 | playerStep = -0.5 200 | elif event.key == pygame.K_SPACE: # 按下空格 201 | print('发射子弹') 202 | # 创建子弹 203 | bullets.append(Buttet()) 204 | 205 | # 抬起键盘触发事件 206 | if event.type == pygame.KEYUP: 207 | playerStep = 0 208 | 209 | # 绘制玩家飞船 210 | screen.blit(icon, (playerx, playery)) 211 | move_player() 212 | # 绘制敌人飞船 213 | show_enemy() 214 | # 显示子弹 215 | show_buttet() 216 | 217 | # 每一帧都检查一下是否结束 218 | check_is_over() 219 | 220 | # 界面更新 221 | pygame.display.update() 222 | -------------------------------------------------------------------------------- /build_env/pygame/2. 
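Note on the game loop above: the player speed (0.5 px per frame while a key is held) and the enemy/bullet speeds are applied once per frame with no frame-rate cap, so the game runs faster on faster machines. A common remedy is a pygame.time.Clock that limits the frame rate and reports the elapsed time, which then scales every movement; FPS, dt and the *_speed values below are illustrative:

import pygame

clock = pygame.time.Clock()
FPS = 60                                 # target frame rate

# inside `while running:`, before handling events:
# dt = clock.tick(FPS) / 1000.0          # seconds elapsed since the last frame
# playerx += player_speed * dt           # e.g. player_speed = 300 px/s
# for e in enemies:
#     e.x += e.speed_per_second * dt
# for b in bullets:
#     b.y -= b.speed_per_second * dt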
shopping mall/run_this.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | # ------------------------------------ # 4 | # 环境渲染 5 | # ------------------------------------ # 6 | 7 | class MyRender: 8 | def __init__(self): 9 | self.px, self.py = 850, 700 # 行人坐标 10 | self.ax, self.ay = 50, 250 # 箭头坐标 11 | self.obs_list = [] # 保存所有障碍物的 12 | self.exit_list = [] # 出口坐标 13 | self.peo_list = {} # 行人属性 14 | 15 | # ---------------------------------- # 16 | # 参数设置 17 | # ---------------------------------- # 18 | 19 | self.width = 950 # 窗口显示尺寸 20 | self.height = 950 21 | 22 | self.grid_size = 50 # 每个网格的size 23 | self.num_grid = 19 # 横纵方向的网格数 24 | 25 | root = 'D:/programmes/强化学习/图库/素材/' 26 | wall_fp = root + '棕墙.png' 27 | weilan_fp = root + '蓝色栅栏.png' 28 | person_fp = root + '行人.png' 29 | lay_fp = root + '躺.png' 30 | fire_fp = root + '燃脂.png' 31 | door_fp = root + '大门.png' 32 | plane_fp = root + '无人机.png' 33 | 34 | # 图片加载 35 | wall = pygame.image.load(wall_fp) # 墙体图片加载 36 | door = pygame.image.load(door_fp) 37 | weilan = pygame.image.load(weilan_fp) 38 | person = pygame.image.load(person_fp) 39 | lay = pygame.image.load(lay_fp) 40 | fire = pygame.image.load(fire_fp) 41 | plane = pygame.image.load(plane_fp) 42 | 43 | # 尺寸调整 44 | self.wall = pygame.transform.scale(wall, (self.grid_size, self.grid_size)) 45 | self.door = pygame.transform.scale(door, (self.grid_size, self.grid_size)) 46 | self.weilan = pygame.transform.scale(weilan, (self.grid_size, self.grid_size)) 47 | self.person = pygame.transform.scale(person, (self.grid_size, self.grid_size)) 48 | self.lay = pygame.transform.scale(lay, (self.grid_size, self.grid_size)) 49 | self.fire = pygame.transform.scale(fire, (self.grid_size, self.grid_size)) 50 | self.plane = pygame.transform.scale(plane, (self.grid_size, self.grid_size)) 51 | 52 | # 渲染 53 | def render(self): 54 | 55 | # ---------------------------------- # 56 | # 窗口化 57 | # ---------------------------------- # 58 | 59 | pygame.init() # 初始化 60 | screen = pygame.display.set_mode((self.width, self.height)) # 设置窗口 61 | pygame.display.set_caption('Grid World') # 窗口名称 62 | screen.fill((255, 255, 255)) # 窗口填充为白色 63 | 64 | # ------------------------------------- # 65 | # 构建墙体 66 | # ------------------------------------- # 67 | 68 | for i in range(self.num_grid//2-1): # 第0行左 69 | pygame.display.set_icon(self.wall) # 显示图片 70 | screen.blit(self.wall, (i*50, 0)) 71 | self.obs_list.append((i*50, 0)) 72 | for i in range(self.num_grid//2+2, self.num_grid): # 第0行右 73 | pygame.display.set_icon(self.wall) # 显示图片 74 | screen.blit(self.wall, (i*50, 0)) 75 | self.obs_list.append((i*50, 0)) 76 | 77 | for i in range(self.num_grid//2-1): # 最后一行左 78 | pygame.display.set_icon(self.wall) # 显示图片 79 | screen.blit(self.wall, (i*50, (self.num_grid-1)*50)) 80 | self.obs_list.append((i*50, (self.num_grid-1)*50)) 81 | for i in range(self.num_grid//2+2, self.num_grid): # 最后一行右 82 | pygame.display.set_icon(self.wall) # 显示图片 83 | screen.blit(self.wall, (i*50, (self.num_grid-1)*50)) 84 | self.obs_list.append((i*50, (self.num_grid-1)*50)) 85 | 86 | for j in range(1, self.num_grid-1): # 最右侧一列 87 | pygame.display.set_icon(self.wall) # 显示图片 88 | screen.blit(self.wall, ((self.num_grid-1)*50, j*50)) 89 | self.obs_list.append(((self.num_grid-1)*50, j*50)) 90 | 91 | for j in range(1, self.num_grid-1): # 最左侧一列,偏上 92 | pygame.display.set_icon(self.wall) # 显示图片 93 | screen.blit(self.wall, (0, j*50)) 94 | self.obs_list.append((0, j*50)) 95 | 96 | # ------------------------------------- 
# 97 | # 出口 98 | # ------------------------------------- # 99 | 100 | for i in range(self.num_grid//2-1, self.num_grid//2+2): # 第0行左 101 | pygame.display.set_icon(self.door) # 显示图片 102 | screen.blit(self.door, (i*50, 0)) 103 | 104 | for i in range(self.num_grid//2-1, self.num_grid//2+2): # 第0行左 105 | pygame.display.set_icon(self.door) # 显示图片 106 | screen.blit(self.door, (i*50, 900)) 107 | 108 | # ------------------------------------- # 109 | # 围栏 110 | # ------------------------------------- # 111 | 112 | # 横向 113 | for i in range(3, self.num_grid-3): # 第1行 114 | pygame.display.set_icon(self.weilan) # 显示图片 115 | screen.blit(self.weilan, (i*50, 150)) 116 | 117 | for i in range(3, self.num_grid-3): # 第2行 118 | pygame.display.set_icon(self.weilan) # 显示图片 119 | screen.blit(self.weilan, (i*50, 350)) 120 | 121 | for i in range(3, self.num_grid-3): # 第3行 122 | pygame.display.set_icon(self.weilan) # 显示图片 123 | screen.blit(self.weilan, (i*50, 550)) 124 | 125 | for i in range(3, self.num_grid-3): # 第4行 126 | pygame.display.set_icon(self.weilan) # 显示图片 127 | screen.blit(self.weilan, (i*50, 750)) 128 | 129 | # 纵向 130 | for j in range(4, 5): # 列向-左上 131 | pygame.display.set_icon(self.weilan) # 显示图片 132 | screen.blit(self.weilan, (150, j*50)) 133 | for j in range(6, 7): # 列向-左上 134 | pygame.display.set_icon(self.weilan) # 显示图片 135 | screen.blit(self.weilan, (150, j*50)) 136 | 137 | for j in range(12, 13): # 列向-左下 138 | pygame.display.set_icon(self.weilan) 139 | screen.blit(self.weilan, (150, j*50)) 140 | for j in range(14, 15): # 列向-左下 141 | pygame.display.set_icon(self.weilan) 142 | screen.blit(self.weilan, (150, j*50)) 143 | 144 | for j in range(4, 5): # 列向-右上 145 | pygame.display.set_icon(self.weilan) # 显示图片 146 | screen.blit(self.weilan, (750, j*50)) 147 | for j in range(6, 7): # 列向-右上 148 | pygame.display.set_icon(self.weilan) # 显示图片 149 | screen.blit(self.weilan, (750, j*50)) 150 | 151 | for j in range(12, 13): # 列向-右下 152 | pygame.display.set_icon(self.weilan) 153 | screen.blit(self.weilan, (750, j*50)) 154 | for j in range(14, 15): # 列向-右下 155 | pygame.display.set_icon(self.weilan) 156 | screen.blit(self.weilan, (750, j*50)) 157 | 158 | # 中间 159 | for j in range(4, 7): # 列向-上 160 | pygame.display.set_icon(self.weilan) 161 | screen.blit(self.weilan, (450, j*50)) 162 | 163 | for j in range(12, 15): # 列向-下 164 | pygame.display.set_icon(self.weilan) 165 | screen.blit(self.weilan, (450, j*50)) 166 | 167 | # ---------------------------------------- # 168 | # 火焰 169 | # ---------------------------------------- # 170 | pygame.display.set_icon(self.fire) # 显示图片 171 | screen.blit(self.fire, (450, 450)) 172 | 173 | # ---------------------------------------- # 174 | # 行人 175 | # ---------------------------------------- # 176 | 177 | pygame.display.set_icon(self.person) # 显示图片 178 | screen.blit(self.person, (self.px, self.py)) 179 | 180 | pygame.display.set_icon(self.lay) # 显示图片 181 | screen.blit(self.lay, (300, 50)) 182 | 183 | # ----------------------------------------- # 184 | # 无人机 185 | # ----------------------------------------- # 186 | pygame.display.set_icon(self.plane) # 显示图片 187 | screen.blit(self.plane, (150, 450)) 188 | 189 | 190 | # ---------------------------------------- # 191 | # 动画展示 192 | # ---------------------------------------- # 193 | 194 | env = MyRender() 195 | env.render() 196 | running = True 197 | while running: 198 | # 获取事件 199 | for event in pygame.event.get(): 200 | # QUIT常量代表退出,点击窗口的X退出 201 | if event.type == pygame.QUIT: 202 | running = False 203 | # 界面更新 204 | 
pygame.display.update() 205 | --------------------------------------------------------------------------------
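Two small points about MyRender.render() above: pygame.display.set_icon() only sets the window/taskbar icon, so calling it before every screen.blit() has no drawing effect and can be dropped, and the repeated wall/fence loops can be collapsed into a tiling helper. A sketch; blit_tiles is an illustrative name:

import pygame

def blit_tiles(screen, img, cells, grid_size=50):
    # blit one tile image at a list of (col, row) grid cells, returning the
    # pixel coordinates so they can be recorded (e.g. appended to obs_list)
    coords = [(col * grid_size, row * grid_size) for col, row in cells]
    for xy in coords:
        screen.blit(img, xy)
    return coords

# e.g. the top-left wall run in render() could become:
# self.obs_list += blit_tiles(screen, self.wall,
#                             [(i, 0) for i in range(self.num_grid // 2 - 1)])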