├── .gitignore ├── MADDPG_Continous ├── .gitignore ├── KNOWN_ISSUES.md ├── README.md ├── README_EN.md ├── agents │ ├── Centralized │ │ └── readme.md │ ├── Independent │ │ └── readme.md │ └── maddpg │ │ ├── DDPG_agent.py │ │ ├── MADDPG_agent.py │ │ ├── NN_actor.py │ │ ├── NN_critic.py │ │ ├── buffer.py │ │ └── readme.md ├── envs │ ├── custom_agents_dynamics.py │ ├── simple_env_fixed render.py │ └── simple_tag_env.py ├── main_evaluate.py ├── main_evaluate_save_render2gif.py ├── main_parameters.py ├── main_train.py ├── plot │ ├── convert_gif_to_loop.py │ ├── demo-rewards_plot_ma.png │ ├── plot_rewards.py │ ├── simple_tag_v3_demo.gif │ └── simple_tag_v3_demo_loop.gif └── utils │ ├── conda-environment.yml │ ├── linux_environment.yml │ ├── logger.py │ ├── mac_arm_M4_environment.yml │ ├── pip-requirements.txt │ ├── pip-requirements_mac_arm_M4.txt │ ├── runner.py │ └── setupPettingzoo.py ├── MATD3_Continous ├── agents │ ├── MATD3_agent.py │ ├── MATD3_runner.py │ ├── NN_actor_td3.py │ ├── NN_critic_td3.py │ ├── TD3_agent.py │ └── buffer.py ├── envs │ ├── custom_agents_dynamics.py │ └── simple_tag_env.py ├── main │ ├── main_evaluate.py │ ├── main_parameters.py │ └── main_train.py ├── plot │ ├── README.md │ ├── plot_rewards.py │ └── training_rewards_demo.png ├── readme.md ├── readme_en.md └── utils │ ├── conda-environment.yml │ ├── linux_environment.yml │ ├── logger.py │ ├── mac_arm_M4_environment.yml │ ├── pip-requirements.txt │ ├── pip-requirements_mac_arm_M4.txt │ └── setupPettingzoo.py ├── README.md ├── README_en.md ├── RL_Learning-main ├── README.md └── scripts │ ├── Chapter10_Actor Critic │ ├── 1.[QAC]Simplest actor critic.py │ ├── 2.[A2C]Advantage actor critic.py │ ├── 3.1Importance sampling.py │ ├── 3.[Importance sampling]Off-policy actor critic.py │ └── 4.[DPG]Deterministic actor critic.py │ ├── Chapter4_Value iteration and Policy iteration │ ├── plot_figure │ │ ├── policy_iteration.png │ │ └── value_iteration.png │ ├── policy_iteration.py │ └── value iteration.py │ ├── Chapter5_Monte Carlo Methods │ ├── MC_Basic.py │ ├── MC_Exploring_Starts.py │ └── MC_epsilon_greedy.py │ ├── Chapter6_Stochastic_approximation │ └── Robbins-Monro algorithm.py │ ├── Chapter7_Temporal-Difference learning │ ├── 1.Sarsa.py │ ├── 2.n-step Sarsa.py │ ├── 3.Q-learning.py │ └── 4.Q-learning on policy.py │ ├── Chapter8_Value Function Approximaton │ ├── 1.TD-Linear.py │ ├── 2.Sarsa with function approximation.py │ ├── 3.Q-learning with function approximation.py │ └── 4.[DQN]Deep Q-Network or Q-learning.py │ ├── Chapter9_Policy Gradient │ └── [Reinforce]Monte Carlo policy gradient.py │ ├── grid_env.py │ ├── model.py │ ├── render.py │ └── solver.py ├── img.png └── 动手学强化学习 ├── DQN ├── DQN.py ├── display.py └── main.py ├── Hands-on-RL ├── README.md ├── rl_utils.py ├── 第10章-Actor-Critic算法.ipynb ├── 第11章-TRPO算法.ipynb ├── 第12章-PPO算法.ipynb ├── 第13章-DDPG算法.ipynb ├── 第14章-SAC算法.ipynb ├── 第15章-模仿学习.ipynb ├── 第16章-模型预测控制.ipynb ├── 第17章-基于模型的策略优化.ipynb ├── 第18章-离线强化学习.ipynb ├── 第19章-目标导向的强化学习.ipynb ├── 第20章-多智能体强化学习入门.ipynb ├── 第21章-多智能体强化学习进阶.ipynb ├── 第2章-多臂老虎机问题.ipynb ├── 第3章-马尔可夫决策过程.ipynb ├── 第4章-动态规划算法.ipynb ├── 第5章-时序差分算法.ipynb ├── 第6章-Dyna-Q算法.ipynb ├── 第7章-DQN算法.ipynb ├── 第8章-DQN改进算法.ipynb └── 第9章-策略梯度算法.ipynb ├── README.md ├── rl_utils.py └── 策略梯度 ├── Reinforce.py ├── display.py └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | 3 | #忽略所有的__pycache__目录以及所有的.pyc、.pyo和.pyd文件 4 | **/__pycache__ 5 | *.py[cod] 6 | 7 | 
#如果你想忽略所有名为logs的文件夹,无论它们在项目中的位置如何,你可以在.gitignore文件中添加以下内容: 8 | **/logs/ 9 | # ignore the work I'm doing now 10 | RL-basic-algorithm/Graph based Multi-agent path planning/ 11 | 12 | #忽略 .\Multi-agents-RL\Multi-agent Partial Environment\MADDPG\maddpg-pettingzoo-pytorch-master下的* 13 | **/results/ 14 | 15 | 16 | #macOS 17 | **/.DS_Store 18 | Desktop.ini 19 | 20 | # Thumbnail cache files 21 | ._* 22 | Thumbs.db 23 | 24 | # Files that might appear on external disks 25 | .Spotlight-V100 26 | .Trashes 27 | 28 | # Compiled Python files 29 | *.pyc 30 | 31 | # 32 | plot/* 33 | 34 | # Temp File 35 | *.swp 36 | *.swa 37 | *.swo 38 | 39 | # github merge file 40 | *.orig 41 | 42 | # virtualenv 43 | venv 44 | __pycache__ 45 | 46 | #vscode 47 | .vscode 48 | 49 | 50 | 51 | **/matd3_models/ 52 | # 保留 maddpg_models 文件夹本身 53 | !**/matd3_models/.gitkeep 54 | 55 | # 忽略所有名为 data 的文件夹中的内容,但保留 data 文件夹本身 56 | **/matd3_data/* 57 | !**/matd3_data/.gitkeep 58 | 59 | **/log_td3_main/* 60 | !**/log_td3_main/.gitkeep 61 | -------------------------------------------------------------------------------- /MADDPG_Continous/.gitignore: -------------------------------------------------------------------------------- 1 | # Python 编译文件 2 | **/__pycache__/ 3 | *.py[cod] 4 | *.pyo 5 | *.pyd 6 | 7 | # PyCharm 设置 8 | .idea/ 9 | 10 | # VSCode 设置 11 | .vscode/ 12 | 13 | #如果你想忽略所有名为maddpg_models的文件夹,无论它们在项目中的位置如何,你可以在.gitignore文件中添加以下内容: 14 | **/maddpg_models/ 15 | # 保留 maddpg_models 文件夹本身 16 | !**/maddpg_models/.gitkeep 17 | 18 | # 忽略所有名为 data 的文件夹中的内容,但保留 data 文件夹本身 19 | **/data/* 20 | !**/data/.gitkeep 21 | 22 | **/logs/* 23 | !**/logs/.gitkeep 24 | 25 | # 深度学习模型 26 | *.pth 27 | 28 | # 操作系统特定的文件 29 | .DS_Store # macOS 30 | Thumbs.db # Windows -------------------------------------------------------------------------------- /MADDPG_Continous/README.md: -------------------------------------------------------------------------------- 1 | [🇨🇳 中文文档](README.md) | [🇺🇸 English](README_EN.md) 2 | 3 | # 多智能体深度强化学习MADDPG算法 - Predator-Prey追逃博弈 4 | 5 | ![项目状态](https://img.shields.io/badge/状态-重构完成-green) ![MADDPG](https://img.shields.io/badge/MADDPG-已实现-success)![Python](https://img.shields.io/badge/python-3.11.8%2B-blue) 6 | 7 | >**本项目专为Predator-Prey追逃博弈任务优化!** 在`PettingZoo MPE`环境基础上重构修改,提供了完整的多智能体协作与对抗环境,适用于围捕控制、群体智能和策略博弈研究。 8 | 9 | > Pettingzoo MPE环境:https://github.com/Farama-Foundation/PettingZoo 10 | 11 | > MADDPG algorithm Reference: https://github.com/Git-123-Hub/maddpg-pettingzoo-pytorch 12 | 13 | > 2025.4.26 update: MPE环境已经拆分出PettingZoo,详情请见MPE2:https://github.com/Farama-Foundation/MPE2 14 | 15 | ## 📈 训练效果 16 |
![智能体行为](plot/simple_tag_v3_demo_loop.gif)

训练后的智能体行为展示:捕食者(红色)追逐猎物(绿色)的过程

![训练收敛结果](plot/demo-rewards_plot_ma.png)

MADDPG算法在simple_tag_v3环境中的奖励收敛曲线
23 | 24 | > **⚠️ 重要提示**:使用前请查看🔍 [**已知问题与解决方案KNOWN_ISSUES.md**](KNOWN_ISSUES.md)文档,了解常见问题的解决方法,特别是Windows系统的渲染卡死问题和PettingZoo版本兼容性问题。 25 | 26 | > **奖励函数修改**:官方的奖励配置无法训练出好的效果,需要修改追捕者的奖励函数 27 | 28 | > 当前状态:MADDPG算法已在 `/agents/maddpg/*.py` 中实现 29 | 30 | ## 🚀 实现进度 31 | | 算法 | 状态 | 位置 | 核心组件 | 32 | |----------------|--------|----------------------|----------------------------------| 33 | | MADDPG | ✅ 1.0 | `agents/maddpg/` | MADDPG_agent, DDPG_agent, buffer | 34 | | Independent RL | ⏳ 待完成 | `agents/independent/`| IndependentRL (计划中) | 35 | | Centralized RL | ⏳ 待完成 | `agents/centralized/`| CentralizedRL (计划中) | 36 | 37 | > 注意:MADDPG模块目前位于agents根目录(buffer.py, DDPG_agent.py等),但功能完整可用! 38 | 39 | ## 🏗️ 项目结构 40 | ```tree 41 | MADDPG_Continous/ 42 | ├── agents/ # 核心实现 43 | │ ├── maddpg/ # MADDPG算法实现 44 | │ │ ├── MADDPG_agent.py # 多智能体控制器 45 | │ │ ├── DDPG_agent.py # 基础DDPG实现 46 | │ │ ├── buffer.py # 经验回放缓冲区 47 | │ │ └── NN_(actor|critic).py # 神经网络模块 48 | │ ├── Independent/ # 独立RL实现(计划中) 49 | │ └── Centralized/ # 集中式RL实现(计划中) 50 | ├── envs/ # 自定义环境 51 | │ ├── custom_agents_dynamics.py # 扩展物理引擎 52 | │ └── simple_tag_env.py # 修改后的标签环境 53 | ├── utils/ # 工具模块 54 | │ ├── runner.py # 训练运行器 55 | │ ├── logger.py # 训练日志记录器 56 | │ ├── conda-environment.yml # Conda环境配置文件 57 | │ ├── linux_environment.yml # Linux环境配置文件 58 | │ ├── mac_arm_M4_environment.yml # Mac M系列芯片环境配置文件 59 | │ ├── pip-requirements.txt # 通用依赖项要求 60 | │ ├── pip-requirements_mac_arm_M4.txt # Mac M芯片特定依赖项 61 | │ └── setupPettingzoo.py # PettingZoo环境设置脚本 62 | ├── main_train.py # 统一训练入口 63 | ├── main_evaluate.py # 统一评估入口 64 | ├── main_evaluate_save_render2gif.py # 渲染并保存GIF 65 | └── main_parameters.py # 统一参数配置 66 | ``` 67 | 68 | ## 🛠️ 快速开始 69 | 70 | ### 环境配置 71 | 72 | > 相关配置需求在utils/文件夹下。 73 | 74 | ### Linux环境(ubuntu) 75 | 1. 使用linux_environment.yml创建新环境 76 | ```bash 77 | # 注意:将"MPE"替换为您喜欢的环境名称 78 | conda env create -f utils/linux_environment.yml -n MPE 79 | # 激活刚创建的环境 80 | conda activate MPE 81 | ``` 82 | 2. pip安装核心依赖 83 | ```bash 84 | pip install -r utils/pip-requirements.txt 85 | ``` 86 | ### Mac M系列芯片环境 87 | 1. 使用mac_arm_M4_environment.yml创建新conda环境 88 | ```bash 89 | # 注意:将"MPE"替换为您喜欢的环境名称 90 | conda env create -f utils/mac_arm_M4_environment.yml -n MPE 91 | # 激活刚创建的环境 92 | conda activate MPE 93 | ``` 94 | 2. pip安装Mac M芯片专用依赖 95 | ```bash 96 | pip install -r utils/pip-requirements_mac_arm_M4.txt 97 | ``` 98 | 99 | ### Windows创建并激活虚拟环境(推荐) 100 | 1. 使用conda-environment.yml创建新环境 101 | ```bash 102 | # 注意:将"MPE"替换为您喜欢的环境名称 103 | conda env create -f utils/conda-environment.yml -n MPE 104 | # 激活刚创建的环境 105 | conda activate MPE 106 | ``` 107 | 2. pip安装核心依赖 108 | ```bash 109 | pip install -r utils/pip-requirements.txt 110 | ``` 111 | ### 手动安装依赖 112 | > 上述虚拟环境创建成功后,您需要手动安装以下依赖: 113 | 3. 从PyTorch官网安装对应版本的PyTorch 114 | ```bash 115 | # 请访问 https://pytorch.org 选择适合您系统的安装命令 116 | # 例如: 117 | pip3 install torch torchvision torchaudio 118 | ``` 119 | 120 | 4. 2025.4.26 update: 安装`PettingZoo 1.25.0`版本,官方PyPI仓库最新版本更新为为1.25.0,内容与1.24.4相同。MPE被拆分出PettingZoo, **警告可忽略**,`MPE2`详情可见:https://github.com/Farama-Foundation/MPE2 121 | ```bash 122 | pip install pettingzoo==1.25.0 123 | ``` 124 | 125 | 4. 
~~安装PettingZoo 1.24.4版本~~ 126 | ```bash 127 | # 重要说明:本项目需要PettingZoo 1.24.4版本,但官方PyPI仓库最新版本仅为1.24.3 128 | # 必须从GitHub源码安装才能获取1.24.4版本,安装命令为: 129 | # pip install "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git" 130 | # 或者,您可以直接运行提供的安装脚本: 131 | # python utils/setupPettingzoo.py 132 | ``` 133 | 134 | ### 🖥️ 运行配置 135 | > **注意:** 为简化使用,当前版本已不再依赖Visdom进行可视化,您可跳过下述visdom配置,但保留相关配置供需要时参考。 136 | 137 | ```bash 138 | # 启动Visdom可视化服务器(新终端) 139 | python -m visdom.server 140 | # 或指定端口 141 | python -m visdom.server -port 8097 142 | 143 | # 访问训练仪表盘: 144 | # http://localhost:8097 145 | ``` 146 | 147 | ## 🔄 训练流程 148 | 1. **参数配置** 149 | 在 [`main_parameter.py`](main_parameters.py) 中设置环境参数: 150 | ```python 151 | env_name = 'simple_tag_v3' # 可选:simple_adversary_v3/simple_spread_v3 152 | episode_num = 5000 # 总训练回合数 153 | # 训练参数 154 | batch_size = 128 # 经验回放批次大小 155 | actor_lr = 0.01 # Actor网络学习率 156 | critic_lr = 0.01 # Critic网络学习率 157 | ``` 158 | 159 | 2. **启动Visdom服务器** 160 | ```bash 161 | # 在单独的终端中启动Visdom可视化服务器 162 | python -m visdom.server 163 | # 或指定端口 164 | python -m visdom.server -port 8097 165 | 166 | # 访问训练仪表盘: 167 | # http://localhost:8097 168 | ``` 169 | 3. **运行训练脚本** 170 | ```bash 171 | # 使用默认参数训练 172 | python main_train.py 173 | ``` 174 | 4. **在 `http://localhost:8097` 监控训练进度** 175 | 176 | 5. **评估训练模型** 177 | ```bash 178 | # 渲染训练好的模型策略 179 | python main_evaluate.py 180 | ``` 181 | 182 | ### 🌐 环境定制 183 | [`envs/simple_tag_env.py`](envs/simple_tag_env.py) 扩展了PettingZoo的MPE环境: 184 | - 在 [`envs/custom_agents_dynamics.py`](envs/custom_agents_dynamics.py) 中自定义智能体动力学 185 | - 修改的奖励函数,专为Predator-Prey任务优化 186 | - 可调节的智能体物理参数: 187 | - 世界大小:2.5单位(可根据追逃需求自定义) 188 | - 时间步长:0.1秒(影响动作响应速度) 189 | - 阻尼系数:0.2(影响智能体的惯性) 190 | - 碰撞参数: 191 | - 接触力:1e2(控制碰撞强度,影响围捕效果) 192 | - 接触边界:1e-3(控制碰撞柔软度) 193 | 194 | #### 🔄 自定义追逃场景 195 | 您可以轻松配置自己的追逃环境: 196 | - 自定义Predator数量、速度和加速度 197 | - 配置Evader的逃跑策略和敏捷度 198 | - 设计围捕奖励机制,鼓励协作或竞争行为 199 | - 实现复杂地形和障碍物(通过自定义碰撞处理) 200 | 201 | 202 | ## 📦 数据管理 203 | ### 模型存储 204 | 训练模型自动保存在: 205 | ```tree 206 | ./models/ 207 | └── maddpg_models/ # MADDPG检查点目录 208 | ├── {timestamp}_agent_0_actor.pth # Actor网络参数 209 | ├── {timestamp}_agent_0_critic.pth # Critic网络参数 210 | └── ... # 其他智能体网络 211 | ``` 212 | 213 | ### 可视化系统 214 | 训练指标可视化: 215 | ```tree 216 | plot/ 217 | ├── data/ # 序列化训练指标 218 | │ └── plot_data_20240515.pkl # PyTorch张量存储 219 | └── plot_rewards.py # 可视化工具 220 | ``` 221 | 222 | ### 日志系统 223 | 实现于 [`logger.py`](utils/logger.py): 224 | - 记录训练元数据(设备、时长) 225 | - 序列化超参数 226 | - 生成训练报告 227 | 228 | ```tree 229 | logs/ 230 | ├── training_log.json # 可读训练报告 231 | └── plot_data_20240515.pkl # 原始指标数据 232 | ``` 233 | 234 | 235 | ## 🐛 已知问题与解决方案 236 | 我们整理了一份详细的已知问题及其解决方案文档,包括: 237 | - **Windows系统渲染无响应问题**:修复PettingZoo的渲染问题 238 | - **PettingZoo版本兼容性问题**:本项目需要1.24.4版本 239 | - **Visdom服务器连接问题**:确保可视化服务正常运行 240 | - **奖励函数修改**:官方的奖励配置无法训练出好的效果,需要修改追捕者的奖励函数 241 | 👉 **[点击查看完整的已知问题与解决方案文档](KNOWN_ISSUES.md)** 242 | 243 | 如果您遇到文档中未提及的问题,请在Issues中提交,我们将尽快解决。 244 | 245 | ## 🤝 贡献 246 | 本项目的主要贡献在于: 247 | - 针对Predator-Prey追逃博弈任务的环境适配与优化 248 | - 改进的奖励函数设计,解决官方环境训练效果不佳的问题 249 | - 灵活的围捕控制参数配置,支持多种追逃场景 250 | 251 | 如遇到任何问题,欢迎提交Issue或Pull Request。若您有兴趣扩展更多追逃博弈场景,欢迎您的贡献! 
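
### 📎 附:环境物理参数调整示例

下面是一个最小示意(假设性用法,仅供参考),基于本仓库 `envs/custom_agents_dynamics.py` 中的 `CustomWorld` 类,演示如何调整上文「🌐 环境定制」一节列出的世界大小、时间步长、阻尼与碰撞参数;实际接入方式以 `envs/simple_tag_env.py` 为准:

```python
# 假设性示例:手动构造 CustomWorld 并修改物理参数(真实训练流程中由 simple_tag_env.py 构建环境)
from envs.custom_agents_dynamics import CustomWorld

world = CustomWorld(world_size=2.5)   # 世界大小:2.5单位,可按追逃需求调整
world.dt = 0.1                        # 时间步长:影响动作响应速度
world.damping = 0.2                   # 阻尼系数:影响智能体惯性
world.contact_force = 1e2             # 接触力:控制碰撞强度,影响围捕效果
world.contact_margin = 1e-3           # 接触边界:控制碰撞柔软度
```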
252 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/Centralized/readme.md: -------------------------------------------------------------------------------- 1 | # Centralized RL 实现 2 | 3 | ## 算法特点 4 | - 集中式训练和执行 5 | - 将多智能体系统视为单一控制问题 6 | - 考虑所有智能体的全局状态和联合动作 7 | 8 | ## 核心组件 9 | - `CentralizedRL.py`: 集中式学习算法的主要实现 10 | - `DDPG_agent.py`: 改进的 DDPG 算法 11 | - `NN_actor.py`: 集中式 Actor 网络 12 | - `NN_critic.py`: 集中式 Critic 网络 13 | 14 | ## 优缺点 15 | 优点: 16 | - 能获得理论上的最优策略 17 | - 完整利用全局信息 18 | 19 | 缺点: 20 | - 状态空间和动作空间随智能体数量指数增长 21 | - 实际部署时需要集中式控制 22 | 23 | 24 | | 2025.2.18 updated. 25 |
26 | 1. 共享reward函数:所有智能体使用同一个奖励函数。 27 | 2. 定义:所有智能体共享同一个全局网络,通过智能体ID或角色标识区分。
28 | 输入输出:
29 | Actor:接收自身观测+智能体ID,输出动作。(待核实)
30 | Critic:接收全局状态+所有动作+智能体ID,输出Q值。(待核实)
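
下面给出一个示意性草图(假设性代码,并非本仓库已实现的模块),用于说明上述「Critic:接收全局状态+所有动作+智能体ID」这一设计中输入的组织方式;其中智能体ID按one-hot拼接属于本草图的假设,网络结构沿用仓库中常用的64维双隐层MLP:

```python
# 示意草图(假设性代码):集中式Critic,智能体ID以one-hot形式拼入输入
import torch
import torch.nn as nn

class CentralizedCritic(nn.Module):
    """输入:全局状态 + 所有智能体动作 + 当前智能体ID(one-hot),输出:该智能体的Q值"""
    def __init__(self, global_state_dim, joint_action_dim, n_agents, hidden_dim=64):
        super().__init__()
        in_dim = global_state_dim + joint_action_dim + n_agents  # ID 的 one-hot 维度等于智能体数量
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, global_state, joint_action, agent_onehot):
        x = torch.cat([global_state, joint_action, agent_onehot], dim=-1)
        return self.net(x)  # 形状: [batch, 1]
```

Actor 部分同理:将自身观测与智能体ID拼接后送入共享网络输出动作即可,具体实现仍待核实。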
-------------------------------------------------------------------------------- /MADDPG_Continous/agents/Independent/readme.md: -------------------------------------------------------------------------------- 1 | # Independent RL 实现 2 | 3 | ## 算法特点 4 | - 每个智能体独立学习和决策 5 | - 将多智能体问题转化为多个单智能体问题 6 | - 不考虑其他智能体的行为和策略 7 | 8 | ## 核心组件 9 | - `IndependentRL.py`: 独立学习算法的主要实现 10 | - `DDPG_agent.py`: 单智能体 DDPG 算法 11 | - `NN_actor.py`: Actor 网络结构 12 | - `NN_critic.py`: Critic 网络结构 13 | 14 | ## 优缺点 15 | 优点: 16 | - 实现简单,训练稳定 17 | - 易于并行化 18 | 19 | 缺点: 20 | - 忽略智能体间的交互 21 | - 难以学习协作行为 22 | 23 | 24 | | 2025.2.18 updated. 25 |
26 | 1. reward独立. 27 | 2. 智能体独自决策,没有信息共享。 action = actor(obs); Q = critic(obs, action). (应该没错) -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/DDPG_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | from torch.optim import Adam 9 | from agents.maddpg.NN_actor import MLPNetworkActor 10 | from agents.maddpg.NN_critic import MLPNetworkCritic 11 | 12 | class DDPG(): 13 | def __init__(self, obs_dim, act_dim, global_obs_dim, actor_lr, critic_lr, device, action_bound, chkpt_dir, chkpt_name): 14 | self.actor = MLPNetworkActor(in_dim=obs_dim, out_dim=act_dim, hidden_dim = 64, action_bound=action_bound, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'actor.pth')).to(device) 15 | self.critic = MLPNetworkCritic(in_dim=global_obs_dim, out_dim=1, hidden_dim = 64, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'critic.pth')).to(device) 16 | #优化器 17 | self.actor_optimizer = Adam(self.actor.parameters(), lr = actor_lr) 18 | self.critic_optimizer = Adam(self.critic.parameters(), lr = critic_lr) 19 | # 创建相对于的target网络 20 | """ 21 | 使用 deepcopy 创建 target 网络是一个更好的选择,原因如下: 22 | 初始化一致性: 23 | - deepcopy 确保 target 网络和原网络完全相同的初始参数 24 | - 重新创建网络可能因为随机初始化导致参数不一致 25 | """ 26 | self.target_actor = deepcopy(self.actor) 27 | self.target_critic = deepcopy(self.critic) 28 | 29 | def action(self, obs, model_out = False): 30 | # 其中没有用到logi, 接受其返回值第二项为 '_' 具体地: a, _ = self.agents[agent].action(o) 31 | action, logi = self.actor(obs) 32 | return action, logi 33 | 34 | def target_action(self,obs): 35 | action, logi = self.target_actor(obs) 36 | return action, logi 37 | 38 | def critic_value(self, state_list: List[Tensor], act_list: List[Tensor]): # 包含Tensor对象的列表 39 | x = torch.cat(state_list + act_list, 1) 40 | return self.critic(x).squeeze(1) # tensor with a given length 41 | 42 | def target_critic_value(self, state_list: List[Tensor], act_list: List[Tensor]): 43 | x = torch.cat(state_list + act_list, 1) 44 | return self.target_critic(x).squeeze(1) # tensor with a given length 45 | 46 | def update_actor(self, loss): 47 | self.actor_optimizer.zero_grad() 48 | loss.backward() 49 | nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) # clip_grad_norm_ :带有下划线后缀,表示这是一个就地操作,会直接修改传入的参数梯度。 50 | self.actor_optimizer.step() 51 | 52 | def update_critic(self, loss): 53 | self.critic_optimizer.zero_grad() 54 | loss.backward() 55 | nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) # clip_grad_norm_ :带有下划线后缀,表示这是一个就地操作,会直接修改传入的参数梯度。 56 | self.critic_optimizer.step() 57 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/MADDPG_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from agents.maddpg.DDPG_agent import DDPG 7 | from agents.maddpg.buffer import BUFFER 8 | 9 | class MADDPG(): 10 | # device = 'cpu' 11 | # device = 'cuda' if torch.cuda.is_available() else 'cpu' 12 | 13 | def __init__(self, dim_info, capacity, batch_size, actor_lr, critic_lr, action_bound, _chkpt_dir, _device = 'cpu', _model_timestamp = None): 14 | # 确保模型保存路径存在 15 | if _chkpt_dir is not None: 16 | os.makedirs(_chkpt_dir, exist_ok=True) 17 | 18 | self.device = _device 19 | self.model_timestamp = 
_model_timestamp 20 | # 状态(全局观测)与所有智能体动作维度的和 即critic网络的输入维度 dim_info = [obs_dim, act_dim] 21 | global_obs_act_dim = sum(sum(val) for val in dim_info.values()) 22 | # 创建智能体与buffer,每个智能体有自己的buffer, actor, critic 23 | self.agents = {} 24 | self.buffers = {} 25 | for agent_id, (obs_dim, act_dim) in dim_info.items(): 26 | # print("dim_info -> agent_id:",agent_id) 27 | # 每一个智能体都是一个DDPG智能体 28 | 29 | self.agents[agent_id] = DDPG(obs_dim, act_dim, global_obs_act_dim, actor_lr, critic_lr, self.device, action_bound[agent_id], chkpt_name = (agent_id + '_'), chkpt_dir = _chkpt_dir) 30 | # buffer均只是存储自己的观测与动作 31 | self.buffers[agent_id] = BUFFER(capacity, obs_dim, act_dim, self.device) 32 | self.dim_info = dim_info 33 | self.batch_size = batch_size 34 | 35 | def add(self, obs, action, reward, next_obs, done): 36 | #NOTE that the experience is a dict with agent name as its key 37 | for agent_id in obs.keys(): 38 | o = obs[agent_id] 39 | a = action[agent_id] 40 | if isinstance(a, int): #返回值为True or False, 判断a是否为int类型,是,返回True。 41 | # the action from env.action_space.sample() is int, we have to convert it to onehot 42 | a = np.eye(self.dim_info[agent_id][1])[a] 43 | r = reward[agent_id] 44 | next_o = next_obs[agent_id] 45 | d = done[agent_id] 46 | self.buffers[agent_id].add(o, a, r, next_o, d) 47 | 48 | def sample(self, batch_size): 49 | """sample experience from all the agents' buffers, and collect data for network input""" 50 | # get the total num of transitions, these buffers should have same number of transitions 51 | total_num = len(self.buffers['agent_0']) 52 | indices = np.random.choice(total_num, size = batch_size, replace = False) 53 | # NOTE that in MADDPG, we need the obs and actions of all agents 54 | # but only the reward and done of the current agent is needed in the calculation 55 | obs, act, reward, next_obs, done, next_act = {}, {}, {}, {}, {}, {} 56 | for agent_id, buffer in self.buffers.items(): 57 | o, a, r, n_o, d = buffer.sample(indices) 58 | obs[agent_id] = o 59 | act[agent_id] = a 60 | reward[agent_id] = r 61 | next_obs[agent_id] = n_o 62 | done[agent_id] = d 63 | # calculate next_action using target_network and next_state 64 | next_act[agent_id], _ = self.agents[agent_id].target_action(n_o) 65 | 66 | return obs, act, reward, next_obs, done, next_act 67 | 68 | def select_action(self, obs): 69 | action = {} 70 | for agent, o in obs.items(): 71 | o = torch.from_numpy(o).unsqueeze(0).float().to(self.device) 72 | a, _ = self.agents[agent].action(o) # torch.Size([1, action_size]) #action函数: action, logi = self.actor(obs) 73 | # NOTE that the output is a tensor, convert it to int before input to the environment 74 | action[agent] = a.squeeze(0).detach().cpu().numpy() 75 | return action 76 | # 更多解释-飞书链接:https://m6tsmtxj3r.feishu.cn/docx/Kb1vdqvBholiIUxcvYxcIcBcnEg?from=from_copylink 密码:6u2257#8 77 | def learn(self, batch_size, gamma): 78 | for agent_id, agent in self.agents.items(): 79 | obs, act, reward, next_obs, done, next_act = self.sample(batch_size) 80 | # upate critic 81 | critic_value = agent.critic_value( list(obs.values()), list(act.values()) ) 82 | 83 | next_target_critic_value = agent.target_critic_value(list(next_obs.values()), 84 | list(next_act.values())) 85 | target_value = reward[agent_id] + gamma * next_target_critic_value* (1-done[agent_id]) 86 | critic_loss = F.mse_loss(critic_value, target_value.detach(), reduction = 'mean') 87 | agent.update_critic(critic_loss) 88 | 89 | #update actor 90 | action, logits = agent.action(obs[agent_id], model_out = True) 91 | 
act[agent_id] = action 92 | actor_loss = - agent.critic_value( list(obs.values()), list(act.values()) ).mean() 93 | actor_loss_pse = torch.pow(logits, 2).mean() #这个是干嘛的? 94 | agent.update_actor(actor_loss + 1e-3 *actor_loss_pse) 95 | 96 | def update_target(self, tau): # 嵌套函数定义 97 | def soft_update(from_network, to_network): 98 | """ copy the parameters of `from_network` to `to_network` with a proportion of tau """ 99 | for from_p, to_p in zip(from_network.parameters(), to_network.parameters()): 100 | to_p.data.copy_(tau * from_p.data + (1.0 - tau) * to_p.data) 101 | 102 | for agent in self.agents.values(): 103 | soft_update(agent.actor, agent.target_actor) #体现使用嵌套函数的作用! 易于维护和使用 104 | soft_update(agent.critic, agent.target_critic) 105 | 106 | @classmethod 107 | def load( cls, dim_info, file): 108 | """ init maddpg using the model saved in `file` """ 109 | instance = cls(dim_info, 0, 0, 0, 0, os.path.dirname(file)) 110 | data = torch.load(file, map_location=instance.device) 111 | for agent_id, agent in instance.agents.items(): 112 | agent.actor.load_state_dict(data[agent_id]) 113 | return instance 114 | 115 | def save_model(self): 116 | for agent_id in self.dim_info.keys(): 117 | self.agents[agent_id].actor.save_checkpoint(is_target = False, timestamp = True) 118 | self.agents[agent_id].target_actor.save_checkpoint(is_target = True, timestamp = True) 119 | self.agents[agent_id].critic.save_checkpoint(is_target = False, timestamp = True) 120 | self.agents[agent_id].target_critic.save_checkpoint(is_target = True, timestamp = True) 121 | 122 | agent_id = list(self.dim_info.keys())[0] # 获取第一个代理的 ID 123 | agent = self.agents[agent_id] 124 | for name, param in agent.actor.state_dict().items(): 125 | # 仅打印前几个值(例如前5个) 126 | print(f"Layer: {name}, Shape: {param.shape}, Values: {param.flatten()[:5]}") # flatten() 展开参数为一维数组 127 | 128 | 129 | def load_model(self): 130 | for agent_id in self.dim_info.keys(): 131 | self.agents[agent_id].actor.load_checkpoint(device = self.device, is_target = False, timestamp = self.model_timestamp) 132 | self.agents[agent_id].target_actor.load_checkpoint(device = self.device, is_target = True, timestamp = self.model_timestamp) 133 | self.agents[agent_id].critic.load_checkpoint(device = self.device, is_target = False, timestamp = self.model_timestamp) 134 | self.agents[agent_id].target_critic.load_checkpoint(device = self.device, is_target = True, timestamp = self.model_timestamp) 135 | 136 | agent_id = list(self.dim_info.keys())[0] # 获取第一个代理的 ID 137 | agent = self.agents[agent_id] 138 | for name, param in agent.actor.state_dict().items(): 139 | # 仅打印前几个值(例如前5个) 140 | print(f"Layer: {name}, Shape: {param.shape}, Values: {param.flatten()[:5]}") # flatten() 展开参数为一维数组 141 | 142 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/NN_actor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | 7 | class MLPNetworkActor(nn.Module): 8 | def __init__(self,chkpt_name, chkpt_dir, in_dim, out_dim, action_bound, hidden_dim = 64, non_linear = nn.ReLU()): 9 | super(MLPNetworkActor, self).__init__() 10 | self.chkpt_dir = chkpt_dir 11 | self.chkpt_name = chkpt_name 12 | 13 | # different ,为什么要保持这两个信息? 
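        # 补充说明:action_bound 会在 forward() 中把 tanh 输出线性缩放到 [a_min, a_max] 的动作区间;
        # out_dim 在此处仅作记录,当前文件内后续并未再使用。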
14 | self.out_dim = out_dim 15 | self.action_bound = action_bound 16 | 17 | self.net = nn.Sequential( 18 | nn.Linear(in_dim, hidden_dim), 19 | non_linear, 20 | nn.Linear(hidden_dim, hidden_dim), 21 | non_linear, 22 | nn.Linear(hidden_dim, out_dim), 23 | ).apply(self.init) 24 | 25 | @staticmethod 26 | def init(m): 27 | '''init patameters of the module''' 28 | gain = nn.init.calculate_gain('relu') 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 31 | m.bias.data.fill_(0.01) 32 | 33 | def forward(self, x): 34 | x = self.net(x) 35 | logi = x 36 | a_min = self.action_bound[0] 37 | a_max = self.action_bound[1] 38 | ''' 这三行为什么要这么处理? 引入了bias项干嘛''' 39 | k = torch.tensor( (a_max - a_min) /2 , device=x.device ) 40 | bias = torch.tensor( (a_max + a_min) /2, device=x.device ) 41 | action = k * torch.tanh(x) + bias 42 | return action, logi 43 | 44 | def save_checkpoint(self, is_target=False, timestamp = False): 45 | # 使用时间戳保存功能 46 | if timestamp is True: 47 | # 使用时间戳创建新文件夹 48 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 49 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 50 | else: 51 | # 直接保存在主目录下,不使用时间戳 52 | save_dir = self.chkpt_dir 53 | 54 | # 确保目录存在 55 | os.makedirs(save_dir, exist_ok=True) 56 | 57 | # 创建保存路径 58 | self.chkpt_file = os.path.join(save_dir, self.chkpt_name) 59 | 60 | if is_target: 61 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 62 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 63 | torch.save(self.state_dict(), target_chkpt_name) 64 | else: 65 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 66 | torch.save(self.state_dict(), self.chkpt_file) 67 | 68 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): # 默认加载target 69 | if timestamp and isinstance(timestamp, str): 70 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 71 | load_dir = os.path.join(self.chkpt_dir, timestamp) 72 | else: 73 | # 否则从主目录加载 74 | load_dir = self.chkpt_dir 75 | 76 | # 使用os.path.join确保路径分隔符的一致性 77 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 78 | 79 | if is_target: 80 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 81 | # 确保路径存在 82 | if not os.path.exists(target_chkpt_name): 83 | print(f"警告: 找不到目标模型文件: {target_chkpt_name}") 84 | return 85 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 86 | else: 87 | # 确保路径存在 88 | if not os.path.exists(self.chkpt_file): 89 | print(f"警告: 找不到模型文件: {self.chkpt_file}") 90 | return 91 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/NN_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | """ 7 | self.target_critic = CriticNetwork(*, *, 8 | chkpt_dir=chkpt_dir, 9 | name=self.agent_name+'_target_critic') 10 | """ 11 | class MLPNetworkCritic(nn.Module): 12 | def __init__(self, chkpt_name, chkpt_dir, in_dim, out_dim, hidden_dim = 64, non_linear = nn.ReLU()): 13 | super(MLPNetworkCritic, self).__init__() 14 | self.chkpt_dir = chkpt_dir 15 | self.chkpt_name = chkpt_name 16 | 17 | self.net = nn.Sequential( 18 | nn.Linear(in_dim, hidden_dim), 19 | non_linear, 20 | nn.Linear(hidden_dim, hidden_dim), 21 | non_linear, 22 | 
nn.Linear(hidden_dim, out_dim), 23 | ).apply(self.init) 24 | 25 | @staticmethod 26 | def init(m): 27 | '''init patameters of the module''' 28 | gain = nn.init.calculate_gain('relu') 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 31 | m.bias.data.fill_(0.01) 32 | 33 | def forward(self, x): 34 | return self.net(x) 35 | 36 | def save_checkpoint(self, is_target = False, timestamp = False): 37 | if timestamp is True: 38 | # 使用时间戳创建新文件夹 39 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 40 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 41 | else: 42 | # 直接保存在主目录下 43 | save_dir = self.chkpt_dir 44 | 45 | # 确保目录存在 46 | os.makedirs(save_dir, exist_ok=True) 47 | 48 | self.chkpt_file = os.path.join(save_dir, self.chkpt_name) 49 | 50 | if is_target: 51 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 52 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 53 | torch.save(self.state_dict(), target_chkpt_name) 54 | else: 55 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 56 | torch.save(self.state_dict(), self.chkpt_file) 57 | 58 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): 59 | if timestamp and isinstance(timestamp, str): 60 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 61 | load_dir = os.path.join(self.chkpt_dir, timestamp) 62 | else: 63 | # 否则从主目录加载 64 | load_dir = self.chkpt_dir 65 | 66 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 67 | 68 | if is_target: 69 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 70 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 71 | else: 72 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) 73 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class BUFFER(): 5 | 6 | def __init__(self,capacity, obs_dim, act_dim, device): 7 | self.capacity = capacity 8 | self.obs = np.zeros((capacity, obs_dim)) 9 | self.action = np.zeros((capacity, act_dim)) 10 | self.reward = np.zeros(capacity) 11 | self.next_obs = np.zeros((capacity, obs_dim)) 12 | self.done = np.zeros(capacity, dtype = bool) 13 | self._index = 0 14 | self._size = 0 15 | self.device = device 16 | 17 | def add(self,obs, action, reward, next_obs, done): 18 | self.obs[self._index] = obs 19 | self.action[self._index] = action 20 | self.reward[self._index] = reward 21 | self.next_obs[self._index] = next_obs 22 | self.done[self._index] = done 23 | 24 | self._index = (self._index +1) % self.capacity 25 | if self._size < self.capacity: 26 | self._size += 1 27 | 28 | 29 | def sample(self, indices): 30 | obs = self.obs[indices] 31 | action = self.action[indices] 32 | reward = self.reward[indices] 33 | next_obs = self.next_obs[indices] 34 | done = self.done[indices] 35 | 36 | obs = torch.from_numpy(obs).float().to(self.device) # torch.Size([batch_size, state_dim]) 37 | action = torch.from_numpy(action).float().to(self.device) # torch.Size([batch_size, action_dim]) 38 | reward = torch.from_numpy(reward).float().to(self.device) # just a tensor with length: batch_size 39 | # reward = (reward - reward.mean()) / (reward.std() + 1e-7) 40 | next_obs = torch.from_numpy(next_obs).float().to(self.device) # Size([batch_size, state_dim]) 41 | done = 
torch.from_numpy(done).float().to(self.device) # just a tensor with length: batch_size 42 | 43 | return obs, action, reward, next_obs, done 44 | 45 | def __len__(self): #保留方法 46 | return self._size 47 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/readme.md: -------------------------------------------------------------------------------- 1 | 2025.2.18 2 | 3 | TODO: 4 | 5 | 1. 经典MADDPG:Critic是集中式的(全局输入),但每个智能体可独立更新Critic。 (需找经典代码)~ 6 | 7 | 8 | # 一、独立Critic网络的核心意义 9 | 1. 异构奖励函数支持 10 | 竞争场景:若智能体间存在利益冲突(如对抗游戏),每个Critic需学习不同的Q函数以反映各自奖励目标。 11 | 例如:足球游戏中,进攻方Critic需评估射门收益,防守方Critic需评估拦截收益。 12 | 混合协作场景:部分智能体可能有辅助性奖励(如无人机编队中的领航者与跟随者)。 13 | 14 | 2. 策略独立性 15 | 策略空间差异:即使输入相同,不同智能体的Actor网络输出动作分布不同,Critic需独立评估各自策略的全局影响。 16 | 非对称学习速率:独立Critic允许智能体以不同速度学习,避免共享网络导致的策略耦合震荡。 17 | 3. 实现灵活性 18 | 扩展性:支持未来扩展至异构观测/动作空间(如部分智能体为连续控制,其他为离散决策)。 19 | 调试便利:独立网络便于单独监控和调整特定智能体的学习过程。 20 | 21 | # 二、输入相同时的Critic差异性来源 22 | 即使Critic输入相同(所有Agent的obs+actions),以下因素仍会导致各Critic输出不同: 23 | 24 | 1. 网络参数独立性 25 | 初始随机化:独立网络参数初始值不同,导致梯度更新路径分化。 26 | 优化过程差异:不同Critic的优化器状态(如动量)独立积累。 27 | 2. 目标Q值差异 28 | 奖励函数不同:若 r_i ≠ r_j,目标Q值 target_q = r_i + γQ' 直接不同。 29 | 下一状态动作差异:不同智能体的目标Actor生成的动作策略不同(如进攻者选择突破,防守者选择拦截)。 30 | 3. 环境动力学影响 31 | 状态转移差异:不同智能体对环境的改变方式不同(如机器人推箱子任务中,不同推法导致不同后续状态)。 32 | # 三、独立Critic的代价与优化 33 | 1. 计算开销分析 34 | 训练速度:独立Critic的并行计算可通过GPU批处理缓解,实际影响有限。 35 | 内存占用:网络参数数量与智能体数量线性增长,可通过网络结构简化(如共享隐层)优化。 36 | 2. 优化策略 37 | 参数共享试探:在同构完全协作场景中,可尝试同类智能体共享Critic。 38 | ``` 39 | { 40 | # 示例:追击者共享Critic 41 | class SharedCritic(nn.Module): 42 | def __init__(self): 43 | super().__init__() 44 | self.fc1 = nn.Linear(global_input_dim, 64) 45 | } 46 | ``` 47 | 48 | # 初始化时分配共享实例 49 | chaser_critic = SharedCritic() 50 | for agent in chaser_agents: 51 | agent.critic = chaser_critic} 52 | # 初始化时分配共享实例 53 | chaser_critic = SharedCritic() 54 | for agent in chaser_agents: 55 | agent.critic = chaser_critic 56 | 分布式训练:利用多GPU或Ray框架实现并行更新。 57 | # 四、场景驱动的设计选择 58 | 59 | |场景类型|推荐架构|理由| 60 | |---|---|---| 61 | 完全协作+同构| 共享Critic(同类智能体) |减少冗余计算,利用环境对称性
62 | 竞争/混合奖励| 独立Critic| 反映不同奖励函数和策略目标 63 | 异构观测/动作空间| 独立Critic| 适应不同输入输出维度 64 | 初步算法验证| 独立Critic| 实现简单,避免共享逻辑复杂性 65 | 66 | # 五、代码实现对比解析 67 | ### 用户代码1(混合MADDPG/DDPG) 68 | https://github.com/shariqiqbal2810/maddpg-pytorch/blob/master/algorithms/maddpg.py
69 | 1. Critic输入:
70 | - MADDPG模式:全局obs+actions → 输入相同但Critic独立。
71 | - DDPG模式:仅自身obs+action → 输入不同。
72 | 2. 设计意图:兼容独立训练(DDPG)与协作训练(MADDPG),牺牲效率换取灵活性。 73 | ### 用户代码2(标准MADDPG) 74 | https://github.com/starry-sky6688/MADDPG/blob/master/maddpg/maddpg.py 75 | 1. Critic输入:强制全局obs+actions → 输入相同但Critic独立。 76 | 2. 设计意图:严格遵循CTDE范式,适合同构协作场景,扩展性较弱但结构清晰。 77 | 78 | # 六、总结 79 | 1. 必要性:独立Critic是处理异构奖励、策略差异和环境非平稳性的核心设计,即使输入相同,各Critic仍需独立更新以捕捉不同策略的全局影响。 80 | 2. 效率权衡:通过参数共享试探和分布式训练可缓解计算开销,但在多数复杂场景中,独立Critic的收益远大于其成本。 81 | 3. 实践建议:优先采用独立Critic实现,待任务明确后针对性优化(如同类共享)。 -------------------------------------------------------------------------------- /MADDPG_Continous/envs/custom_agents_dynamics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 该文件定义了自定义的环境,用于测试自定义的智能体动力学模型 3 | 4 | 继承自core.py 5 | 6 | """ 7 | import numpy as np 8 | from pettingzoo.mpe._mpe_utils.core import EntityState, AgentState, Action, Entity, Landmark, Agent 9 | from pettingzoo.mpe._mpe_utils.core import World 10 | 11 | class CustomWorld(World): 12 | def __init__(self, world_size = 2.5 ): # 13 | super().__init__() # 调用父类的构造函数 14 | self.world_size = world_size # Ronchy 添加世界大小 15 | self.dt = 0.1 # 时间步长 16 | self.damping = 0.2 # 阻尼系数 17 | # contact response parameters 18 | self.contact_force = 1e2 # 控制碰撞强度(默认1e2,值越大反弹越强) 19 | self.contact_margin = 1e-3 # 控制碰撞"柔软度"(默认1e-3,值越小越接近刚体) 20 | """ 21 | 常见问题示例 22 | 实体重叠穿透 contact_force太小 增大contact_force至1e3或更高 23 | 碰撞后震荡 damping太低 增大阻尼系数(如0.5) 24 | 微小距离抖动 contact_margin不合理 调整到1e-2~1e-4之间 25 | """ 26 | """ 27 | 重载底层动力学逻辑 28 | 主要是integrate_state()函数 29 | """ 30 | def step(self): 31 | # set actions for scripted agents 32 | # print("Using world -> step()") # 重载成功! 33 | for agent in self.scripted_agents: 34 | agent.action = agent.action_callback(agent, self) 35 | # gather forces applied to entities 36 | p_force = [None] * len(self.entities) 37 | # apply agent physical controls 38 | p_force = self.apply_action_force(p_force) # 加入噪声 39 | # apply environment forces 40 | p_force = self.apply_environment_force(p_force) # 碰撞力计算 collide为True时 41 | # integrate physical state 42 | self.integrate_state(p_force) # 动力学逻辑 43 | # update agent state 44 | for agent in self.agents: 45 | self.update_agent_state(agent) # 更新 communication action 后的状态 46 | 47 | # integrate physical state 48 | #函数功能:动力学逻辑。更新实体的位置和速度 49 | def integrate_state(self, p_force): 50 | for i, entity in enumerate(self.entities): 51 | if not entity.movable: 52 | continue 53 | # 速度阻尼衰减 54 | entity.state.p_vel *= (1 - self.damping) # 正确应用阻尼 55 | # 动力学 -> 运动学 56 | if p_force[i] is not None: 57 | acceleration = p_force[i] / entity.mass # F = ma 58 | entity.state.p_vel += acceleration * self.dt # v = v_0 + a * t 59 | # 更新位置 60 | entity.state.p_pos += entity.state.p_vel * self.dt # 更新位置 61 | # 限制位置在世界大小范围内 62 | # entity.state.p_pos = np.clip(entity.state.p_pos, -self.world_size, self.world_size) # Ronchy 添加世界大小限制 63 | 64 | # 速度限幅 65 | if entity.max_speed is not None: 66 | ######## 67 | speed = np.sqrt( 68 | np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1]) 69 | ) 70 | if speed > entity.max_speed: 71 | entity.state.p_vel = ( 72 | entity.state.p_vel 73 | / np.sqrt( 74 | np.square(entity.state.p_vel[0]) 75 | + np.square(entity.state.p_vel[1]) 76 | ) 77 | * entity.max_speed 78 | ) 79 | ########可替换为下列代码 ,效果相同 80 | # speed = np.linalg.norm(entity.state.p_vel) # 计算向量模长 81 | # if speed > entity.max_speed: 82 | # entity.state.p_vel = entity.state.p_vel * (entity.max_speed / speed) # 向量缩放 83 | 84 | 85 | # get collision forces for any contact between two entities 86 | # TODO: 碰撞逻辑待细化 87 | def 
get_collision_force(self, entity_a, entity_b): 88 | if (not entity_a.collide) or (not entity_b.collide): 89 | return [None, None] # not a collider 90 | if entity_a is entity_b: 91 | return [None, None] # don't collide against itself 92 | # compute actual distance between entities 93 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 94 | dist = np.sqrt(np.sum(np.square(delta_pos))) #用norm更简洁 95 | # minimum allowable distance 96 | dist_min = entity_a.size + entity_b.size # 两个实体的半径之和 97 | # softmax penetration 98 | k = self.contact_margin 99 | penetration = np.logaddexp(0, -(dist - dist_min) / k) * k #渗透深度, 当 dist < dist_min 时产生虚拟渗透量 100 | force = self.contact_force * delta_pos / dist * penetration 101 | force_a = +force if entity_a.movable else None 102 | force_b = -force if entity_b.movable else None 103 | return [force_a, force_b] 104 | -------------------------------------------------------------------------------- /MADDPG_Continous/main_evaluate.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from main_parameters import main_parameters 3 | from utils.runner import RUNNER 4 | from agents.maddpg.MADDPG_agent import MADDPG 5 | import torch 6 | from envs import simple_tag_env 7 | import os 8 | 9 | def get_env(env_name, ep_len=50, render_mode = "None"): 10 | """create environment and get observation and action dimension of each agent in this environment""" 11 | new_env = None 12 | if env_name == 'simple_adversary_v3': 13 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 14 | if env_name == 'simple_spread_v3': 15 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 16 | if env_name == 'simple_tag_v3': 17 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 18 | # new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 19 | new_env.reset() 20 | _dim_info = {} 21 | action_bound = {} 22 | for agent_id in new_env.agents: 23 | print("agent_id:",agent_id) 24 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 25 | action_bound[agent_id] = [] #[low action, hign action] 26 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 27 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 28 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 29 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 30 | 31 | return new_env, _dim_info, action_bound 32 | 33 | 34 | 35 | if __name__ == '__main__': 36 | device ='cpu' 37 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 38 | print("Using device:",device) 39 | # 模型存储路径 40 | current_dir = os.path.dirname(os.path.abspath(__file__)) 41 | chkpt_dir = os.path.join(current_dir, 'models/maddpg_models/') 42 | # 加载模型的时间戳 43 | load_timestamp = "" # 请输入形如:2025-04-15_15-51 -> 时间戳位置models/maddpg_models/xxxx 44 | model_timestamp = None if load_timestamp == '' else load_timestamp 45 | # 定义参数 46 | args = main_parameters() 47 | args.render_mode = "human" 48 | 49 | # 创建环境 50 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode) 51 | # print(env, dim_info, action_bound) 52 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意 53 | agent 
= MADDPG(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, _chkpt_dir = chkpt_dir, _model_timestamp = model_timestamp) 54 | print("--- Loading models ---") 55 | agent.load_model() 56 | print('---- Evaluating ----') 57 | env.reset() 58 | runner = RUNNER(agent, env, args, device, mode = 'evaluate') 59 | runner.evaluate() # 使用evaluate方法 60 | print('---- Done! ----') 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /MADDPG_Continous/main_evaluate_save_render2gif.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from main_parameters import main_parameters 3 | from utils.runner import RUNNER 4 | from agents.maddpg.MADDPG_agent import MADDPG 5 | import torch 6 | from envs import simple_tag_env 7 | import os 8 | import numpy as np 9 | import imageio # 需要安装: pip install imageio 10 | 11 | 12 | def get_env(env_name, ep_len=50, render_mode = "None"): 13 | """create environment and get observation and action dimension of each agent in this environment""" 14 | new_env = None 15 | if env_name == 'simple_adversary_v3': 16 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 17 | if env_name == 'simple_spread_v3': 18 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 19 | if env_name == 'simple_tag_v3': 20 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 21 | # new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 22 | new_env.reset() 23 | _dim_info = {} 24 | action_bound = {} 25 | for agent_id in new_env.agents: 26 | print("agent_id:",agent_id) 27 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 28 | action_bound[agent_id] = [] #[low action, hign action] 29 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 30 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 31 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 32 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 33 | 34 | return new_env, _dim_info, action_bound 35 | 36 | # 修改RUNNER类以捕获帧 37 | class RecordingRunner(RUNNER): 38 | def evaluate(self): 39 | # 记录每个episode的和奖励 用于平滑,显示平滑奖励函数 40 | self.reward_sum_record = [] 41 | # 记录每个智能体在每个episode的奖励 42 | self.episode_rewards = {agent_id: np.zeros(self.par.episode_num) for agent_id in self.env.agents} 43 | # episode循环 44 | for episode in range(self.par.episode_num): 45 | step = 0 # 每回合step重置 46 | print(f"评估第 {episode + 1} 回合") 47 | # 初始化环境 返回初始状态 48 | obs, _ = self.env.reset() # 重置环境,开始新回合 49 | self.done = {agent_id: False for agent_id in self.env_agents} 50 | # 每个智能体当前episode的奖励 51 | agent_reward = {agent_id: 0 for agent_id in self.env.agents} 52 | 53 | # 捕获初始帧 54 | frame = self.env.render() 55 | if frame is not None: 56 | frames.append(frame) 57 | 58 | # 每个智能体与环境进行交互 59 | while self.env.agents: 60 | step += 1 61 | # 使用训练好的智能体选择动作 62 | action = self.agent.select_action(obs) 63 | # 执行动作 获得下一状态 奖励 终止情况 64 | next_obs, reward, terminated, truncated, info = self.env.step(action) 65 | 66 | # 捕获当前帧 67 | frame = self.env.render() 68 | if frame is not None: 69 | frames.append(frame) 70 | 71 | self.done = {agent_id: bool(terminated[agent_id] or 
truncated[agent_id]) for agent_id in self.env_agents} 72 | # 累积每个智能体的奖励 73 | for agent_id, r in reward.items(): 74 | agent_reward[agent_id] += r 75 | obs = next_obs 76 | if step % 10 == 0: 77 | print(f"Step {step}, obs: {obs}, action: {action}, reward: {reward}, done: {self.done}") 78 | sum_reward = sum(agent_reward.values()) 79 | self.reward_sum_record.append(sum_reward) 80 | 81 | if __name__ == '__main__': 82 | device ='cpu' 83 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 84 | print("Using device:",device) 85 | # 模型存储路径 86 | current_dir = os.path.dirname(os.path.abspath(__file__)) 87 | chkpt_dir = os.path.join(current_dir, 'models/maddpg_models/') 88 | # 加载模型的时间戳 89 | load_timestamp = "2025-02-19_16-38" 90 | model_timestamp = None if load_timestamp == '' else load_timestamp 91 | # 定义参数 92 | args = main_parameters() 93 | 94 | # 设置为rgb_array模式以便捕获帧 95 | args.render_mode = "rgb_array" # 修改为rgb_array以便捕获帧 96 | args.episode_num = 5 97 | 98 | # 创建环境 99 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode) 100 | # print(env, dim_info, action_bound) 101 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意 102 | agent = MADDPG(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, _chkpt_dir = chkpt_dir, _model_timestamp = model_timestamp) 103 | print("--- Loading models ---") 104 | agent.load_model() 105 | print('---- Evaluating and Recording ----') 106 | 107 | # 准备录制 108 | frames = [] 109 | # 使用修改后的Runner 110 | runner = RecordingRunner(agent, env, args, device, mode='evaluate') 111 | runner.evaluate() 112 | 113 | # 保存为GIF 114 | gif_path = os.path.join(current_dir, 'plot', f'{args.env_name}_demo.gif') 115 | print(f"正在保存GIF到: {gif_path}") 116 | imageio.mimsave(gif_path, frames, fps=10) 117 | 118 | print(f'---- 完成! 
GIF已保存到 {gif_path} ----') -------------------------------------------------------------------------------- /MADDPG_Continous/main_parameters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def main_parameters(): 4 | parser = argparse.ArgumentParser() 5 | ############################################ 选择环境 ############################################ 6 | parser.add_argument("--env_name", type =str, default = "simple_tag_v3", help = "name of the env", 7 | choices=['simple_adversary_v3', 'simple_spread_v3', 'simple_tag_v3', 'simple_tag_env']) 8 | parser.add_argument("--render_mode", type=str, default = "None", help = "None | human | rgb_array") 9 | parser.add_argument("--episode_num", type = int, default = 5) # 5000 10 | parser.add_argument("--episode_length", type = int, default = 500) #50 11 | parser.add_argument('--learn_interval', type=int, default=10, 12 | help='steps interval between learning time') 13 | parser.add_argument('--random_steps', type=int, default=500, help='random steps before the agent start to learn') # 2e3 14 | parser.add_argument('--tau', type=float, default=0.001, help='soft update parameter') 15 | parser.add_argument('--gamma', type=float, default=0.99, help='discount factor') 16 | parser.add_argument('--buffer_capacity', type=int, default=int(1e6), help='capacity of replay buffer') 17 | parser.add_argument('--batch_size', type=int, default=128, help='batch-size of replay buffer') 18 | parser.add_argument('--actor_lr', type=float, default=0.0002, help='learning rate of actor') # .00002 19 | parser.add_argument('--critic_lr', type=float, default=0.002, help='learning rate of critic') # .002 20 | # The parameters for the communication network 21 | # TODO 22 | parser.add_argument('--visdom', type=bool, default=False, help="Open the visdom") 23 | parser.add_argument('--size_win', type=int, default=200, help="Open the visdom") # 1000 24 | 25 | 26 | args = parser.parse_args() 27 | return args -------------------------------------------------------------------------------- /MADDPG_Continous/main_train.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from envs import simple_tag_env, custom_agents_dynamics 3 | 4 | from main_parameters import main_parameters 5 | from utils.runner import RUNNER 6 | 7 | from agents.maddpg.MADDPG_agent import MADDPG 8 | import torch 9 | import os 10 | 11 | import time 12 | from datetime import timedelta 13 | from utils.logger import TrainingLogger # 添加导入 14 | 15 | def get_env(env_name, ep_len=25, render_mode ="None"): 16 | """create environment and get observation and action dimension of each agent in this environment""" 17 | new_env = None 18 | if env_name == 'simple_adversary_v3': 19 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 20 | if env_name == 'simple_spread_v3': 21 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 22 | if env_name == 'simple_tag_v3': 23 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 24 | if env_name == 'simple_tag_env': 25 | new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 26 | new_env.reset() 27 | _dim_info = {} 28 | action_bound = {} 29 | for 
agent_id in new_env.agents: 30 | print("agent_id:",agent_id) 31 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 32 | action_bound[agent_id] = [] #[low action, hign action] 33 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 34 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 35 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 36 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 37 | print("_dim_info:",_dim_info) 38 | print("action_bound:",action_bound) 39 | return new_env, _dim_info, action_bound 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 45 | # device = torch.device('mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() 46 | # else 'cuda' if torch.cuda.is_available() else 'cpu') 47 | device = "cpu" 48 | print("Using device:",device) 49 | start_time = time.time() # 记录开始时间 50 | 51 | # 模型保存路径 52 | current_dir = os.path.dirname(os.path.abspath(__file__)) 53 | chkpt_dir = os.path.join(current_dir, 'models/maddpg_models/') 54 | # 定义参数 55 | args = main_parameters() 56 | # 创建环境 57 | print("Using Env's name",args.env_name) 58 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode) 59 | # print(env, dim_info, action_bound) 60 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意。 61 | agent = MADDPG(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, _chkpt_dir = chkpt_dir, _device = device) 62 | # 创建运行对象 63 | runner = RUNNER(agent, env, args, device, mode = 'train') 64 | # 开始训练 65 | runner.train() 66 | print("agent",agent) 67 | 68 | # 计算训练时间 69 | end_time = time.time() 70 | training_time = end_time - start_time 71 | # 转换为时分秒格式 72 | training_duration = str(timedelta(seconds=int(training_time))) 73 | print(f"\n===========训练完成!===========") 74 | print(f"训练设备: {device}") 75 | print(f"训练用时: {training_duration}") 76 | 77 | # 使用logger保存训练日志 78 | # 使用logger保存训练日志 79 | logger = TrainingLogger() 80 | current_time = logger.save_training_log(args, device, training_duration, runner) 81 | print(f"完成时间: {current_time}") 82 | 83 | print("--- saving trained models ---") 84 | agent.save_model() 85 | print("--- trained models saved ---") 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /MADDPG_Continous/plot/convert_gif_to_loop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import subprocess 6 | import argparse 7 | from pathlib import Path 8 | 9 | try: 10 | from PIL import Image 11 | PIL_AVAILABLE = True 12 | except ImportError: 13 | PIL_AVAILABLE = False 14 | 15 | def convert_gif_to_loop(input_path, output_path=None, backup=True): 16 | """ 17 | 将GIF转换为循环播放的GIF 18 | 19 | 参数: 20 | input_path: 输入GIF文件路径或包含GIF文件的目录 21 | output_path: 输出GIF文件路径或目录,默认为在原文件名后添加"_loop" 22 | backup: 是否备份原始文件 23 | """ 24 | # 检查是否安装了PIL 25 | if not PIL_AVAILABLE: 26 | print("错误: 未安装PIL/Pillow库。请使用以下命令安装:") 27 | print("pip install Pillow") 28 | return 29 | 30 | input_path = Path(input_path) 31 | 32 | # 检查输入路径是文件还是目录 33 | if input_path.is_file() and input_path.suffix.lower() == '.gif': 34 | gif_files = [input_path] 35 | elif input_path.is_dir(): 36 | gif_files = list(input_path.glob('*.gif')) 37 | else: 38 | print(f"错误: {input_path} 不是有效的GIF文件或目录") 39 | return 40 | 41 | if not gif_files: 42 | 
print(f"在 {input_path} 中没有找到GIF文件") 43 | return 44 | 45 | for gif_file in gif_files: 46 | # 确定输出文件路径 47 | if output_path is None: 48 | output_file = gif_file.parent / f"{gif_file.stem}_loop{gif_file.suffix}" 49 | elif Path(output_path).is_dir(): 50 | output_file = Path(output_path) / f"{gif_file.stem}_loop{gif_file.suffix}" 51 | else: 52 | output_file = Path(output_path) 53 | 54 | # 备份原始文件 55 | if backup and gif_file.exists(): 56 | backup_file = gif_file.parent / f"{gif_file.stem}_original{gif_file.suffix}" 57 | if not backup_file.exists(): 58 | print(f"备份 {gif_file} 到 {backup_file}") 59 | subprocess.run(['cp', str(gif_file), str(backup_file)]) 60 | 61 | # 使用PIL/Pillow转换GIF为循环播放 62 | print(f"转换 {gif_file} 为循环播放GIF: {output_file}") 63 | 64 | try: 65 | # 打开GIF文件 66 | img = Image.open(gif_file) 67 | 68 | # 提取所有帧 69 | frames = [] 70 | durations = [] 71 | 72 | try: 73 | while True: 74 | # 记录当前帧的持续时间 75 | durations.append(img.info.get('duration', 100)) # 默认100ms 76 | # 复制当前帧 77 | frames.append(img.copy()) 78 | # 尝试移动到下一帧 79 | img.seek(img.tell() + 1) 80 | except EOFError: 81 | pass # 到达文件末尾 82 | 83 | # 保存为循环播放的GIF 84 | if frames: 85 | frames[0].save( 86 | str(output_file), 87 | save_all=True, 88 | append_images=frames[1:], 89 | optimize=False, 90 | duration=durations, 91 | loop=0 # 0表示无限循环 92 | ) 93 | print(f"成功创建循环播放GIF: {output_file}") 94 | else: 95 | print(f"警告: {gif_file} 似乎不是有效的GIF动画") 96 | 97 | except Exception as e: 98 | print(f"处理 {gif_file} 时出错: {e}") 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser(description='将GIF转换为循环播放的GIF') 102 | parser.add_argument('input', help='输入GIF文件路径或包含GIF文件的目录') 103 | parser.add_argument('-o', '--output', help='输出GIF文件路径或目录,默认为在原文件名后添加"_loop"') 104 | parser.add_argument('--no-backup', action='store_false', dest='backup', 105 | help='不备份原始文件') 106 | 107 | args = parser.parse_args() 108 | convert_gif_to_loop(args.input, args.output, args.backup) -------------------------------------------------------------------------------- /MADDPG_Continous/plot/demo-rewards_plot_ma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MADDPG_Continous/plot/demo-rewards_plot_ma.png -------------------------------------------------------------------------------- /MADDPG_Continous/plot/plot_rewards.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import os 4 | from datetime import datetime 5 | import numpy as np 6 | import platform 7 | ''' 8 | 注意: 9 | 作者用pands==2.2.3出错了。 10 | pip install pandas==2.2.1 没问题。 11 | ''' 12 | 13 | def moving_average(data, window_size=50): 14 | """简单移动平均""" 15 | weights = np.ones(window_size) / window_size 16 | return np.convolve(data, weights, mode='valid') 17 | 18 | def exponential_moving_average(data, alpha=0.1): 19 | """指数移动平均""" 20 | ema = np.zeros_like(data) 21 | ema[0] = data[0] 22 | for i in range(1, len(data)): 23 | ema[i] = alpha * data[i] + (1 - alpha) * ema[i-1] 24 | return ema 25 | 26 | # def plot_rewards(csv_file, window_size=50, alpha=0.1): 27 | # # 读取CSV文件,不指定数据类型 28 | # df = pd.read_csv(csv_file) 29 | # # 设置中文字体(如果需要显示中文) 30 | # plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] # MacOS 31 | # plt.rcParams['axes.unicode_minus'] = False # 正常显示负号 32 | 33 | 34 | # # 计算平滑后的数据 35 | # adv_ma = moving_average(df['Adversary Average Reward'].values) 36 | # adv_ema = 
exponential_moving_average(df['Adversary Average Reward'].values) 37 | # sum_ma = moving_average(df['Sum Reward of All Agents'].values) 38 | # sum_ema = exponential_moving_average(df['Sum Reward of All Agents'].values) 39 | 40 | # # 创建图形 41 | # plt.figure(figsize=(15, 10)) 42 | 43 | # # 绘制追捕者平均奖励 44 | # plt.subplot(2, 1, 1) 45 | # plt.plot(df['Episode'], df['Adversary Average Reward'], 'lightgray', alpha=0.3, label='原始数据') 46 | # plt.plot(df['Episode'][window_size-1:], adv_ma, 'b-', linewidth=2, label='移动平均') 47 | # plt.plot(df['Episode'], adv_ema, 'r-', linewidth=2, label='指数移动平均') 48 | # plt.title('追捕者平均奖励随回合数变化') 49 | # plt.xlabel('回合数') 50 | # plt.ylabel('平均奖励') 51 | # plt.grid(True, linestyle='--', alpha=0.7) 52 | # plt.legend() 53 | 54 | # # 绘制所有智能体总奖励 55 | # plt.subplot(2, 1, 2) 56 | # plt.plot(df['Episode'], df['Sum Reward of All Agents'], 'lightgray', alpha=0.3, label='原始数据') 57 | # plt.plot(df['Episode'][window_size-1:], sum_ma, 'b-', linewidth=2, label='移动平均') 58 | # plt.plot(df['Episode'], sum_ema, 'r-', linewidth=2, label='指数移动平均') 59 | # plt.title('所有智能体总奖励随回合数变化') 60 | # plt.xlabel('回合数') 61 | # plt.ylabel('总奖励') 62 | # plt.grid(True, linestyle='--', alpha=0.7) 63 | # plt.legend() 64 | 65 | # # 调整子图之间的间距 66 | # plt.tight_layout() 67 | 68 | # # 保存图片 69 | # save_path = os.path.join(os.path.dirname(csv_file), f'rewards_plot.png') 70 | # plt.savefig(save_path, dpi=300, bbox_inches='tight') 71 | # print(f"图片已保存至 {save_path}") 72 | 73 | # # 显示图形 74 | # plt.show() 75 | 76 | def set_font_for_plot(): 77 | """根据平台动态设置字体""" 78 | system_platform = platform.system() 79 | print("system_platform:", system_platform) 80 | if system_platform == "Darwin": # MacOS 81 | font = 'Arial Unicode MS' 82 | elif system_platform == "Windows": # Windows 83 | font = 'SimHei' 84 | else: # Linux 85 | font = 'DejaVu Sans' 86 | 87 | # 设置matplotlib的字体 88 | plt.rcParams['font.sans-serif'] = [font] 89 | plt.rcParams['axes.unicode_minus'] = False # 正常显示负号 90 | 91 | def different_plot_rewards(csv_file, window_size=50, alpha=0.1): 92 | df = pd.read_csv(csv_file) 93 | 94 | # plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 95 | # plt.rcParams['axes.unicode_minus'] = False 96 | set_font_for_plot() # 设置字体 97 | 98 | # 计算平滑后的数据 99 | adv_ma = moving_average(df['Adversary Average Reward'].values, window_size) 100 | adv_ema = exponential_moving_average(df['Adversary Average Reward'].values, alpha) 101 | sum_ma = moving_average(df['Sum Reward of All Agents'].values, window_size) 102 | sum_ema = exponential_moving_average(df['Sum Reward of All Agents'].values, alpha) 103 | 104 | # 创建两个图形 105 | # 1. 
移动平均对比图 106 | plt.figure(figsize=(15, 10)) 107 | # 追捕者奖励 108 | plt.subplot(2, 1, 1) 109 | plt.plot(df['Episode'], df['Adversary Average Reward'], 'lightgray', alpha=0.3, label='原始数据') 110 | plt.plot(df['Episode'][window_size-1:], adv_ma, 'b-', linewidth=2, label='移动平均') 111 | plt.title('追捕者平均奖励 - 移动平均对比') 112 | plt.xlabel('回合数') 113 | plt.ylabel('平均奖励') 114 | plt.grid(True, linestyle='--', alpha=0.7) 115 | plt.legend() 116 | 117 | # 总奖励 118 | plt.subplot(2, 1, 2) 119 | plt.plot(df['Episode'], df['Sum Reward of All Agents'], 'lightgray', alpha=0.3, label='原始数据') 120 | plt.plot(df['Episode'][window_size-1:], sum_ma, 'b-', linewidth=2, label='移动平均') 121 | plt.title('所有智能体总奖励 - 移动平均对比') 122 | plt.xlabel('回合数') 123 | plt.ylabel('总奖励') 124 | plt.grid(True, linestyle='--', alpha=0.7) 125 | plt.legend() 126 | plt.tight_layout() 127 | 128 | # 保存移动平均对比图 129 | save_path_ma = os.path.join(os.path.dirname(csv_file), f'rewards_plot_ma.png') 130 | plt.savefig(save_path_ma, dpi=300, bbox_inches='tight') 131 | 132 | # 2. 指数移动平均对比图 133 | plt.figure(figsize=(15, 10)) 134 | # 追捕者奖励 135 | plt.subplot(2, 1, 1) 136 | plt.plot(df['Episode'], df['Adversary Average Reward'], 'lightgray', alpha=0.3, label='原始数据') 137 | plt.plot(df['Episode'], adv_ema, 'r-', linewidth=2, label='指数移动平均') 138 | plt.title('追捕者平均奖励 - 指数移动平均对比') 139 | plt.xlabel('回合数') 140 | plt.ylabel('平均奖励') 141 | plt.grid(True, linestyle='--', alpha=0.7) 142 | plt.legend() 143 | 144 | # 总奖励 145 | plt.subplot(2, 1, 2) 146 | plt.plot(df['Episode'], df['Sum Reward of All Agents'], 'lightgray', alpha=0.3, label='原始数据') 147 | plt.plot(df['Episode'], sum_ema, 'r-', linewidth=2, label='指数移动平均') 148 | plt.title('所有智能体总奖励 - 指数移动平均对比') 149 | plt.xlabel('回合数') 150 | plt.ylabel('总奖励') 151 | plt.grid(True, linestyle='--', alpha=0.7) 152 | plt.legend() 153 | plt.tight_layout() 154 | 155 | # 保存指数移动平均对比图 156 | save_path_ema = os.path.join(os.path.dirname(csv_file), f'rewards_plot_ema.png') 157 | plt.savefig(save_path_ema, dpi=300, bbox_inches='tight') 158 | 159 | print(f"移动平均对比图已保存至 {save_path_ma}") 160 | print(f"指数移动平均对比图已保存至 {save_path_ema}") 161 | 162 | plt.show() 163 | 164 | if __name__ == "__main__": 165 | # CSV文件路径(相对于当前脚本的路径) 166 | csv_file = os.path.join(os.path.dirname(__file__), 'data', 'data_rewards_2025-02-25_04-39.csv') 167 | print("csv_file name:",csv_file) 168 | 169 | if os.path.exists(csv_file): 170 | df = pd.read_csv(csv_file) 171 | # print(df.head()) 172 | # plot_rewards(csv_file) 173 | different_plot_rewards(csv_file) 174 | else: 175 | print(f"错误:未找到CSV文件:{csv_file}") -------------------------------------------------------------------------------- /MADDPG_Continous/plot/simple_tag_v3_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MADDPG_Continous/plot/simple_tag_v3_demo.gif -------------------------------------------------------------------------------- /MADDPG_Continous/plot/simple_tag_v3_demo_loop.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MADDPG_Continous/plot/simple_tag_v3_demo_loop.gif -------------------------------------------------------------------------------- /MADDPG_Continous/utils/conda-environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - 
https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmpy2=2.1.2 16 | - intel-openmp=2023.1.0 17 | - jinja2=3.1.4 18 | - jpeg=9e 19 | - lcms2=2.16 20 | - lerc=4.0.0 21 | - libdeflate=1.22 22 | - libffi=3.4.4 23 | - libjpeg-turbo=2.0.0 24 | - libpng=1.6.39 25 | - libtiff=4.5.1 26 | - libwebp-base=1.3.2 27 | - llvm-openmp=14.0.6 28 | - lz4-c=1.9.4 29 | - markupsafe=2.1.3 30 | - mkl=2023.1.0 31 | - mkl-service=2.4.0 32 | - mkl_fft=1.3.8 33 | - mkl_random=1.2.4 34 | - mpc=1.1.0 35 | - mpfr=4.0.2 36 | - mpmath=1.3.0 37 | - numpy=1.26.4 38 | - numpy-base=1.26.4 39 | - openjpeg=2.5.2 40 | - openssl=3.0.16 41 | - pip=24.2 42 | - pybind11-abi=4 43 | - pysocks=1.7.1 44 | - python=3.11.8 45 | - pyyaml=6.0.2 46 | - requests=2.32.3 47 | - setuptools=75.1.0 48 | - sqlite=3.45.3 49 | - sympy=1.13.3 50 | - tbb=2021.8.0 51 | - tk=8.6.14 52 | - typing_extensions=4.12.2 53 | - wheel=0.44.0 54 | - xz=5.4.6 55 | - yaml=0.2.5 56 | - zlib=1.2.13 57 | - zstd=1.5.6 58 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 59 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/linux_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | - mkl=2023.1.0 35 | - mkl-service=2.4.0 36 | - mkl_fft=1.3.8 37 | - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | - pip: 67 | - charset-normalizer==3.4.1 68 | - cloudpickle==3.1.0 69 | - contourpy==1.3.1 70 | - cycler==0.12.1 71 | - farama-notifications==0.0.4 72 | - fonttools==4.55.3 73 | - gymnasium==1.0.0 74 | - idna==3.10 75 | - jsonpatch==1.33 76 | - jsonpointer==3.0.0 77 | - kiwisolver==1.4.8 78 | - matplotlib==3.8.3 79 | - networkx==3.4.2 80 | - numpy==2.2.1 81 | - packaging==24.2 82 | - pandas==2.2.1 83 | - pettingzoo==1.24.4 84 | - pillow==11.1.0 85 | - pip-chill==1.0.3 86 | - pygame==2.6.1 87 | - pyparsing==3.2.1 88 | - python-dateutil==2.9.0.post0 89 | - pytz==2025.1 90 | - scipy==1.15.0 91 | - six==1.17.0 92 | - tornado==6.4.2 93 | - tzdata==2025.1 94 | - urllib3==2.3.0 95 | - 
visdom==0.2.4 96 | - websocket-client==1.8.0 97 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 98 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from datetime import datetime 5 | 6 | 7 | 8 | """ 9 | 1. 在../logs/下保存了一个 training_log.json 文件,它包含了训练的所有参数和日志信息。 10 | 2. 保存的 plot_data_{current_time.replace(':', '-')}.pkl 是一个 PyTorch 保存的文件,它并不包含模型本身,而是 训练过程中的奖励数据。 11 | 12 | """ 13 | class TrainingLogger: 14 | def __init__(self, log_dir="../logs"): 15 | # 使用绝对路径 16 | current_dir = os.path.dirname(os.path.abspath(__file__)) 17 | self.log_dir = os.path.join(current_dir,'..', 'logs') 18 | 19 | # 确保目录存在 20 | if not os.path.exists(self.log_dir): 21 | os.makedirs(self.log_dir) 22 | 23 | def save_training_log(self, args, device, training_duration, runner): 24 | current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 25 | 26 | # 准备训练日志信息 27 | log_info = { 28 | "训练时间": current_time, 29 | "训练设备": str(device), 30 | "训练用时": training_duration, 31 | "环境名称": args.env_name, 32 | "渲染模式": args.render_mode, 33 | "总回合数": args.episode_num, 34 | "每回合步数": args.episode_length, 35 | "学习间隔": args.learn_interval, 36 | "随机步数": args.random_steps, 37 | "tau": args.tau, 38 | "gamma": args.gamma, 39 | "buffer容量": args.buffer_capacity, 40 | "batch_size": args.batch_size, 41 | "actor学习率": args.actor_lr, 42 | "critic学习率": args.critic_lr, 43 | "是否使用visdom": args.visdom, 44 | "visdom窗口大小": args.size_win 45 | } 46 | 47 | # 保存训练日志 48 | log_file = os.path.join(self.log_dir, "training_log.json") 49 | 50 | # 打印当前目录和目标目录 51 | print(f"Current directory: {os.getcwd()}") 52 | print(f"Saving training log to: {log_file}") 53 | 54 | # 确保目录存在并且具有写权限 55 | if os.path.exists(self.log_dir): 56 | print(f"Log directory exists: {self.log_dir}") 57 | else: 58 | print(f"Log directory does not exist. 
Trying to create it...") 59 | os.makedirs(self.log_dir, exist_ok=True) 60 | 61 | # 读取现有的日志文件,如果存在的话 62 | existing_logs = [] 63 | if os.path.exists(log_file): 64 | with open(log_file, 'r', encoding='utf-8') as f: 65 | existing_logs = json.load(f) 66 | 67 | existing_logs.append(log_info) 68 | 69 | # 保存更新后的日志文件 70 | with open(log_file, 'w', encoding='utf-8') as f: 71 | json.dump(existing_logs, f, ensure_ascii=False, indent=4) 72 | 73 | # 保存训练曲线数据 74 | plot_data = { 75 | "all_sum_rewards": runner.all_sum_rewards, # 所有智能体的总奖励 76 | "all_adversary_avg_rewards": runner.all_adversary_avg_rewards, # 追捕者的平均奖励 77 | "episode_rewards": runner.episode_rewards, # 每个智能体的奖励历史 78 | "running_rewards": runner.get_running_reward(runner.reward_sum_record), # 平滑后的奖励 79 | "timestamps": current_time 80 | } 81 | 82 | plot_file = os.path.join(self.log_dir, f"plot_data_{current_time.replace(':', '-')}.pkl") 83 | torch.save(plot_data, plot_file) 84 | 85 | return current_time 86 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/mac_arm_M4_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | # - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | # - mkl=2023.1.0 35 | # - mkl-service=2.4.0 36 | # - mkl_fft=1.3.8 37 | # - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | 67 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/pip-requirements.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.0.0 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | mkl-fft==1.3.8 9 | mkl-random==1.2.4 10 | pandas==2.2.1 11 | pip-chill==1.0.3 12 | platformdirs==4.2.2 13 | pygame==2.6.1 14 | pysocks==1.7.1 15 | pyyaml==6.0.2 16 | tomli==2.0.1 17 | visdom==0.2.4 18 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/pip-requirements_mac_arm_M4.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.1.1 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | 
jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | pandas==2.2.1 9 | pip-chill==1.0.3 10 | platformdirs==4.2.2 11 | pygame==2.6.1 12 | pysocks==1.7.1 13 | pyyaml==6.0.2 14 | tomli==2.0.1 15 | visdom==0.2.4 16 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/setupPettingzoo.py: -------------------------------------------------------------------------------- 1 | # 使用 sys.executable 获取当前虚拟环境的 pip,这样它会始终使用当前虚拟环境的 pip 安装包,而不是系统环境的 pip 2 | import pkg_resources 3 | import sys 4 | import platform 5 | import os 6 | from subprocess import call 7 | 8 | def check_and_install_pettingzoo(): 9 | # 打印当前虚拟环境的相关信息 10 | print("================================") 11 | print(f"Current Python executable: {sys.executable}") 12 | print(f"Python version: {sys.version}") 13 | print(f"Current virtual environment: {sys.prefix}") 14 | print(f"Platform: {platform.system()} {platform.release()}") 15 | print("================================") 16 | 17 | try: 18 | # 检查 pettingzoo 是否已经安装 19 | pkg_resources.get_distribution("pettingzoo") 20 | print("================================") 21 | print("pettingzoo is already installed.") 22 | print("================================") 23 | except pkg_resources.DistributionNotFound: 24 | # 如果 pettingzoo 没有安装,执行安装操作 25 | print("================================") 26 | print("pettingzoo is not installed. Installing pettingzoo...") 27 | print("================================") 28 | 29 | # 获取当前虚拟环境的 Python 解释器路径 30 | python_executable = sys.executable 31 | 32 | # 根据操作系统确定 pip 路径 33 | if platform.system() == "Windows": 34 | # Windows 系统下,pip 通常在 Scripts 目录下 35 | pip_executable = os.path.join(os.path.dirname(python_executable), "Scripts", "pip.exe") 36 | else: 37 | # macOS/Linux 系统下 38 | pip_dir = os.path.dirname(python_executable) 39 | pip_executable = os.path.join(pip_dir, "pip") 40 | if not os.path.exists(pip_executable): 41 | pip_executable = python_executable.replace("python", "pip") 42 | 43 | print(f"Using pip executable: {pip_executable}") 44 | 45 | # 尝试安装 pettingzoo==1.24.4 46 | try: 47 | print("Attempting to install pettingzoo==1.24.4...") 48 | result = call([pip_executable, "install", "pettingzoo==1.24.4"]) 49 | if result == 0: 50 | print("================================") 51 | print("Successfully installed pettingzoo==1.24.4") 52 | print("================================") 53 | else: 54 | print("Installation of pettingzoo==1.24.4 failed. Trying GitHub installation...") 55 | # 如果安装失败,尝试从 GitHub 安装 56 | try: 57 | # 根据操作系统调整命令格式 58 | if platform.system() == "Windows": 59 | # Windows 下不使用引号 60 | result = call([pip_executable, "install", "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git"]) 61 | else: 62 | # macOS/Linux 下使用引号 63 | result = call([pip_executable, "install", "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git"]) 64 | 65 | if result == 0: 66 | print("================================") 67 | print("Successfully installed pettingzoo from GitHub.") 68 | print("================================") 69 | else: 70 | print("GitHub installation failed. 
Please check the error above.") 71 | except Exception as e: 72 | print(f"Failed to install pettingzoo from GitHub: {e}") 73 | print("================================") 74 | print("Please manually install pettingzoo or check the error above.") 75 | except Exception as e: 76 | print(f"Failed to install pettingzoo==1.24.4: {e}") 77 | print("Attempting to install pettingzoo from GitHub...") 78 | 79 | if __name__ == "__main__": 80 | check_and_install_pettingzoo() -------------------------------------------------------------------------------- /MATD3_Continous/agents/NN_actor_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | 7 | """ 8 | 和MADDPG中的actor网络相同. 9 | """ 10 | 11 | class MLPNetworkActor_td3(nn.Module): 12 | def __init__(self,chkpt_name, chkpt_dir, in_dim, out_dim, action_bound, hidden_dim = 128, non_linear = nn.ReLU()): 13 | super(MLPNetworkActor_td3, self).__init__() 14 | self.chkpt_dir = chkpt_dir 15 | self.chkpt_name = chkpt_name 16 | 17 | # different ,为什么要保持这两个信息? 18 | self.out_dim = out_dim 19 | self.action_bound = action_bound 20 | 21 | self.net = nn.Sequential( 22 | nn.Linear(in_dim, hidden_dim), 23 | non_linear, 24 | nn.Linear(hidden_dim, hidden_dim), 25 | non_linear, 26 | nn.Linear(hidden_dim, out_dim), 27 | ).apply(self.init) 28 | 29 | @staticmethod 30 | def init(m): 31 | '''init patameters of the module''' 32 | gain = nn.init.calculate_gain('relu') 33 | if isinstance(m, nn.Linear): 34 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 35 | m.bias.data.fill_(0.01) 36 | 37 | def forward(self, x): 38 | x = self.net(x) 39 | # logi = x 40 | # a_min = self.action_bound[0] 41 | # a_max = self.action_bound[1] 42 | # ''' 这三行为什么要这么处理? 
引入了bias项干嘛''' 43 | # k = torch.tensor( (a_max - a_min) /2 , device=x.device ) 44 | # bias = torch.tensor( (a_max + a_min) /2, device=x.device ) 45 | # action = k * torch.tanh(x) + bias 46 | # return action, logi 47 | x = torch.tanh(x) 48 | return x 49 | 50 | def save_checkpoint(self, is_target=False, timestamp = False): 51 | # 使用时间戳保存功能 52 | if timestamp is True: 53 | # 使用时间戳创建新文件夹 54 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 55 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 56 | else: 57 | # 直接保存在主目录下,不使用时间戳 58 | save_dir = self.chkpt_dir 59 | # 创建保存路径 60 | self.chkpt_file = os.path.join(save_dir, self.chkpt_name) 61 | 62 | if is_target: 63 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 64 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 65 | torch.save(self.state_dict(), target_chkpt_name) 66 | else: 67 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 68 | torch.save(self.state_dict(), self.chkpt_file) 69 | 70 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): # 默认加载target 71 | if timestamp and isinstance(timestamp, str): 72 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 73 | load_dir = os.path.join(self.chkpt_dir, timestamp) 74 | else: 75 | # 否则从主目录加载 76 | load_dir = self.chkpt_dir 77 | 78 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 79 | 80 | if is_target: 81 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 82 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 83 | else: 84 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) -------------------------------------------------------------------------------- /MATD3_Continous/agents/NN_critic_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | 7 | 8 | ''' 创新点1:双截断Q网络''' 9 | class MLPNetworkCritic_td3(nn.Module): 10 | def __init__(self, chkpt_name, chkpt_dir, in_dim, out_dim = 1, hidden_dim = 128, non_linear = nn.ReLU()): 11 | super(MLPNetworkCritic_td3, self).__init__() 12 | self.chkpt_dir = chkpt_dir 13 | self.chkpt_name = chkpt_name 14 | # Q1网络 15 | self.net1 = nn.Sequential( 16 | nn.Linear(in_dim, hidden_dim), 17 | non_linear, 18 | nn.Linear(hidden_dim, hidden_dim), 19 | non_linear, 20 | nn.Linear(hidden_dim, out_dim), 21 | ).apply(self.init) 22 | # Q2网络 23 | self.net2 = nn.Sequential( 24 | nn.Linear(in_dim, hidden_dim), 25 | non_linear, 26 | nn.Linear(hidden_dim, hidden_dim), 27 | non_linear, 28 | nn.Linear(hidden_dim, out_dim), 29 | ).apply(self.init) 30 | @staticmethod 31 | def init(m): 32 | '''init patameters of the module''' 33 | gain = nn.init.calculate_gain('relu') 34 | if isinstance(m, nn.Linear): 35 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 36 | m.bias.data.fill_(0.01) 37 | 38 | def forward(self, x): 39 | # 返回两个Q值 40 | q1 = self.net1(x) 41 | q2 = self.net2(x) 42 | return q1, q2 43 | 44 | def Q1(self, x): 45 | # 只使用Q1网络进行评估 46 | return self.net1(x) 47 | 48 | 49 | def save_checkpoint(self, is_target = False, timestamp = False): 50 | if timestamp is True: 51 | # 使用时间戳创建新文件夹 52 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 53 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 54 | else: 55 | # 直接保存在主目录下 56 | save_dir = self.chkpt_dir 57 | 58 | self.chkpt_file = os.path.join(save_dir, 
self.chkpt_name) 59 | 60 | if is_target: 61 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 62 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 63 | torch.save(self.state_dict(), target_chkpt_name) 64 | else: 65 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 66 | torch.save(self.state_dict(), self.chkpt_file) 67 | 68 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): 69 | if timestamp and isinstance(timestamp, str): 70 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 71 | load_dir = os.path.join(self.chkpt_dir, timestamp) 72 | else: 73 | # 否则从主目录加载 74 | load_dir = self.chkpt_dir 75 | 76 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 77 | 78 | if is_target: 79 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 80 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 81 | else: 82 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) 83 | -------------------------------------------------------------------------------- /MATD3_Continous/agents/TD3_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | from torch.optim import Adam 9 | from .NN_actor_td3 import MLPNetworkActor_td3 10 | from .NN_critic_td3 import MLPNetworkCritic_td3 11 | 12 | 13 | class TD3: 14 | def __init__(self, obs_dim, act_dim, global_obs_dim, actor_lr, critic_lr, device, action_bound, chkpt_dir, chkpt_name): 15 | self.actor = MLPNetworkActor_td3(in_dim=obs_dim, out_dim=act_dim, hidden_dim = 64, action_bound=action_bound, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'actor_td3.pth')).to(device) 16 | self.critic = MLPNetworkCritic_td3(in_dim=global_obs_dim, out_dim=1, hidden_dim = 64, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'critic_td3.pth')).to(device) 17 | #优化器 18 | self.actor_optimizer = Adam(self.actor.parameters(), lr = actor_lr) 19 | self.critic_optimizer = Adam(self.critic.parameters(), lr = critic_lr) 20 | # 创建相对于的target网络 21 | self.actor_target = deepcopy(self.actor) 22 | self.critic_target = deepcopy(self.critic) 23 | """ 24 | 使用 deepcopy 创建 target 网络是一个更好的选择,原因如下: 25 | 初始化一致性: 26 | - deepcopy 确保 target 网络和原网络完全相同的初始参数 27 | - 重新创建网络可能因为随机初始化导致参数不一致 28 | """ 29 | 30 | def actor_action(self, obs): 31 | # 如果是list,先合并为单个tensor 32 | # if isinstance(obs, list): 33 | # obs = torch.cat(obs, dim=1) 34 | action = self.actor(obs) 35 | return action 36 | 37 | def actor_target_action(self, obs): 38 | # 如果是list,先合并为单个tensor 39 | if isinstance(obs, list): 40 | obs = torch.cat(obs, dim=1) 41 | action = self.actor_target(obs) 42 | return action 43 | 44 | def critic_qvalue(self, obs, action): 45 | """获取 critic网络 的Q值""" 46 | # 合并观测和动作 47 | if isinstance(obs, list) and isinstance(action, list): 48 | sa = torch.cat(list(obs) + list(action), dim=1) 49 | else: 50 | sa = torch.cat([obs, action], dim=1) 51 | q1, q2 = self.critic(sa)# 返回两个Q值 52 | return q1.squeeze(1), q2.squeeze(1) 53 | 54 | def critic_target_q(self, obs, action): 55 | """获取 critic目标网络 的Q值""" 56 | # 合并观测和动作 57 | if isinstance(obs, list) and isinstance(action, list): 58 | sa = torch.cat(list(obs) + list(action), dim=1) 59 | else: 60 | sa = torch.cat([obs, action], dim=1) 61 | q1, q2 = self.critic_target(sa)# 返回两个Q值 62 | return q1.squeeze(1), q2.squeeze(1) 63 | 64 | def critic_q1(self, obs, 
action): 65 | """只获取 critic网络的 第一个Q值 ,用于策略更新""" 66 | # 合并观测和动作 67 | if isinstance(obs, list) and isinstance(action, list): 68 | sa = torch.cat(list(obs) + list(action), dim=1) 69 | else: 70 | sa = torch.cat([obs, action], dim=1) 71 | return self.critic.Q1(sa).squeeze(1) # 只返回Q1 72 | 73 | 74 | def update_actor(self, loss): 75 | self.actor_optimizer.zero_grad() 76 | loss.backward() 77 | ''' 78 | 在较新版本的PyTorch中, clip_grad_norm 已被弃用,推荐使用 clip_grad_norm_ 79 | clip_grad_norm_ 是 clip_grad_norm 的原地版本,不会创建新的张量,而是直接在输入张量上进行修改. 80 | ''' 81 | nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) # 与clip_grad_norm的不同? 82 | self.actor_optimizer.step() 83 | 84 | def update_critic(self, loss): 85 | self.critic_optimizer.zero_grad() 86 | loss.backward() 87 | nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 88 | self.critic_optimizer.step() 89 | -------------------------------------------------------------------------------- /MATD3_Continous/agents/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class BUFFER(): 5 | 6 | def __init__(self,capacity, obs_dim, act_dim, device): 7 | # 使用连续内存布局 8 | self.capacity = capacity 9 | self.obs = np.zeros((capacity, obs_dim), dtype=np.float32) # 指定dtype 10 | self.action = np.zeros((capacity, act_dim), dtype=np.float32) 11 | self.reward = np.zeros(capacity, dtype=np.float32) 12 | self.next_obs = np.zeros((capacity, obs_dim), dtype=np.float32) 13 | self.done = np.zeros(capacity, dtype=np.float32) # 使用bool_ 14 | self._index = 0 15 | self._size = 0 16 | self.device = device 17 | 18 | def add(self,obs, action, reward, next_obs, done): 19 | # 确保输入数据类型一致 20 | self.obs[self._index] = np.asarray(obs, dtype=np.float32) 21 | self.action[self._index] = np.asarray(action, dtype=np.float32) 22 | self.reward[self._index] = np.float32(reward) 23 | self.next_obs[self._index] = np.asarray(next_obs, dtype=np.float32) 24 | self.done[self._index] = np.float32(done) 25 | 26 | self._index = (self._index +1) % self.capacity 27 | if self._size < self.capacity: 28 | self._size += 1 29 | 30 | 31 | def sample(self, indices): 32 | # 一次性批量处理 33 | batch = ( 34 | self.obs[indices], 35 | self.action[indices], 36 | self.reward[indices], 37 | self.next_obs[indices], 38 | self.done[indices] 39 | ) 40 | # 批量转换为tensor并移动到设备 41 | return tuple( 42 | torch.as_tensor(data, device=self.device) 43 | for data in batch 44 | ) 45 | 46 | # obs = torch.from_numpy(obs).float().to(self.device) # torch.Size([batch_size, state_dim]) 47 | # action = torch.from_numpy(action).float().to(self.device) # torch.Size([batch_size, action_dim]) 48 | # reward = torch.from_numpy(reward).float().to(self.device) # just a tensor with length: batch_size 49 | # # reward = (reward - reward.mean()) / (reward.std() + 1e-7) # 暂不使用 50 | # next_obs = torch.from_numpy(next_obs).float().to(self.device) # Size([batch_size, state_dim]) 51 | # done = torch.from_numpy(done).float().to(self.device) # just a tensor with length: batch_size 52 | 53 | # return obs, action, reward, next_obs, done 54 | 55 | def __len__(self): #保留方法 56 | return self._size 57 | -------------------------------------------------------------------------------- /MATD3_Continous/envs/custom_agents_dynamics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 该文件定义了自定义的环境,用于测试自定义的智能体动力学模型 3 | 4 | 继承自core.py 5 | 6 | """ 7 | import numpy as np 8 | from pettingzoo.mpe._mpe_utils.core import EntityState, AgentState, Action, Entity, Landmark, Agent 
9 | from pettingzoo.mpe._mpe_utils.core import World 10 | 11 | class CustomWorld(World): 12 | def __init__(self, world_size = 2.5 ): # 13 | super().__init__() # 调用父类的构造函数 14 | self.world_size = world_size # Ronchy 添加世界大小 15 | self.dt = 0.1 # 时间步长 16 | self.damping = 0.2 # 阻尼系数 17 | # contact response parameters 18 | self.contact_force = 1e2 # 控制碰撞强度(默认1e2,值越大反弹越强) 19 | self.contact_margin = 1e-3 # 控制碰撞"柔软度"(默认1e-3,值越小越接近刚体) 20 | """ 21 | 常见问题示例 22 | 实体重叠穿透 contact_force太小 增大contact_force至1e3或更高 23 | 碰撞后震荡 damping太低 增大阻尼系数(如0.5) 24 | 微小距离抖动 contact_margin不合理 调整到1e-2~1e-4之间 25 | """ 26 | """ 27 | 重载底层动力学逻辑 28 | 主要是integrate_state()函数 29 | """ 30 | def step(self): 31 | # set actions for scripted agents 32 | # print("Using world -> step()") # 重载成功! 33 | for agent in self.scripted_agents: 34 | agent.action = agent.action_callback(agent, self) 35 | # gather forces applied to entities 36 | p_force = [None] * len(self.entities) 37 | # apply agent physical controls 38 | p_force = self.apply_action_force(p_force) # 加入噪声 39 | # apply environment forces 40 | p_force = self.apply_environment_force(p_force) # 碰撞力计算 collide为True时 41 | # integrate physical state 42 | self.integrate_state(p_force) # 动力学逻辑 43 | # update agent state 44 | for agent in self.agents: 45 | self.update_agent_state(agent) # 更新 communication action 后的状态 46 | 47 | # integrate physical state 48 | #函数功能:动力学逻辑。更新实体的位置和速度 49 | def integrate_state(self, p_force): 50 | for i, entity in enumerate(self.entities): 51 | if not entity.movable: 52 | continue 53 | # 速度阻尼衰减 54 | entity.state.p_vel *= (1 - self.damping) # 正确应用阻尼 55 | # 动力学 -> 运动学 56 | if p_force[i] is not None: 57 | acceleration = p_force[i] / entity.mass # F = ma 58 | entity.state.p_vel += acceleration * self.dt # v = v_0 + a * t 59 | 60 | # 速度限幅 61 | if entity.max_speed is not None: 62 | speed = np.linalg.norm(entity.state.p_vel) # 计算向量模长 63 | if speed > entity.max_speed: 64 | entity.state.p_vel = entity.state.p_vel * (entity.max_speed / speed) # 向量缩放 65 | 66 | # 更新位置 67 | entity.state.p_pos += entity.state.p_vel * self.dt # 更新位置 68 | # 限制位置在世界大小范围内 69 | # entity.state.p_pos = np.clip(entity.state.p_pos, -self.world_size, self.world_size) # Ronchy 添加世界大小限制 70 | 71 | 72 | # get collision forces for any contact between two entities 73 | # TODO: 碰撞逻辑待细化 74 | def get_collision_force(self, entity_a, entity_b): 75 | if (not entity_a.collide) or (not entity_b.collide): 76 | return [None, None] # not a collider 77 | if entity_a is entity_b: 78 | return [None, None] # don't collide against itself 79 | # compute actual distance between entities 80 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 81 | dist = np.sqrt(np.sum(np.square(delta_pos))) #用norm更简洁 82 | # minimum allowable distance 83 | dist_min = entity_a.size + entity_b.size # 两个实体的半径之和 84 | # softmax penetration 85 | k = self.contact_margin 86 | penetration = np.logaddexp(0, -(dist - dist_min) / k) * k #渗透深度, 当 dist < dist_min 时产生虚拟渗透量 87 | force = self.contact_force * delta_pos / dist * penetration 88 | force_a = +force if entity_a.movable else None 89 | force_b = -force if entity_b.movable else None 90 | return [force_a, force_b] 91 | -------------------------------------------------------------------------------- /MATD3_Continous/main/main_evaluate.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from main_parameters import main_parameters 3 | 4 | # 修改导入路径 5 | import sys 6 | import os 7 | # 
将项目根目录添加到Python路径 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | 11 | from agents.MATD3_runner import RUNNER 12 | from agents.MATD3_agent import MATD3 13 | import torch 14 | import random 15 | import numpy as np 16 | from envs import simple_tag_env 17 | 18 | def setup_seed(seed): 19 | torch.manual_seed(seed) 20 | if torch.cuda.is_available(): 21 | torch.cuda.manual_seed(seed) 22 | torch.cuda.manual_seed_all(seed) 23 | np.random.seed(seed) 24 | random.seed(seed) 25 | torch.backends.cudnn.deterministic = True 26 | torch.backends.cudnn.benchmark = False 27 | 28 | 29 | def get_env(env_name, ep_len=50, render_mode = "None", seed = None): 30 | """create environment and get observation and action dimension of each agent in this environment""" 31 | new_env = None 32 | if env_name == 'simple_adversary_v3': 33 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 34 | if env_name == 'simple_spread_v3': 35 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 36 | if env_name == 'simple_tag_v3': 37 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 38 | if env_name == 'simple_tag_env': 39 | new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 40 | 41 | new_env.reset(seed) 42 | _dim_info = {} 43 | action_bound = {} 44 | for agent_id in new_env.agents: 45 | print("agent_id:",agent_id) 46 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 47 | action_bound[agent_id] = [] #[low action, hign action] 48 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 49 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 50 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 51 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 52 | 53 | return new_env, _dim_info, action_bound 54 | 55 | 56 | 57 | if __name__ == '__main__': 58 | device ='cpu' 59 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 60 | print("Using device:",device) 61 | # 模型存储路径 62 | current_dir = os.path.dirname(os.path.abspath(__file__)) 63 | chkpt_dir = os.path.join(current_dir, 'models', 'matd3_models') 64 | load_timestamp = "2025-04-15_22-23" 65 | model_timestamp = None if load_timestamp == '' else load_timestamp 66 | # 定义参数 67 | args = main_parameters() 68 | args.render_mode = "human" 69 | # args.episode_num = 1 70 | 71 | # 创建环境 72 | print("Using Env's name",args.env_name) 73 | # 判断是否使用固定种子 74 | if args.seed is None: 75 | print("使用随机种子 (不固定)") 76 | else: 77 | print(f"使用固定种子: {args.seed}") 78 | setup_seed(args.seed) 79 | 80 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode, seed = args.seed) 81 | # print(env, dim_info, action_bound) 82 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意 83 | agent = MATD3(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, args.tau, _chkpt_dir = chkpt_dir, _model_timestamp = model_timestamp) 84 | print("--- Loading models ---") 85 | agent.load_model() 86 | print('---- Evaluating ----') 87 | env.reset(args.seed) 88 | runner = RUNNER(agent, env, args, device, mode = 'evaluate') 89 | runner.evaluate() # 使用evaluate方法 90 | print('---- Done! 
----') 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /MATD3_Continous/main/main_parameters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def main_parameters(): 4 | parser = argparse.ArgumentParser("MADDPG legacy") 5 | ############################################ 选择环境 ############################################ 6 | parser.add_argument("--seed", type=int, default=-1, help='随机种子 (使用-1表示不使用固定种子)') 7 | parser.add_argument("--use_variable_seeds", type=bool, default=False, help="使用可变随机种子") 8 | 9 | parser.add_argument("--env_name", type=str, default="simple_tag_v3", help="name of the env", 10 | choices=['simple_adversary_v3', 'simple_spread_v3', 'simple_tag_v3', 'simple_tag_env']) 11 | parser.add_argument("--render_mode", type=str, default="None", help="None | human | rgb_array") 12 | parser.add_argument("--episode_num", type=int, default=3, help="训练轮数") 13 | parser.add_argument("--episode_length", type=int, default=100, help="每轮最大步数") 14 | parser.add_argument("--evaluate_episode_num", type=int, default=100, help="评估轮数") 15 | parser.add_argument('--learn_interval', type=int, default=10, 16 | help='学习间隔步数') 17 | 18 | parser.add_argument('--random_steps', type=int, default=500, help='初始随机探索步数') 19 | parser.add_argument('--tau', type=float, default=0.01, help='软更新参数') 20 | parser.add_argument('--gamma', type=float, default=0.9, help='折扣因子') 21 | parser.add_argument('--buffer_capacity', type=int, default=int(1e6), help='经验回放缓冲区容量') 22 | parser.add_argument('--batch_size', type=int, default=128, help='批次大小') 23 | parser.add_argument('--actor_lr', type=float, default=0.0001, help='Actor学习率') 24 | parser.add_argument('--critic_lr', type=float, default=0.003, help='Critic学习率') 25 | parser.add_argument('--comm_lr', type=float, default=0.00001, help='Comm学习率') 26 | # 通信网络参数 27 | parser.add_argument('--message_dim', type=int, default=3, help='通信消息维度') 28 | 29 | parser.add_argument('--best_score', type=int, default= -20, help='最佳分数_初始值') 30 | 31 | # 可视化参数 32 | parser.add_argument('--visdom', action="store_true", help="是否使用visdom可视化") 33 | parser.add_argument('--size_win', type=int, default=200, help="平滑窗口大小") 34 | 35 | # 训练设备 36 | parser.add_argument("--device", type=str, default='cpu', help="训练设备,默认自动选择cpu") 37 | 38 | args = parser.parse_args() 39 | 40 | # 如果seed为-1,则设置为None 41 | if args.seed == -1: 42 | args.seed = None 43 | 44 | return args -------------------------------------------------------------------------------- /MATD3_Continous/main/main_train.py: -------------------------------------------------------------------------------- 1 | MODULE_NAME = "log_td3_main" # 使用logger保存训练日志 2 | 3 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 4 | 5 | # 添加项目根目录到Python路径 6 | import sys 7 | import os 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from envs import simple_tag_env, custom_agents_dynamics 11 | 12 | from main_parameters import main_parameters 13 | from agents.MATD3_runner import RUNNER 14 | 15 | from agents.MATD3_agent import MATD3 16 | import torch 17 | import random 18 | import numpy as np 19 | 20 | import time 21 | from datetime import datetime, timedelta 22 | from utils.logger import TrainingLogger # 添加导入 23 | 24 | 25 | def setup_seed(seed): 26 | torch.manual_seed(seed) 27 | if torch.cuda.is_available(): 28 | torch.cuda.manual_seed(seed) 29 | torch.cuda.manual_seed_all(seed) 30 | 
np.random.seed(seed) 31 | random.seed(seed) 32 | torch.backends.cudnn.deterministic = True 33 | torch.backends.cudnn.benchmark = False 34 | 35 | def get_env(env_name, ep_len=25, render_mode ="None", seed = None): 36 | """create environment and get observation and action dimension of each agent in this environment""" 37 | new_env = None 38 | if env_name == 'simple_adversary_v3': 39 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 40 | if env_name == 'simple_spread_v3': 41 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 42 | if env_name == 'simple_tag_v3': 43 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 44 | if env_name == 'simple_tag_env': 45 | new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 46 | 47 | # 使用reset时处理None种子 48 | if seed is not None: 49 | new_env.reset(seed=seed) # 指定种子值 50 | else: 51 | new_env.reset() # 不指定种子,使用随机种子 52 | 53 | _dim_info = {} 54 | action_bound = {} 55 | for agent_id in new_env.agents: 56 | print("agent_id:",agent_id) 57 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 58 | action_bound[agent_id] = [] #[low action, hign action] 59 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 60 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 61 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 62 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 63 | print("_dim_info:",_dim_info) 64 | print("action_bound:",action_bound) 65 | return new_env, _dim_info, action_bound 66 | 67 | 68 | if __name__ == '__main__': 69 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 70 | device = torch.device('mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() 71 | else 'cuda' if torch.cuda.is_available() else 'cpu') 72 | device = "cpu" 73 | print("Using device:",device) 74 | start_time = time.time() # 记录开始时间 75 | # 模型保存路径 76 | current_dir = os.path.dirname(os.path.abspath(__file__)) 77 | chkpt_dir = os.path.join(current_dir, 'models', 'matd3_models') 78 | # 定义参数 79 | args = main_parameters() 80 | # 创建环境 81 | print("Using Env's name",args.env_name) 82 | 83 | # 判断是否使用固定种子 84 | if args.seed is None: 85 | print("使用随机种子 (不固定)") 86 | else: 87 | print(f"使用固定种子: {args.seed}") 88 | setup_seed(args.seed) 89 | 90 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode, seed = args.seed) 91 | # print(env, dim_info, action_bound) 92 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意。 93 | agent = MATD3(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, args.tau, _chkpt_dir = chkpt_dir, _device = device) 94 | # 创建运行对象 95 | runner = RUNNER(agent, env, args, device, mode = 'train') 96 | 97 | # 记录训练开始时间 98 | start_time = datetime.now() 99 | start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S") 100 | print(f"训练开始时间: {start_time_str}") 101 | 102 | # 开始训练 103 | runner.train() 104 | 105 | # 记录训练结束时间和计算训练用时 106 | end_time = datetime.now() 107 | end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S") 108 | duration = end_time - start_time 109 | training_duration = str(timedelta(seconds=int(duration.total_seconds()))) 110 | 111 | print(f"\n===========训练完成!===========") 112 | print(f"训练开始时间: 
{start_time_str}") 113 | print(f"训练结束时间: {end_time_str}") 114 | print(f"训练用时: {training_duration}") 115 | print(f"训练设备: {device}") 116 | 117 | # 使用logger保存训练日志 118 | logger = TrainingLogger(module_name = MODULE_NAME) 119 | logger.save_training_log(args, device, start_time_str, end_time_str, training_duration, runner) 120 | 121 | print("--- saving trained models ---") 122 | agent.save_model(timestamp = True) 123 | print("--- trained models saved ---") 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /MATD3_Continous/plot/README.md: -------------------------------------------------------------------------------- 1 | # 使用说明 2 | 3 | 请将plot_rewards.py放置在对应的文件夹内,而非在根目录下使用! 4 | 5 | 如: 6 | 7 | cp plot_rewards.py ./maddpg_scripted_prey/plot_rewards.py -------------------------------------------------------------------------------- /MATD3_Continous/plot/plot_rewards.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import os 4 | from datetime import datetime 5 | import numpy as np 6 | import platform 7 | ''' 8 | 注意: 9 | 作者用pands==2.2.3出错了。 10 | pip install pandas==2.2.1 没问题。 11 | ''' 12 | 13 | def moving_average(data, window_size=50): 14 | """简单移动平均""" 15 | weights = np.ones(window_size) / window_size 16 | return np.convolve(data, weights, mode='valid') 17 | 18 | def exponential_moving_average(data, alpha=0.1): 19 | """指数移动平均""" 20 | ema = np.zeros_like(data) 21 | ema[0] = data[0] 22 | for i in range(1, len(data)): 23 | ema[i] = alpha * data[i] + (1 - alpha) * ema[i-1] 24 | return ema 25 | 26 | def set_font_for_plot(): 27 | """根据平台动态设置字体""" 28 | system_platform = platform.system() 29 | print("system_platform:", system_platform) 30 | if system_platform == "Darwin": # MacOS 31 | font = 'Arial Unicode MS' 32 | elif system_platform == "Windows": # Windows 33 | font = 'SimHei' 34 | else: # Linux 35 | # 中文字体需要手动安装 36 | # 参考:https://blog.csdn.net/takedachia/article/details/131017286 https://blog.csdn.net/weixin_45707277/article/details/118631442 37 | font = 'SimHei' 38 | 39 | plt.rcParams['font.sans-serif'] = [font] 40 | plt.rcParams['axes.unicode_minus'] = False 41 | 42 | def plot_all_rewards(csv_file, window_size=50): 43 | """在一张图上绘制所有智能体的奖励曲线(包括追捕者和逃避者)""" 44 | df = pd.read_csv(csv_file) 45 | set_font_for_plot() 46 | 47 | # 打印CSV文件的列名,帮助调试 48 | print(f"CSV文件列名: {df.columns.tolist()}") 49 | 50 | # 获取数据点数量,动态调整窗口大小 51 | data_points = len(df) 52 | print(f"数据点数量: {data_points}") 53 | 54 | # 如果数据点数量小于窗口大小,则调整窗口大小为数据点数量的一半 55 | if data_points < window_size: 56 | window_size = max(2, data_points // 2) # 确保窗口大小至少为2 57 | print(f"数据点不足,调整窗口大小为: {window_size}") 58 | 59 | # 从CSV文件名中提取时间戳 60 | base_name = os.path.basename(csv_file) 61 | if 'rewards_' in base_name and '.csv' in base_name: 62 | timestamp = base_name.replace('rewards_', '').replace('.csv', '') 63 | else: 64 | timestamp = '' 65 | 66 | # 创建一个包含两个子图的图形 67 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) 68 | 69 | # 第一个子图:所有智能体的奖励曲线 70 | # 修改:适配CSV文件列名 71 | adversary_col = 'Adversary_Mean' if 'Adversary_Mean' in df.columns else 'Adversary_Mean_Reward' 72 | agent_columns = [col for col in df.columns if col not in ['Episode', adversary_col]] 73 | colors = plt.cm.tab10(np.linspace(0, 1, len(agent_columns))) 74 | 75 | # 绘制每个智能体的奖励曲线 76 | for agent, color in zip(agent_columns, colors): 77 | # 原始数据(半透明) 78 | ax1.plot(df['Episode'], df[agent], color=color, alpha=0.2, label=f'{agent} (原始)') 79 | # 移动平均 
80 | ma_data = moving_average(df[agent].values, window_size) 81 | # 确保x轴和y轴数据长度匹配 82 | x_data = df['Episode'][window_size-1:window_size-1+len(ma_data)] 83 | ax1.plot(x_data, ma_data, 84 | color=color, linewidth=2, label=f'{agent} (移动平均)') 85 | 86 | ax1.set_title('所有智能体奖励曲线') 87 | ax1.set_xlabel('回合数') 88 | ax1.set_ylabel('奖励') 89 | ax1.grid(True, linestyle='--', alpha=0.7) 90 | ax1.legend() 91 | 92 | # 第二个子图:追捕者平均奖励 93 | # 修改:适配CSV文件列名 94 | # 原始数据(半透明) 95 | ax2.plot(df['Episode'], df[adversary_col], 96 | 'gray', alpha=0.2, label='原始数据') 97 | # 移动平均 98 | adv_ma = moving_average(df[adversary_col].values, window_size) 99 | # 确保x轴和y轴数据长度匹配 100 | x_data = df['Episode'][window_size-1:window_size-1+len(adv_ma)] 101 | ax2.plot(x_data, adv_ma, 102 | 'r-', linewidth=2, label='移动平均') 103 | 104 | ax2.set_title('追捕者平均奖励趋势') 105 | ax2.set_xlabel('回合数') 106 | ax2.set_ylabel('平均奖励') 107 | ax2.grid(True, linestyle='--', alpha=0.7) 108 | ax2.legend() 109 | 110 | # 调整子图之间的间距 111 | plt.tight_layout() 112 | 113 | # 保存图片 114 | if timestamp: 115 | save_path = os.path.join(os.path.dirname(csv_file), f'training_rewards_{timestamp}.png') 116 | else: 117 | save_path = os.path.join(os.path.dirname(csv_file), 'training_rewards.png') 118 | plt.savefig(save_path, dpi=300, bbox_inches='tight') 119 | print(f"训练奖励图像已保存至: {save_path}") 120 | plt.close() 121 | 122 | if __name__ == "__main__": 123 | # 修改:指定具体的CSV文件名 124 | csv_file = os.path.join(os.path.dirname(__file__), 'xxxx.csv') # 替换为你的CSV文件名 125 | print("csv_file name:", csv_file) 126 | 127 | if os.path.exists(csv_file): 128 | plot_all_rewards(csv_file) 129 | else: 130 | print(f"错误:未找到CSV文件:{csv_file},请检查路径及文件名是否正确!") -------------------------------------------------------------------------------- /MATD3_Continous/plot/training_rewards_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MATD3_Continous/plot/training_rewards_demo.png -------------------------------------------------------------------------------- /MATD3_Continous/readme.md: -------------------------------------------------------------------------------- 1 | [🇨🇳 中文文档](readme.md) | [🇺🇸 English](readme_en.md) 2 | 3 | # 多智能体深度强化学习MATD3算法 - Predator-Prey追逃博弈 4 | 5 | >**本项目专为Predator-Prey追逃博弈任务优化!** 基于TD3算法的多智能体扩展版本(MATD3:Twin Delayed Deep Deterministic Policy Gradient),在`PettingZoo MPE`环境基础上重构修改,提供了完整的多智能体协作与对抗环境,专注于连续动作空间的多智能体协作与对抗任务;适用于围捕控制、群体智能和策略博弈研究. 6 | 7 | > MATD3算法优势:相比MADDPG,通过双Q网络和目标策略平滑机制有效解决过估计问题,提供更稳定的训练和更优的策略。 8 | 9 | > Reference: https://github.com/wild-firefox/FreeRL/blob/main/MADDPG_file/MATD3_simple.py 10 | 11 | ## 📈 训练效果 12 |
13 | ![训练收敛结果](plot/training_rewards_demo.png)
14 | *MATD3算法在simple_tag_v3环境中的奖励收敛曲线*
16 | 17 | > **⚠️ 重要提示**:使用前请查看🔍 [**已知问题与解决方案KNOWN_ISSUES.md**](KNOWN_ISSUES.md)文档,了解常见问题的解决方法,特别是Windows系统的渲染卡死问题和PettingZoo版本兼容性问题。 18 | 19 | > **奖励函数优化**:官方的奖励配置无法训练出良好的围捕行为,本项目专门优化了追捕者的奖励函数,实现更高效的协作围捕 20 | 21 | ## 🚀 实现进度 22 | | 算法 | 状态 | 位置 | 核心组件 | 23 | |--------------|--------|-------------------|----------------------------------| 24 | | MATD3 | ✅ 1.0 | `agents/` | MATD3_agent, buffer, networks | 25 | 26 | 27 | ## 项目结构 28 | 29 | ```tree 30 | MATD3_Continous/ 31 | ├── agents/ # 智能体算法实现 32 | │ ├── buffer.py # 经验回放缓冲区 33 | │ ├── MATD3_agent.py # MATD3智能体控制器 34 | │ ├── MATD3_runner.py # 训练与评估运行器 35 | │ ├── NN_actor_td3.py # Actor网络结构 36 | │ ├── NN_critic_td3.py # Critic网络结构(双Q网络) 37 | │ └── TD3_agent.py # 基础TD3实现 38 | ├── envs/ # 环境实现 39 | │ ├── custom_agents_dynamics.py # 自定义智能体动力学 40 | │ └── simple_tag_env.py # 修改版追逃环境 41 | ├── main/ # 主程序脚本 42 | │ ├── main_evaluate.py # 评估脚本 43 | │ ├── main_parameters.py # 参数配置 44 | │ └── main_train.py # 训练入口 45 | ├── plot/ # 数据可视化 46 | │ ├── matd3_data/ # 训练数据存储 47 | │ ├── plot_rewards.py # 奖励绘图脚本 48 | │ ├── README.md # 绘图说明 49 | │ └── training_rewards_demo.png # 样例训练曲线 50 | ├── logs/ # 日志文件 51 | │ └── log_td3_main/ # TD3训练日志 52 | └── utils/ # 工具函数 53 | ├── conda-environment.yml # Conda环境配置(Windows和Intel芯片的macOS) 54 | ├── linux_environment.yml # Linux环境配置 55 | ├── logger.py # 日志工具 56 | ├── mac_arm_M4_environment.yml # Mac M系列芯片环境配置 57 | ├── pip-requirements.txt # pip依赖 58 | ├── pip-requirements_mac_arm_M4.txt # Mac M系列芯片专用依赖 59 | └── setupPettingzoo.py # PettingZoo环境设置 60 | ``` 61 | 62 | ## 环境说明 63 | 64 | 本项目基于 PettingZoo 的 MPE (Multi-Particle Environment) 环境,主要实现了 simple_tag 追逐逃避任务: 65 | 66 | - **追捕者 (Adversaries)**: 多个追捕者协作追捕逃避者 67 | - **逃避者 (Good Agents)**: 尝试逃离追捕者 68 | 69 | 环境特点: 70 | - 连续动作空间 71 | - 部分可观测状态 72 | - 多智能体协作与对抗 73 | 74 | ## 算法实现 75 | 76 | 项目实现了 MATD3 (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) 算法,这是 TD3 算法的多智能体扩展版本,主要特点: 77 | 78 | - 双重 Q 网络减少过估计 79 | - 延迟策略更新 80 | - 目标策略平滑正则化 81 | - 集中式训练,分布式执行 (CTDE) 范式 82 | 83 | 84 | ## 🛠️ 快速开始 85 | 86 | ### 环境配置 87 | 88 | > 相关配置需求在utils/文件夹下。 89 | 90 | ### Linux环境(ubuntu) 91 | 1. 使用linux_environment.yml创建新环境 92 | ```bash 93 | # 注意:将"MPE"替换为您喜欢的环境名称 94 | conda env create -f utils/linux_environment.yml -n MPE 95 | # 激活刚创建的环境 96 | conda activate MPE 97 | ``` 98 | 2. pip安装核心依赖 99 | ```bash 100 | pip install -r utils/pip-requirements.txt 101 | ``` 102 | ### Mac M系列芯片环境 103 | 1. 使用mac_arm_M4_environment.yml创建新conda环境 104 | ```bash 105 | # 注意:将"MPE"替换为您喜欢的环境名称 106 | conda env create -f utils/mac_arm_M4_environment.yml -n MPE 107 | # 激活刚创建的环境 108 | conda activate MPE 109 | ``` 110 | 2. pip安装Mac M芯片专用依赖 111 | ```bash 112 | pip install -r utils/pip-requirements_mac_arm_M4.txt 113 | ``` 114 | 115 | ### Windows创建并激活虚拟环境(推荐) 116 | 1. 使用conda-environment.yml创建新环境 117 | ```bash 118 | # 注意:将"MPE"替换为您喜欢的环境名称 119 | conda env create -f utils/conda-environment.yml -n MPE 120 | # 激活刚创建的环境 121 | conda activate MPE 122 | ``` 123 | 2. pip安装核心依赖 124 | ```bash 125 | pip install -r utils/pip-requirements.txt 126 | ``` 127 | ### 手动安装依赖 128 | > 上述虚拟环境创建成功后,您需要手动安装以下依赖: 129 | 3. 从PyTorch官网安装对应版本的PyTorch 130 | ```bash 131 | # 请访问 https://pytorch.org 选择适合您系统的安装命令 132 | # 例如: 133 | pip3 install torch torchvision torchaudio 134 | ``` 135 | 136 | 4. 2025.4.26 update: 安装`PettingZoo 1.25.0`版本,官方PyPI仓库最新版本更新为为1.25.0,内容与1.24.4相同。MPE被拆分出PettingZoo, **警告可忽略**,`MPE2`详情可见:https://github.com/Farama-Foundation/MPE2 137 | ```bash 138 | pip install pettingzoo==1.25.0 139 | ``` 140 | 141 | 4. 
~~安装PettingZoo 1.24.4版本~~ 142 | ```bash 143 | # 重要说明:本项目需要PettingZoo 1.24.4版本,但官方PyPI仓库最新版本仅为1.24.3 144 | # 必须从GitHub源码安装才能获取1.24.4版本,安装命令为: 145 | # pip install "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git" 146 | 或者,您可以直接运行提供的安装脚本安装pettingzoo1.25.0: 147 | python utils/setupPettingzoo.py 148 | ``` 149 | 150 | ### 🖥️ 运行配置 151 | > **注意:** 当前版本采用本地数据存储模式,无需额外配置可视化服务器。训练数据将保存在plot/matd3_data/目录下。 152 | 153 | ## 🔄 训练流程 154 | 1. **参数配置** 155 | 在 `main_parameters.py` 中设置环境和算法参数: 156 | ```python 157 | env_name = 'simple_tag_v3' # 可选:simple_adversary_v3/simple_spread_v3 158 | episode_num = 5000 # 总训练回合数 159 | # 训练参数 160 | batch_size = 128 # 经验回放批次大小 161 | actor_lr = 0.0002 # Actor网络学习率 162 | critic_lr = 0.002 # Critic网络学习率 163 | ``` 164 | 165 | 2. **运行训练脚本** 166 | ```bash 167 | # 使用默认参数训练 168 | cd main 169 | python main_train.py 170 | ``` 171 | 172 | 3. **查看训练进度** 173 | 训练数据将实时保存到CSV文件中,可使用plot_rewards.py脚本进行可视化: 174 | ```bash 175 | python plot/plot_rewards.py 176 | ``` 177 | 178 | 4. **评估训练模型** 179 | ```bash 180 | # 渲染训练好的模型策略 181 | cd main 182 | python main_evaluate.py 183 | ``` 184 | 185 | ### 🌐 环境特性与优化 186 | 本项目基于PettingZoo的MPE环境进行了大量优化: 187 | 188 | - **TD3增强的策略稳定性**: 相比MADDPG,MATD3通过双Q网络和目标策略平滑有效解决过估计问题 189 | - **围捕行为的奖励优化**: 通过精心设计的奖励函数,实现更具协作性的围捕策略 190 | - **物理参数优化**: 191 | - 世界大小:2.5单位(可根据追逃需求自定义) 192 | - 时间步长:0.1秒(影响动作响应速度) 193 | - 阻尼系数:0.2(影响智能体的惯性) 194 | 195 | #### 🌟 MATD3 vs MADDPG 196 | MATD3对标准MADDPG进行了以下关键增强: 197 | 198 | 1. **双Q网络设计**: 减少对动作值的过估计 199 | 2. **延迟策略更新**: 提高训练稳定性 200 | 3. **目标策略平滑**: 通过在目标动作中加入噪声防止过拟合 201 | 4. **自适应噪声调整**: 根据训练进度动态调整探索噪声 202 | 203 | 这些优化使MATD3在追逃博弈场景中展现出更强大的性能和更快的收敛速度。 204 | 205 | ## 📦 数据管理 206 | ### 模型存储 207 | 训练模型自动保存在: 208 | ```tree 209 | ./main/models/ 210 | └── matd3_models/ # MATD3检查点目录 211 | ├── {timestamp}_agent_0_actor.pth # Actor网络参数 212 | ├── {timestamp}_agent_0_critic_1.pth # 第一个Critic网络参数 213 | ├── {timestamp}_agent_0_critic_2.pth # 第二个Critic网络参数 214 | └── ... # 其他智能体网络 215 | ``` 216 | 217 | ### 可视化系统 218 | 训练指标可视化: 219 | ```tree 220 | plot/ 221 | ├── matd3_data/ # 训练数据存储 222 | │ └── rewards_{timestamp}.csv # CSV格式奖励记录 223 | └── plot_rewards.py # 可视化工具 224 | ``` 225 | 226 | ## 🤝 贡献 227 | 本项目的主要贡献在于: 228 | - TD3算法在多智能体场景下的扩展与优化 229 | - 针对Predator-Prey追逃博弈任务的环境适配与优化 230 | - 改进的奖励函数设计,实现高效的围捕协作行为 231 | - 稳定的训练框架,支持各种复杂追逃场景 232 | 233 | 如遇到任何问题,欢迎提交Issue或Pull Request。若您有兴趣扩展更多追逃博弈场景或改进算法,我们欢迎您的贡献! 
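## 📝 附录:MATD3 核心更新示意

为帮助理解上文提到的"双Q网络 + 目标策略平滑 + 延迟策略更新",下面给出一段最小化的示意代码:以单智能体 TD3 的形式演示目标Q值的计算流程。其中的函数名、噪声超参数(`policy_noise`、`noise_clip`)以及 Dummy 网络均为假设,并非 `agents/MATD3_agent.py` 的实际实现;在 MATD3 中 critic 的输入是全局观测与所有智能体动作的拼接,此处为简洁起见省略。

```python
import torch

def td3_target(critic_target, actor_target, reward, next_obs, done,
               gamma=0.9, policy_noise=0.2, noise_clip=0.5):
    """计算带目标策略平滑的截断双Q目标值(TD3核心思想示意,非仓库源码)"""
    with torch.no_grad():
        # 1) 目标策略平滑:在目标动作上加入截断噪声,防止critic过拟合到尖锐的Q值峰
        #    policy_noise、noise_clip 为示意用的假设超参数
        next_action = actor_target(next_obs)
        noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-1.0, 1.0)

        # 2) 截断双Q:取两个目标critic输出的较小者,缓解过估计
        sa = torch.cat([next_obs, next_action], dim=1)
        q1_next, q2_next = critic_target(sa)
        q_next = torch.min(q1_next, q2_next).squeeze(-1)

        # 3) TD目标:r + gamma * (1 - done) * min(Q1', Q2')
        target_q = reward + gamma * (1.0 - done) * q_next
    return target_q


if __name__ == "__main__":
    # 仅用随机张量演示调用方式(obs_dim=10、act_dim=5、batch=4 均为假设值)
    class DummyActor(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(10, 5)

        def forward(self, x):
            return torch.tanh(self.fc(x))  # 与仓库actor一致:tanh将动作压到[-1, 1]

    class DummyCritic(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.q1 = torch.nn.Linear(15, 1)
            self.q2 = torch.nn.Linear(15, 1)

        def forward(self, sa):
            return self.q1(sa), self.q2(sa)  # 返回两个Q值,对应双Q网络结构

    y = td3_target(DummyCritic(), DummyActor(),
                   reward=torch.zeros(4), next_obs=torch.randn(4, 10), done=torch.zeros(4))
    print(y.shape)  # torch.Size([4])
```

延迟策略更新与软更新未在上述示意中展开:实际训练时通常每进行若干次 critic 更新后才更新一次 actor,并用 `--tau`(见 `main/main_parameters.py`)对目标网络做软更新。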
-------------------------------------------------------------------------------- /MATD3_Continous/utils/conda-environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmpy2=2.1.2 16 | - intel-openmp=2023.1.0 17 | - jinja2=3.1.4 18 | - jpeg=9e 19 | - lcms2=2.16 20 | - lerc=4.0.0 21 | - libdeflate=1.22 22 | - libffi=3.4.4 23 | - libjpeg-turbo=2.0.0 24 | - libpng=1.6.39 25 | - libtiff=4.5.1 26 | - libwebp-base=1.3.2 27 | - llvm-openmp=14.0.6 28 | - lz4-c=1.9.4 29 | - markupsafe=2.1.3 30 | - mkl=2023.1.0 31 | - mkl-service=2.4.0 32 | - mkl_fft=1.3.8 33 | - mkl_random=1.2.4 34 | - mpc=1.1.0 35 | - mpfr=4.0.2 36 | - mpmath=1.3.0 37 | - numpy=1.26.4 38 | - numpy-base=1.26.4 39 | - openjpeg=2.5.2 40 | - openssl=3.0.16 41 | - pip=24.2 42 | - pybind11-abi=4 43 | - pysocks=1.7.1 44 | - python=3.11.8 45 | - pyyaml=6.0.2 46 | - requests=2.32.3 47 | - setuptools=75.1.0 48 | - sqlite=3.45.3 49 | - sympy=1.13.3 50 | - tbb=2021.8.0 51 | - tk=8.6.14 52 | - typing_extensions=4.12.2 53 | - wheel=0.44.0 54 | - xz=5.4.6 55 | - yaml=0.2.5 56 | - zlib=1.2.13 57 | - zstd=1.5.6 58 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 59 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/linux_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | - mkl=2023.1.0 35 | - mkl-service=2.4.0 36 | - mkl_fft=1.3.8 37 | - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | - pip: 67 | - charset-normalizer==3.4.1 68 | - cloudpickle==3.1.0 69 | - contourpy==1.3.1 70 | - cycler==0.12.1 71 | - farama-notifications==0.0.4 72 | - fonttools==4.55.3 73 | - gymnasium==1.0.0 74 | - idna==3.10 75 | - jsonpatch==1.33 76 | - jsonpointer==3.0.0 77 | - kiwisolver==1.4.8 78 | - matplotlib==3.8.3 79 | - networkx==3.4.2 80 | - numpy==2.2.1 81 | - packaging==24.2 82 | - pandas==2.2.1 83 | - pettingzoo==1.24.4 
84 | - pillow==11.1.0 85 | - pip-chill==1.0.3 86 | - pygame==2.6.1 87 | - pyparsing==3.2.1 88 | - python-dateutil==2.9.0.post0 89 | - pytz==2025.1 90 | - scipy==1.15.0 91 | - six==1.17.0 92 | - tornado==6.4.2 93 | - tzdata==2025.1 94 | - urllib3==2.3.0 95 | - visdom==0.2.4 96 | - websocket-client==1.8.0 97 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 98 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/mac_arm_M4_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | # - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | # - mkl=2023.1.0 35 | # - mkl-service=2.4.0 36 | # - mkl_fft=1.3.8 37 | # - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | 67 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/pip-requirements.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.0.0 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | mkl-fft==1.3.8 9 | mkl-random==1.2.4 10 | pandas==2.2.1 11 | pip-chill==1.0.3 12 | platformdirs==4.2.2 13 | pygame==2.6.1 14 | pysocks==1.7.1 15 | pyyaml==6.0.2 16 | tomli==2.0.1 17 | visdom==0.2.4 18 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/pip-requirements_mac_arm_M4.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.1.1 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | pandas==2.2.1 9 | pip-chill==1.0.3 10 | platformdirs==4.2.2 11 | pygame==2.6.1 12 | pysocks==1.7.1 13 | pyyaml==6.0.2 14 | tomli==2.0.1 15 | visdom==0.2.4 16 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/setupPettingzoo.py: -------------------------------------------------------------------------------- 1 | # 使用 sys.executable 获取当前虚拟环境的 pip,这样它会始终使用当前虚拟环境的 pip 安装包,而不是系统环境的 pip 2 | import pkg_resources 3 | import sys 4 | import platform 5 | from subprocess import call 6 | 7 | def 
check_and_install_pettingzoo(): 8 | # 打印当前虚拟环境的相关信息 9 | print("================================") 10 | print(f"Current Python executable: {sys.executable}") 11 | print(f"Python version: {sys.version}") 12 | print(f"Current virtual environment: {sys.prefix}") 13 | print(f"Platform: {platform.system()} {platform.release()}") 14 | print("================================") 15 | 16 | try: 17 | # 检查 pettingzoo 是否已经安装 18 | pkg_resources.get_distribution("pettingzoo") 19 | print("================================") 20 | print("pettingzoo is already installed.") 21 | print("================================") 22 | except pkg_resources.DistributionNotFound: 23 | # 如果 pettingzoo 没有安装,执行安装操作 24 | print("================================") 25 | print("pettingzoo is not installed. Installing pettingzoo...") 26 | print("================================") 27 | 28 | # 获取当前虚拟环境的 Python 解释器路径 29 | python_executable = sys.executable 30 | pip_executable = python_executable.replace("python", "pip") # 获取 pip 路径 31 | 32 | # 尝试安装 pettingzoo==1.24.4 33 | try: 34 | print("Attempting to install pettingzoo==1.24.4...") 35 | result = call([pip_executable, "install", "pettingzoo==1.24.4"]) 36 | if result == 0: 37 | print("================================") 38 | print("Successfully installed pettingzoo==1.24.4") 39 | print("================================") 40 | else: 41 | print("Installation of pettingzoo==1.24.4 failed. Trying GitHub installation...") 42 | # 如果安装失败,尝试从 GitHub 安装 43 | try: 44 | result = call([pip_executable, "install", "\"pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git\""]) 45 | if result == 0: 46 | print("================================") 47 | print("Successfully installed pettingzoo from GitHub.") 48 | print("================================") 49 | else: 50 | print("GitHub installation failed. 
Please check the error above.") 51 | except Exception as e: 52 | print(f"Failed to install pettingzoo from GitHub: {e}") 53 | print("================================") 54 | print("Please manually install pettingzoo or check the error above.") 55 | except Exception as e: 56 | print(f"Failed to install pettingzoo==1.24.4: {e}") 57 | print("Attempting to install pettingzoo from GitHub...") 58 | 59 | if __name__ == "__main__": 60 | check_and_install_pettingzoo() 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 强化学习与多智能体强化学习项目集 2 | [ 🇺🇸 English](./README_en.md) | 🇨🇳 中文文档 3 | 4 | ![项目总状态](https://img.shields.io/badge/状态-维护模式-blue) ![Python](https://img.shields.io/badge/Python-3.11.8%2B-blue) ![强化学习](https://img.shields.io/badge/强化学习-基础到高级-orange) ![多智能体](https://img.shields.io/badge/多智能体-MADDPG实现-success) 5 | 6 | 本仓库包含强化学习(RL)和多智能体强化学习(MARL)相关的多个项目,既有经典算法的复现,也有个人的研究实现。通过这些项目,我希望构建从基础强化学习到多智能体强化学习的完整学习路径。 7 | 8 | | 项目 | 状态 | 完成度 | 技术栈 | 文档索引 | 9 | |------|------|--------|--------|----------| 10 | | [RL_Learning-main](./RL_Learning-main/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-90%25-green) | ![技术](https://img.shields.io/badge/技术-基础RL算法-blue) | [已实现算法](./RL_Learning-main/README.md#已实现算法) | 11 | | [动手学强化学习](./动手学强化学习/) | ![状态](https://img.shields.io/badge/状态-参考实现-informational) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-DQN到DDPG-blue) | [README](./动手学强化学习/README.md) | 12 | | [MADDPG_Continous](./MADDPG_Continous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-连续MADDPG-blue) | [中文文档](./MADDPG_Continous/README.md#项目特色) | 13 | | [MATD3_Continous](./MATD3_Continous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-连续MATD3-blue) | [中文文档](./MATD3_Continous/readme.md) | 14 | 15 | 16 | ## 学习路径与项目关联 17 | 本仓库中的项目构成了一条从基础强化学习到多智能体强化学习的完整学习路径: 18 | 19 | 1. **基础理论与算法** (RL_Learning-main):掌握强化学习的数学基础和基本算法 20 | 2. **基础算法实现** (动手学强化学习):动手实现基础强化学习算法 21 | 4. **多智能体扩展** (MADDPG_Continous, MATD3_Continous):将单智能体算法扩展到多智能体场景 22 | 23 | ## 项目结构 24 | ### RL_Learning-main:强化学习基础代码复现 25 | 26 | 复现西湖大学**赵世钰老师**的强化学习课程代码,包括值迭代、策略迭代、蒙特卡洛、时序差分、DQN、Reinforce等算法实现。这部分是理解强化学习基础算法的最佳起点。 27 | 28 |
29 | 策略迭代可视化 30 | 值迭代可视化 31 |

从左到右: 策略迭代算法、值迭代算法可视化

32 |
33 | 34 | #### 参考资源 35 | - [赵老师强化学习课程](https://www.bilibili.com/video/BV1sd4y167NS) 36 | - [强化学习的数学原理](https://github.com/MathFoundationRL/Book-Mathematical-Foundation-of-Reinforcement-Learning) 37 | #### 代码位置 [`赵老师强化学习代码仓库: ./RL_Learning-main`](./RL_Learning-main/scripts) 38 | 39 | #### 更新日志 40 | 41 | **2024.6.7** 42 | 重大更新! 原作者render坐标与state设置不一致。坐标已统一修改为: 43 | ![img.png](img.png) 44 | > 原始代码来源: https://github.com/jwk1rose/RL_Learning 45 | > 本人正在重构代码,尽量分解为更多独立模块并添加详细注释。 46 | >Refactoring the code of jwk1rose,I'm trying to divide it into as many sections as possible and write comments. 47 | 48 | --- 49 | ### 二、动手学强化学习 50 | 《动手学强化学习》书籍代码的复现与扩展,最终目标是扩展到MADDPG。这部分是我系统学习强化学习的记录,从基础算法到高级算法的实现。 51 | #### 实现算法 52 | - DQN (Deep Q-Network) 53 | - Policy Gradient (REINFORCE) 54 | - Actor-Critic 55 | - DDPG (Deep Deterministic Policy Gradient) 56 | #### 学习路径 57 | 这部分展示了从基础DQN到DDPG,再到MADDPG的学习路径,是理解多智能体强化学习的基础铺垫。 58 | #### 代码位置 [`./动手学强化学习`](./动手学强化学习/) 59 | 60 | #### 参考资源 61 | - [动手学强化学习](https://hrl.boyuai.com/chapter/2/dqn%E7%AE%97%E6%B3%95) 62 | - [HandsOnRL GitHub](https://github.com/peterwu4084/HandsOnRL/tree/main) 63 | 64 | --- 65 | ### 三、多智能体强化学习实现 66 | > **本项目专为Predator-Prey追逃博弈任务优化!** 在`PettingZoo MPE`环境基础上重构修改,提供了完整的多智能体协作与对抗环境,适用于围捕控制、群体智能和策略博弈研究。 67 | 68 | 在掌握了基础强化学习算法后,我们自然会思考:如何将这些方法扩展到多个智能体同时学习的场景?多智能体强化学习(MARL)正是解决这一问题的关键技术。以下是我在MARL领域的两个主要实现。 69 | 70 | #### 3.1 MADDPG_Continous:多智能体深度确定性策略梯度算法 71 | 72 | 73 | 个人基于最新版**Pettingzoo**`(pettingzoo==1.25.0)`中的MPE环境,实现的连续状态,连续动作下的MADDPG算法,支持连续动作空间的多智能体协作与竞争。 74 | 75 | > MADDPG algorithm Reference: https://github.com/Git-123-Hub/maddpg-pettingzoo-pytorch 76 | 77 |
78 | 智能体行为 79 |

训练后的智能体行为展示:捕食者(红色)追逐猎物(绿色)的过程

80 | 81 | 训练收敛结果 82 |

MADDPG算法在simple_tag_v3环境中的奖励收敛曲线

83 |
84 | 85 | 86 | #### 实现进度 87 | | 算法 | 状态 | 位置 | 核心组件 | 88 | |----------------|--------|----------------------|----------------------------------| 89 | | MADDPG | ✅ 1.0 | `agents/maddpg/` | MADDPG_agent, DDPG_agent, buffer | 90 | | Independent RL | ⏳ 待完成 | `agents/independent/`| IndependentRL (计划中) | 91 | | Centralized RL | ⏳ 待完成 | `agents/centralized/`| CentralizedRL (计划中) | 92 | #### 代码位置 [`./MADDPG_Continous`](./MADDPG_Continous) 93 | 94 | 95 | #### 3.2 MATD3_Continous:多智能体双延迟深度确定性策略梯度算法 96 | 97 | 基于TD3算法的多智能体扩展版本(MATD3: Twin Delayed Deep Deterministic Policy Gradient),相比MADDPG,通过双Q网络和目标策略平滑机制有效解决过估计问题,提供更稳定的训练和更优的策略。 98 | 99 | > MATD3 algorithm Reference: https://github.com/wild-firefox/FreeRL/blob/main/MADDPG_file/MATD3_simple.py 100 | 101 |
102 | 训练收敛结果 103 |

MATD3算法在simple_tag_env环境中的奖励收敛曲线

104 |
105 | 106 | #### MATD3 vs MADDPG 107 | MATD3对标准MADDPG进行了以下关键增强: 108 | 109 | 1. **双Q网络设计**: 减少对动作值的过估计 110 | 2. **延迟策略更新**: 提高训练稳定性 111 | 3. **目标策略平滑**: 通过在目标动作中加入噪声防止过拟合 112 | 4. **自适应噪声调整**: 根据训练进度动态调整探索噪声 113 | 114 | #### 代码位置 [`./MATD3_Continous`](./MATD3_Continous) 115 | 116 | 117 | 118 | ## 进行中的项目 119 | - **MARL**: 基于深度强化学习的多智能体协作与协调 120 | - 探索不同通信机制对多智能体协作的影响 121 | - 研究异构智能体在复杂环境中的协作策略 122 | 123 | - **图上的多智能体协调与决策** 124 | - 将多智能体强化学习与图神经网络结合 125 | - 研究大规模图结构上的多智能体协调问题 126 | - **多智能体强化学习的应用** 127 | - 探索多智能体强化学习在工业、医疗等领域的应用 128 | - 研究多智能体强化学习在不同场景下的性能优化 129 | 130 | ## 联系方式 131 | 如有任何问题,请随时联系我。 132 | ronchy_lu AT 163 dot com 133 | 134 | Fight for MARL. 135 | 136 | 137 | 138 | ## Star History 139 | 140 | 141 | 142 | 143 | 144 | 145 | Star History Chart 146 | 147 | 148 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/1.[QAC]Simplest actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/1.[QAC]Simplest actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/2.[A2C]Advantage actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/2.[A2C]Advantage actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/3.1Importance sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | ''' 6 | There is no need sampling too much, because you can't tell the samples if you collect more than 1000 samples. 7 | That's catastrophe for ploting! 8 | 200 samples are enough. 9 | 2024.10.4 10 | ''' 11 | class Importance_sampling: 12 | def __init__(self,p0_probability, p1_probability): 13 | self.seed = 42 14 | self.p0_probability = p0_probability 15 | self.p1_probability = p1_probability 16 | self.p1_values = [1, -1] 17 | self.p1_samples = np.array([]) 18 | def sampling(self,sampling_size): 19 | # generate samples 20 | np.random.seed(self.seed) 21 | self.p1_samples = np.random.choice(self.p1_values, size=sampling_size, p=self.p1_probability) 22 | 23 | print("p1_samples::", self.p1_samples.shape) # 采样结果 向量。 24 | def calculate(self): 25 | if not self.p1_samples.size: #if p1_samples is empty, raise error. 
self.sample返回的不是一个bool值 26 | raise ValueError("Please generate p1_samples first") 27 | # 计算累积和 28 | cumulative_sum = np.cumsum(self.p1_samples) 29 | # 计算累积平均值 30 | p1_samples_average = cumulative_sum / np.arange(1, len(self.p1_samples) + 1) 31 | 32 | p_xi1 = np.where(self.p1_samples == -1, self.p1_probability[1], self.p1_probability[0]) 33 | p_xi0 = np.where(self.p1_samples == -1, self.p0_probability[1], self.p0_probability[0]) 34 | # 计算 35 | importance_p0_samples = (p_xi0/p_xi1) * self.p1_samples #Core,importance sampling 的体现 36 | 37 | 38 | cumulative_sum = np.cumsum(importance_p0_samples) 39 | cumulative_importance_p0_average = cumulative_sum / np.arange(1, len(importance_p0_samples) + 1) 40 | 41 | return p1_samples_average, cumulative_importance_p0_average 42 | def render(self,average_result, importance_sampling_result): 43 | plt.figure(figsize=(10, 6)) # set size of figure 44 | 45 | 46 | x1 = np.arange(len(self.p1_samples[:200])) 47 | y1 = self.p1_samples[:200] 48 | # plt.xlim(0, x.shape[0]) # adaptive is fine. 49 | plt.ylim(-2, 2) # set x,y range 50 | plt.plot(x1, y1, 'ro', markerfacecolor='none', label='p0_samples') 51 | 52 | y0 = average_result[:200] 53 | plt.plot(x1, y0, 'b.', label='average') 54 | 55 | y2 = importance_sampling_result[:200] 56 | plt.plot(x1, y2, 'g-', label='importance sampling') 57 | 58 | 59 | plt.xlabel('Sample index') 60 | # plt.ylabel() 61 | plt.legend() #图中带标签 62 | plt.show() 63 | 64 | 65 | 66 | if __name__ == '__main__': 67 | p0_probability = [0.5, 0.5] 68 | p1_probability = [0.8, 0.2] 69 | importance_sampling = Importance_sampling(p0_probability, p1_probability) #实例化 70 | 71 | importance_sampling.sampling(200) 72 | average_result, importance_sampling_result = importance_sampling.calculate() 73 | 74 | importance_sampling.render(average_result, importance_sampling_result) 75 | 76 | print("Done!") 77 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/3.[Importance sampling]Off-policy actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/3.[Importance sampling]Off-policy actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/4.[DPG]Deterministic actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/4.[DPG]Deterministic actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/policy_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/policy_iteration.png -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/value_iteration.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/value_iteration.png -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/value iteration.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from torch.utils import data 7 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 8 | 9 | # 引用上级目录 10 | import sys 11 | sys.path.append("..") 12 | import grid_env 13 | 14 | 15 | class class_value_iteration(): 16 | def __init__(self, env: grid_env.GridEnv): 17 | self.gama = 0.9 #discount rate 18 | self.env = env 19 | self.action_space_size = env.action_space_size 20 | self.state_space_size = env.size**2 #幂运算,grid world的尺寸 如 5 ** 2 = 25的网格世界。 21 | self.reward_space_size, self.reward_list = len(self.env.reward_list), self.env.reward_list #父类中:self.reward_list = [0, 1, -10, -10] 22 | #state_value 23 | self.state_value = np.zeros(shape=self.state_space_size) # 1维数组 24 | #action value -> Q-table 25 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 25 x 5 26 | 27 | self.mean_policy = np.ones(shape=(self.state_space_size, self.action_space_size)) / self.action_space_size 28 | self.policy = self.mean_policy.copy() 29 | self.writer = SummaryWriter("../logs") # 实例化SummaryWriter对象 30 | 31 | print("action_space_size: {} state_space_size:{}" .format(self.action_space_size ,self.state_space_size) ) 32 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape,self.qvalue.shape, self.mean_policy.shape)) 33 | print("\n分别是non-forbidden area, target area, forbidden area 以及撞墙:") 34 | print("self.reward_space_size:{},self.reward_list:{}".format(self.reward_space_size,self.reward_list)) 35 | print('----------------------------------------------------------------') 36 | 37 | def value_iteration_new(self, tolerance=0.001, steps=100): 38 | """ 39 | 迭代求解最优贝尔曼公式 得到 最优state value tolerance 和 steps 满足其一即可 40 | :param tolerance: 当 前后 state_value 的范数小于tolerance 则认为state_value 已经收敛 41 | :param steps: 当迭代次数大于step时 停止 建议将此变量设置大一些 42 | :return: 剩余迭代次数 43 | """ 44 | # 初始化 V0 为 1 45 | state_value_k = np.ones(self.state_space_size) 46 | while np.linalg.norm(state_value_k - self.state_value, ord=1)>tolerance and steps>0: 47 | steps -= 1 48 | self.state_value = state_value_k.copy() 49 | """ 50 | 是普通 policy_improvement 的变种 相当于是值迭代算法 也可以 供策略迭代使用 做策略迭代时不需要 接收第二个返回值 51 | 更新 qvalue ;qvalue[state,action]=reward+value[next_state] 52 | 找到 state 处的 action*:action* = arg max(qvalue[state,action]) 即最优action即最大qvalue对应的action 53 | 更新 policy :将 action*的概率设为1 其他action的概率设为0 这是一个greedy policy 54 | :param: state_value: policy对应的state value 55 | :return: improved policy, 以及迭代下一步的state_value 56 | """ 57 | # 方法初始化了一个新的策略 policy,所有状态的所有动作的概率都被设置为0 58 | policy = np.zeros(shape=(self.state_space_size, self.action_space_size)) 59 | #state_value_k = state_value_k.copy() 60 | #遍历所有的 state 61 | q_table = np.zeros(shape=(self.state_space_size, self.action_space_size)) 62 | for state in range(self.state_space_size): 63 | qvalue_list = [] 64 | #遍历所有的 action 65 | for action in range(self.action_space_size): 66 | # 计算qvalue,即acton value. 
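                    # 即 elementwise 公式: q_k(s,a) = Σ_i p(r_i|s,a)·r_i + γ·Σ_{s'} p(s'|s,a)·v_k(s')
                    # 其中 self.env.Rsa[state, action, i] 存储 p(r_i|s,a),self.env.Psa[state, action, next_state] 存储 p(s'|s,a)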
67 | """ 68 | 计算qvalue elementwise形式 69 | :param state: 对应的state 70 | :param action: 对应的action 71 | :param state_value: 状态值 72 | :return: 计算出的结果 73 | """ 74 | qvalue = 0 75 | for i in range(self.reward_space_size): 76 | # print("self.reward_list[i] * self.env.Rsa[state, action, i]:{}x{}={}".format(self.reward_list[i], self.env.Rsa[state, action, i],self.reward_list[i] * self.env.Rsa[state, action, i])) 77 | qvalue += self.reward_list[i] * self.env.Rsa[state, action, i] 78 | 79 | for next_state in range(self.state_space_size): 80 | qvalue += self.gama * self.env.Psa[state, action, next_state] * state_value_k[next_state] 81 | qvalue_list.append(qvalue) 82 | # print("qvalue_list:",qvalue_list) 83 | q_table[state,:] = qvalue_list.copy() 84 | 85 | state_value_k[state] = max(qvalue_list) #取该state 的最大state value 86 | action_star = qvalue_list.index(max(qvalue_list)) #取该state 的最大state value对应的action 87 | policy[state, action_star] = 1 #更新策略,贪婪算法 88 | print("q_table:{}".format(q_table)) 89 | self.policy = policy 90 | return steps 91 | 92 | 93 | def show_policy(self): 94 | for state in range(self.state_space_size): 95 | for action in range(self.action_space_size): 96 | policy = self.policy[state, action] 97 | self.env.render_.draw_action(pos=self.env.state2pos(state), 98 | toward=policy * 0.4 * self.env.action_to_direction[action], 99 | radius=policy * 0.1) 100 | 101 | def show_state_value(self, state_value, y_offset=0.2): 102 | for state in range(self.state_space_size): 103 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 104 | y_offset=y_offset, 105 | size_discount=0.7) 106 | 107 | def obtain_episode(self, policy, start_state, start_action, length): 108 | """ 109 | 110 | :param policy: 由指定策略产生episode 111 | :param start_state: 起始state 112 | :param start_action: 起始action 113 | :param length: episode 长度 114 | :return: 一个 state,action,reward,next_state,next_action 序列 115 | """ 116 | self.env.agent_location = self.env.state2pos(start_state) 117 | episode = [] 118 | next_action = start_action 119 | next_state = start_state 120 | while length > 0: 121 | length -= 1 122 | state = next_state 123 | action = next_action 124 | _, reward, done, _, _ = self.env.step(action) 125 | next_state = self.env.pos2state(self.env.agent_location) 126 | next_action = np.random.choice(np.arange(len(policy[next_state])), 127 | p=policy[next_state]) 128 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 129 | "next_action": next_action}) 130 | return episode 131 | 132 | 133 | 134 | if __name__ == "__main__": 135 | print("-----Begin!-----") 136 | gird_world2x2 = grid_env.GridEnv(size=3, target=[2, 2], 137 | forbidden=[[1, 0],[2,1]], 138 | render_mode='') 139 | 140 | solver = class_value_iteration(gird_world2x2) 141 | start_time = time.time() 142 | 143 | # 执行值迭代算法 144 | demand_step = 1000 145 | remaining_steps = solver.value_iteration_new(tolerance=0.1, steps=demand_step) 146 | if remaining_steps > 0: 147 | print("Value iteration converged in {} steps.".format(demand_step - remaining_steps)) 148 | else: 149 | print("Value iteration did not converge in 100 steps.") 150 | 151 | end_time = time.time() 152 | 153 | cost_time = end_time - start_time 154 | print("cost_time:{}".format(round(cost_time, 2))) 155 | print(len(gird_world2x2.render_.trajectory)) 156 | 157 | solver.show_policy() # solver.env.render() 158 | solver.show_state_value(solver.state_value, y_offset=0.25) 159 | 160 | 161 | gird_world2x2.render() 
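
# ----------------------------------------------------------------------
# 补充示例(假设性最小草图,非原仓库代码):value_iteration_new 的每一轮迭代等价于
# 对 Bellman 最优算子执行一次:
#     q_k(s,a)   = Σ_i p(r_i|s,a)·r_i + γ·Σ_{s'} p(s'|s,a)·v_k(s')
#     v_{k+1}(s) = max_a q_k(s,a),  π_{k+1}(a*|s) = 1 (贪婪策略)
# 下面用 numpy 向量化地演示一次完整 sweep;函数名与参数均为示意,复用文件顶部已导入的 np。
def _toy_value_iteration_sweep(Psa, Rsa, reward_list, v_k, gamma=0.9):
    """对给定模型执行一次值迭代 sweep,返回 (v_{k+1}, 每个状态的贪婪动作索引)。

    Psa: (S, A, S) 状态转移概率;Rsa: (S, A, R) 奖励概率;
    reward_list: 长度为 R 的奖励取值;v_k: (S,) 当前状态值向量。
    """
    rewards = np.asarray(reward_list, dtype=float)
    q = Rsa @ rewards + gamma * (Psa @ v_k)    # q[s, a],与类中双重循环的计算结果一致
    return q.max(axis=1), q.argmax(axis=1)     # v_{k+1}(s) 与贪婪动作 a*(s)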
-------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter5_Monte Carlo Methods/MC_Exploring_Starts.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 5 | 6 | # 引用上级目录 7 | import sys 8 | sys.path.append("..") 9 | import grid_env 10 | 11 | 12 | ''' 13 | MC Basic 是个Model free 的方法,与value iteration和 Policy iteration对比,数据是MC的必需品。 14 | 15 | 16 | ''' 17 | class MC_Exploring_Starts: 18 | def __init__(self, env = grid_env.GridEnv): 19 | self.gama = 0.9 #discount rate 20 | self.env = env 21 | self.action_space_size = env.action_space_size 22 | self.state_space_size = env.size ** 2 23 | self.reward_space_size, self.reward_list = len(self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 24 | self.state_value = np.zeros(shape=self.state_space_size) #一维列表 25 | print("self.state_value:",self.state_value) 26 | #Q表和policy 维数一样 27 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 28 | self.mean_policy = np.ones(shape=(self.state_space_size, self.action_space_size)) / self.action_space_size #平均策略,即取每个动作的概率均等 29 | self.policy = self.mean_policy.copy() 30 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 31 | 32 | print("action_space_size: {} state_space_size:{}" .format(self.action_space_size ,self.state_space_size) ) 33 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape,self.qvalue.shape, self.mean_policy.shape)) 34 | 35 | print('----------------------------------------------------------------') 36 | ''' 37 | 定义可视化grid world所需的函数 38 | def show_policy(self) 39 | def show_state_value(self, state_value, y_offset=0.2): 40 | def obtain_episode(self, policy, start_state, start_action, length): 41 | ''' 42 | def show_policy(self): 43 | for state in range(self.state_space_size): 44 | for action in range(self.action_space_size): 45 | policy = self.policy[state, action] 46 | self.env.render_.draw_action(pos=self.env.state2pos(state), 47 | toward=policy * 0.4 * self.env.action_to_direction[action], 48 | radius=policy * 0.1) 49 | 50 | def show_state_value(self, state_value, y_offset=0.2): 51 | for state in range(self.state_space_size): 52 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 53 | y_offset=y_offset, 54 | size_discount=0.7) 55 | 56 | def obtain_episode(self, policy, start_state, start_action, length): 57 | """ 58 | :param policy: 由指定策略产生episode 59 | :param start_state: 起始state 60 | :param start_action: 起始action 61 | :param length: 一个episode 长度 62 | :return: 一个 state,action,reward,next_state,next_action 列表,其中是字典格式 63 | """ 64 | self.env.agent_location = self.env.state2pos(start_state) 65 | episode = [] 66 | next_action = start_action 67 | next_state = start_state 68 | while length > 0: 69 | length -= 1 70 | state = next_state 71 | action = next_action 72 | _, reward, done, _, _ = self.env.step(action) 73 | next_state = self.env.pos2state(self.env.agent_location) 74 | next_action = np.random.choice(np.arange(len(policy[next_state])), #[0, len(policy[next_state]) 中随机抽一个随机数 75 | p=policy[next_state]) #p参数的例子: p=[0.1, 0.2, 0.3, 0.1, 0.3]的概率从 [0,1,2,3,4]这四个数中选取3个数 76 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 77 | "next_action": next_action}) #向列表中添加一个字典 78 | return 
episode 79 | 80 | def mc_exploring_starts_simple(self, length=50, epochs=10): 81 | """ 82 | :param length: 每一个 state-action 对的长度 83 | :return: 84 | """ 85 | for epoch in range(epochs): 86 | episode = self.obtain_episode(self.policy, state, action, length) # policy is mean policy 87 | 88 | for state in range(self.state_space_size): 89 | for action in range(self.action_space_size): 90 | episode = self.obtain_episode(self.policy, state, action, length) # policy is mean policy 91 | print("obtain_episode,type:,{}; {}".format(type(episode[0]), episode)) 92 | # Policy evaluation: 93 | sum_qvalue = 0 94 | for i in range(len(episode) - 1): 95 | sum_qvalue += self.gama**i * episode[i]['reward'] 96 | self.qvalue[state][action] = sum_qvalue 97 | 98 | # Policy improvement: 99 | max_index = np.argmax(self.qvalue[state]) # qvalue_star 100 | max_qvalue = np.max(self.qvalue[state]) #action_star 101 | 102 | 103 | def mc_exploring_starts_first_visit(self, length=10): 104 | time_start = time.time() 105 | # policy = self.mean_policy.copy() 106 | # policy = np.zeros(shape=(self.state_space_size, self.action_space_size)) 107 | policy = np.random.dirichlet(alpha=[1] * self.action_space_size, size = self.state_space_size) 108 | print("policy:",policy) 109 | # policy /= policy.sum(1) 110 | 111 | qvalue = self.qvalue.copy() 112 | returns = [[[0] for col in range(5)] for block in range(25)] 113 | # returns = [[]] 114 | print("returns:", returns) 115 | print("np.linalg.norm(policy - self.policy, ord=1) :",np.linalg.norm(policy - self.policy, ord=1) ) 116 | while np.linalg.norm(policy - self.policy, ord=1) > 0.001: 117 | print("开始运行:") 118 | policy = self.policy.copy() 119 | for state in range(self.state_space_size): 120 | for action in range(self.action_space_size): 121 | visit_list = [] 122 | g = 0 123 | # Following the current policy, generate an episode of length T ;生成一个episode 124 | episode = self.obtain_episode(policy=self.policy, start_state=state, start_action=action, 125 | length=length) 126 | for step in range(len(episode)-1, -1, -1): #从末尾开始截取 127 | reward = episode[step]['reward'] 128 | state = episode[step]['state'] 129 | action = episode[step]['action'] 130 | g = self.gama * g + reward 131 | # first visit 132 | # print("[state, action] :",[state, action] ) 133 | if [state, action] not in visit_list: 134 | visit_list.append([state, action]) 135 | # print("visit_list:",visit_list) 136 | returns[state][action].append(g) 137 | qvalue[state, action] = np.array(returns[state][action]).mean() 138 | qvalue_star = qvalue[state].max() 139 | action_star = qvalue[state].tolist().index(qvalue_star) 140 | self.policy[state] = np.zeros(shape=self.action_space_size).copy() 141 | self.policy[state, action_star] = 1 142 | # self.state_value[state] = qvalue_star 143 | print(np.linalg.norm(policy - self.policy, ord=1)) 144 | 145 | time_end = time.time() 146 | print("mc_exploring_starts cost time:" + str(time_end - time_start)) 147 | 148 | if __name__ == "__main__": 149 | episode_length = 2000 150 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 151 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 152 | render_mode='') 153 | solver = MC_Exploring_Starts(gird_world) 154 | start_time = time.time() 155 | 156 | # solver.state_value = solver.mc_exploring_starts_first_visit(length=episode_length) 157 | solver.mc_exploring_starts_first_visit(length=episode_length) # 修改后,利用tqdm显示epoch进度 158 | 159 | end_time = time.time() 160 | cost_time = end_time - start_time 161 | print("episode_length:{} that the cost_time 
is:{}".format(episode_length, round(cost_time, 2))) 162 | 163 | solver.show_policy() # solver.env.render() 164 | solver.show_state_value(solver.state_value, y_offset=0.25) 165 | gird_world.plot_title("Episode_length = " + str(episode_length)) 166 | gird_world.render() 167 | # gird_world.render_clear() 168 | print("--------------------") -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter5_Monte Carlo Methods/MC_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 5 | 6 | # 引用上级目录 7 | import sys 8 | sys.path.append("..") 9 | import grid_env 10 | 11 | class MC_epsilon_greedy: 12 | def __init__(self, env = grid_env.GridEnv): 13 | self.gama = 0.9 #discount rate 14 | self.env = env 15 | self.action_space_size = env.action_space_size 16 | self.state_space_size = env.size ** 2 17 | self.reward_space_size, self.reward_list = len(self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 18 | self.state_value = np.zeros(shape=self.state_space_size) #一维列表 19 | print("self.state_value:",self.state_value) 20 | #Q表和policy 维数一样 21 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 22 | self.mean_policy = np.ones(shape=(self.state_space_size, self.action_space_size)) / self.action_space_size #平均策略,即取每个动作的概率均等 23 | self.policy = self.mean_policy.copy() 24 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 25 | 26 | print("action_space_size: {} state_space_size:{}" .format(self.action_space_size ,self.state_space_size) ) 27 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape,self.qvalue.shape, self.mean_policy.shape)) 28 | 29 | print('----------------------------------------------------------------') 30 | ''' 31 | 定义可视化grid world所需的函数 32 | def show_policy(self) 33 | def show_state_value(self, state_value, y_offset=0.2): 34 | def obtain_episode(self, policy, start_state, start_action, length): 35 | ''' 36 | def show_policy(self): 37 | for state in range(self.state_space_size): 38 | for action in range(self.action_space_size): 39 | policy = self.policy[state, action] 40 | self.env.render_.draw_action(pos=self.env.state2pos(state), 41 | toward=policy * 0.4 * self.env.action_to_direction[action], 42 | radius=policy * 0.1) 43 | 44 | def show_state_value(self, state_value, y_offset=0.2): 45 | for state in range(self.state_space_size): 46 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 47 | y_offset=y_offset, 48 | size_discount=0.7) 49 | 50 | def obtain_episode(self, policy, start_state, start_action, length): 51 | """ 52 | :param policy: 由指定策略产生episode 53 | :param start_state: 起始state 54 | :param start_action: 起始action 55 | :param length: 一个episode 长度 56 | :return: 一个 state,action,reward,next_state,next_action 列表,其中是字典格式 57 | """ 58 | self.env.agent_location = self.env.state2pos(start_state) 59 | episode = [] 60 | next_action = start_action 61 | next_state = start_state 62 | while length > 0: 63 | length -= 1 64 | state = next_state 65 | action = next_action 66 | _, reward, done, _, _ = self.env.step(action) 67 | next_state = self.env.pos2state(self.env.agent_location) 68 | next_action = np.random.choice(np.arange(len(policy[next_state])), #[0, len(policy[next_state]) 中随机抽一个随机数 69 | p=policy[next_state]) 
#p参数的例子: p=[0.1, 0.2, 0.3, 0.1, 0.3]的概率从 [0,1,2,3,4]这四个数中选取3个数 70 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 71 | "next_action": next_action}) #向列表中添加一个字典 72 | return episode 73 | 74 | def mc_epsilon_greedy(self, episodes, episode_length, epsilon = 0.5 ): 75 | # 初始化Returns和Num计数器 76 | returns = np.zeros(self.qvalue.shape) # 初始化回报累计 77 | num_visits = np.zeros(self.qvalue.shape, dtype=int) # 初始化访问次数 78 | 79 | for _ in range(episodes): 80 | # Episode generation 81 | start_state = np.random.randint(self.state_space_size) # 随机选择起始状态 82 | start_action = np.random.choice(np.arange(self.action_space_size), # 随机选择起始动作 83 | p=self.policy[start_state]) 84 | 85 | episode = self.obtain_episode(self.policy, start_state, start_action, 86 | episode_length) # 获取一个episode 87 | 88 | # 对于每个step的回报累积和访问次数更新 89 | for step in reversed(episode): # 逆序遍历,从T-1到0 90 | state, action, reward = step["state"], step["action"], step["reward"] 91 | G = reward # 当前步的即时奖励 92 | for rt in episode[::-1][episode.index(step):]: # 从当前步开始反向累加未来奖励 93 | G = self.gama * G + rt["reward"] # 累积折扣回报 94 | returns[state, action] += G # 更新累积回报 95 | num_visits[state, action] += 1 # 更新状态动作对的访问次数 96 | 97 | # Policy evaluation 98 | self.qvalue = np.divide(returns, num_visits, where=num_visits != 0) # 避免除以零错误 99 | # Policy improvement 100 | best_actions = np.argmax(self.qvalue, axis=1) # 找到每个状态下最优的动作 101 | for state in range(self.state_space_size): 102 | for action in range(self.action_space_size): 103 | # self.policy[state, action] = (1 - epsilon + epsilon / self.action_space_size) * ( 104 | # action == best_actions[state]) + \ 105 | # (epsilon / self.action_space_size) * (action != best_actions[state]) 106 | self.policy[state, :] = 0 # 先将所有动作概率设为0 107 | self.policy[state, best_actions[state]] = 1 # 最优动作概率设为1 108 | 109 | 110 | if __name__ == "__main__": 111 | episodes = 1000 112 | episode_length = 2000 113 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 114 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 115 | render_mode='') 116 | solver = MC_epsilon_greedy(gird_world) 117 | start_time = time.time() 118 | 119 | # solver.state_value = solver.mc_exploring_starts_first_visit(length=episode_length) 120 | solver.mc_epsilon_greedy(episodes, episode_length) # 修改后,利用tqdm显示epoch进度 121 | 122 | end_time = time.time() 123 | cost_time = end_time - start_time 124 | print("episode_length:{} that the cost_time is:{}".format(episode_length, round(cost_time, 2))) 125 | 126 | solver.show_policy() # solver.env.render() 127 | solver.show_state_value(solver.state_value, y_offset=0.25) 128 | gird_world.plot_title("Episode_length = " + str(episode_length)) 129 | gird_world.render() 130 | # gird_world.render_clear() 131 | print("--------------------") -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter6_Stochastic_approximation/Robbins-Monro algorithm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | # import decimal #用numpy计算,弃用decimal 4 | # decimal.getcontext().prec = 50 5 | 6 | import matplotlib.pyplot as plt 7 | """ 8 | Consider an example: g(w) = w**3 - 5 9 | analytical solution: g(w) = 0; w**3 = 5; 5^(1/3) ≈ 1.71. 
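The loop below applies the Robbins-Monro iteration
    w_{k+1} = w_k - a_k * g_tilde(w_k),  with a_k = 1/k  (k = 2, 3, ...),
so the estimates w_k converge to the true root 5^(1/3) even though only noisy observations
g_tilde are available. (In the code, the loop variable named a_k actually holds k, and the
coefficient used is 1/a_k.)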
10 | 11 | Now, suppose that we can only observe the input w and the output g̃(w) = g(w) + η, 12 | 13 | """ 14 | 15 | w_k = [0] # w_1 = 0 16 | g_tilde = [] 17 | 18 | # eta = np.random.normal(size=10) # η 高斯噪声 19 | # print("eta:",eta) 20 | eta_list = [] #plot用 21 | def calculate_g_tilde(w): 22 | eta = np.random.normal() 23 | eta_list.append(eta) 24 | g_tilde =np.array(w**3) - 5 + eta 25 | 26 | # g_tilde = decimal.Decimal(w ** 3) - 5 27 | return (g_tilde) 28 | 29 | for a_k in range(2,550): # a_k 要从2开始 30 | g_tilde.append( calculate_g_tilde(w_k[-1]) ) # g_k 31 | # print("g_tilde",g_tilde) 32 | w_k.append( w_k[-1] - np.array(1/a_k) * g_tilde[-1] ) 33 | # print("w_k" ,w_k) 34 | print("w_k",w_k) #w_k[-1]是结果 35 | print('---------------------------') 36 | print("实际结果:",np.cbrt(5)) #numpy开立方 37 | print("迭代最后结果:",w_k[-1]) 38 | 39 | 40 | 41 | # 绘制第一个图表 42 | plt.figure(figsize=(10, 5)) 43 | plt.plot(range(1, len(w_k)+1), w_k, marker='o',markerfacecolor='none', # 空心,设置填充色为透明 44 | markeredgecolor='blue', # 边框颜色为蓝色 45 | markersize=10, 46 | linestyle='-', color='blue', label='Estimated root w_k') 47 | plt.xlabel('Iteration index k', fontsize = 12) 48 | plt.ylabel('Estimated root w_k', fontsize = 12) 49 | 50 | # 绘制第二个图表 51 | plt.figure(figsize=(8, 5)) 52 | plt.plot(range(len(eta_list)), eta_list, marker='o',markerfacecolor='none', # 空心,设置填充色为透明 53 | markeredgecolor='green', # 边框颜色为蓝色 54 | markersize=10, 55 | linestyle='-', color='green', label='Observation noise') 56 | plt.xlabel('Iteration index k', fontsize = 12) 57 | plt.ylabel('Observation noise', fontsize = 12) 58 | 59 | # 添加图例 60 | plt.legend() 61 | 62 | # 显示图表 63 | plt.show() -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/1.Sarsa.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | 7 | # 引用上级目录 8 | import sys 9 | sys.path.append("..") 10 | import grid_env 11 | 12 | """ 13 | SARSA: State - action - reward - state - action 14 | 15 | TD learning of acton values: Sarsa -> directly estimate action values. 
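The update rule implemented in Sarsa_alg below is
    q(s_t, a_t) <- q(s_t, a_t) - alpha * [ q(s_t, a_t) - ( r_{t+1} + gamma * q(s_{t+1}, a_{t+1}) ) ],
followed by an epsilon-greedy improvement of the policy at the visited state s_t.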
16 | """ 17 | class Sarsa(): 18 | def __init__(self,alpha,env = grid_env.GridEnv): 19 | self.gama = 0.9 # discount rate 20 | self.alpha = alpha #learning rate 21 | self.env = env 22 | self.action_space_size = env.action_space_size 23 | self.state_space_size = env.size ** 2 24 | self.reward_space_size, self.reward_list = len( 25 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 26 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 27 | print("self.state_value:", self.state_value) 28 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 29 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 30 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 31 | self.policy = self.mean_policy.copy() 32 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 33 | 34 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 35 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 36 | self.qvalue.shape, 37 | self.mean_policy.shape)) 38 | 39 | print('----------------------------------------------------------------') 40 | 41 | def show_policy(self): 42 | for state in range(self.state_space_size): 43 | for action in range(self.action_space_size): 44 | policy = self.policy[state, action] 45 | self.env.render_.draw_action(pos=self.env.state2pos(state), 46 | toward=policy * 0.4 * self.env.action_to_direction[action], 47 | radius=policy * 0.1) 48 | 49 | def show_state_value(self, state_value, y_offset=0.2): 50 | for state in range(self.state_space_size): 51 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 52 | y_offset=y_offset, 53 | size_discount=0.7) 54 | 55 | def obtain_episode(self, policy, start_state, start_action, length): 56 | """ 57 | :param policy: 由指定策略产生episode 58 | :param start_state: 起始state 59 | :param start_action: 起始action 60 | :param length: 一个episode 长度 61 | :return: 一个列表,其中是字典格式: state,action,reward,next_state,next_action 62 | """ 63 | self.env.agent_location = self.env.state2pos(start_state) 64 | episode = [] 65 | next_action = start_action 66 | next_state = start_state 67 | while length > 0: 68 | length -= 1 69 | state = next_state 70 | action = next_action 71 | _, reward, done, _, _ = self.env.step(action) # 一步动作 72 | next_state = self.env.pos2state(self.env.agent_location) 73 | next_action = np.random.choice(np.arange(len(policy[next_state])), 74 | p=policy[next_state]) 75 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 76 | "next_action": next_action}) #向列表中添加一个字典 77 | return episode #返回列表,其中的元素为字典 78 | 79 | ''' 80 | Learn an optimal policy that can lead the agent to the target state from an initial state s0. 
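    After each q-value update, the behavior policy at s_t is improved epsilon-greedily:
        pi(a|s_t) = 1 - epsilon + epsilon/|A|   if a = argmax_a q(s_t, a)
        pi(a|s_t) = epsilon/|A|                 otherwise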
81 | ''' 82 | def Sarsa_alg(self,initial_location, epsilon = 0.1): 83 | total_rewards = [] 84 | episode_lengths = [] 85 | initial_state = self.env.pos2state(initial_location) 86 | print("initial_state:", initial_state) 87 | for episode_num in range(1000): # episode_num 88 | self.env.reset() 89 | total_reward = 0 90 | episode_length = 0 91 | done = False 92 | print("episode_num:",episode_num) 93 | 94 | state = initial_state 95 | action = np.random.choice(a=np.arange(self.action_space_size), 96 | p=self.policy[state, :]) # Generate a0 at s0 following π0(s0) 97 | #initialize buffers 98 | states = [state] 99 | aciton = [action] 100 | rewards = [0] 101 | while not done: #If s_t is not the target state, do 102 | episode_length += 1 103 | _, reward, done, _, _ = self.env.step(action) #Collect an experience sample (rt+1, st+1, at+1) 104 | #S 105 | next_state = self.env.pos2state(self.env.agent_location) 106 | # print("next_state:",next_state, "self.env.agent_location:",self.env.agent_location) 107 | #A 108 | next_action = np.random.choice(np.arange(self.action_space_size), 109 | p=self.policy[next_state,:]) 110 | total_reward += reward 111 | #Update q-value for (st, at): 112 | self.qvalue[state][action] = self.qvalue[state, action] - self.alpha * (self.qvalue[state, action] - (reward + self.gama * self.qvalue[next_state, next_action]) ) 113 | #update policy 114 | qvalue_star = self.qvalue[state].max() 115 | action_star = self.qvalue[state].tolist().index(qvalue_star) 116 | for a in range(self.action_space_size): 117 | if a == action_star: 118 | self.policy[state, a] = 1 - epsilon + (epsilon / self.action_space_size) 119 | 120 | else: 121 | self.policy[state, a] = epsilon / self.action_space_size 122 | 123 | action = next_action 124 | state = next_state 125 | total_rewards.append(total_reward) 126 | episode_lengths.append(episode_length) 127 | 128 | return total_rewards,episode_lengths 129 | 130 | if __name__ =="__main__": 131 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 132 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 133 | render_mode='') 134 | solver = Sarsa(alpha =0.1, env = gird_world) 135 | # solver.sarsa() 136 | # print("env.policy[0, :]:",solver.policy[0, :]) 137 | # for _ in range(20): 138 | # a0 = np.random.choice(5, p=solver.policy[0, :] ) 139 | # 140 | # print("a0:",a0) 141 | 142 | start_time = time.time() 143 | 144 | initial_location = [0,0] 145 | total_rewards, episode_lengths = solver.Sarsa_alg(initial_location = initial_location) 146 | 147 | 148 | end_time = time.time() 149 | cost_time = end_time - start_time 150 | print("cost_time:",cost_time) 151 | print(len(gird_world.render_.trajectory)) 152 | 153 | initial_state = solver.env.pos2state(initial_location) 154 | print("训练后的policy结果为:\n",solver.policy[initial_state,:]) 155 | solver.show_policy() # solver.env.render() 156 | solver.show_state_value(solver.state_value, y_offset=0.25) 157 | # gird_world.plot_title("Episode_length = " + str(i)) 158 | gird_world.render() 159 | # gird_world.render_clear() 160 | print("--------------------") 161 | print("Plot") 162 | # 绘制第一个图表 163 | plt.figure(figsize=(10, 5)) 164 | plt.plot(range(1, len(total_rewards) + 1), total_rewards, # 空心,设置填充色为透明 165 | markeredgecolor='blue', # 边框颜色为蓝色 166 | markersize=10, 167 | linestyle='-', color='blue',label = "total_rewards") 168 | plt.xlabel('Episode index', fontsize=12) 169 | plt.ylabel('total_rewards', fontsize=12) 170 | 171 | # 绘制第二个图表 172 | plt.figure(figsize=(10, 5)) 173 | plt.plot(range(1, len(episode_lengths) + 1), 
episode_lengths, # 空心,设置填充色为透明 174 | markeredgecolor='blue', # 边框颜色为蓝色 175 | markersize=10, 176 | linestyle='-', color='blue',label = "episode_length") 177 | plt.xlabel('Episode index', fontsize=12) 178 | plt.ylabel('episode_length', fontsize=12) 179 | 180 | # 添加图例 181 | plt.legend() 182 | # 显示图表 183 | plt.show() 184 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/2.n-step Sarsa.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | 7 | # 引用上级目录 8 | import sys 9 | sys.path.append("..") 10 | import grid_env 11 | 12 | """ 13 | SARSA: State - action - reward - state - action 14 | 15 | TD learning of acton values: Sarsa -> directly estimate action values. 16 | """ 17 | class N_step_Sarsa(): 18 | def __init__(self,alpha,env = grid_env.GridEnv): 19 | self.gama = 0.9 # discount rate 20 | self.alpha = alpha #learning rate 21 | self.env = env 22 | self.action_space_size = env.action_space_size 23 | self.state_space_size = env.size ** 2 24 | self.reward_space_size, self.reward_list = len( 25 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 26 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 27 | print("self.state_value:", self.state_value) 28 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 29 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 30 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 31 | self.policy = self.mean_policy.copy() 32 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 33 | 34 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 35 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 36 | self.qvalue.shape, 37 | self.mean_policy.shape)) 38 | 39 | print('----------------------------------------------------------------') 40 | 41 | def show_policy(self): 42 | for state in range(self.state_space_size): 43 | for action in range(self.action_space_size): 44 | policy = self.policy[state, action] 45 | self.env.render_.draw_action(pos=self.env.state2pos(state), 46 | toward=policy * 0.4 * self.env.action_to_direction[action], 47 | radius=policy * 0.1) 48 | 49 | def show_state_value(self, state_value, y_offset=0.2): 50 | for state in range(self.state_space_size): 51 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 52 | y_offset=y_offset, 53 | size_discount=0.7) 54 | 55 | ''' 56 | Learn an optimal policy that can lead the agent to the target state from an initial state s0. 
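    n_step_Sarsa_alg below uses the truncated n-step return as the TD target:
        G_{t:t+n} = r_{t+1} + gamma*r_{t+2} + ... + gamma^(n-1)*r_{t+n} + gamma^n * q(s_{t+n}, a_{t+n})
        q(s_t, a_t) <- q(s_t, a_t) + alpha * ( G_{t:t+n} - q(s_t, a_t) )
    The bootstrap term gamma^n * q(.) is dropped when the episode terminates before step t+n,
    and each update is followed by an epsilon-greedy policy improvement at s_t.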
57 | ''' 58 | 59 | def n_step_Sarsa_alg(self, initial_location, epsilon=0.1, n=3): 60 | total_rewards = [] 61 | episode_lengths = [] 62 | initial_state = self.env.pos2state(initial_location) 63 | print("initial_state:", initial_state) 64 | 65 | for episode_num in range(1000): # episode_num 66 | self.env.reset() 67 | total_reward = 0 68 | episode_length = 0 69 | done = False 70 | print("episode_num:", episode_num) 71 | 72 | state = initial_state 73 | action = np.random.choice(a=np.arange(self.action_space_size), 74 | p=self.policy[state, :]) # Generate a0 at s0 following π0(s0) 75 | 76 | # Initialize buffers 77 | states = [state] 78 | actions = [action] 79 | rewards = [0] # Reward at time 0 is 0 80 | 81 | T = float('inf') 82 | t = 0 83 | 84 | while True: 85 | if t < T: 86 | _, reward, done, _, _ = self.env.step(action) # Collect an experience sample (rt+1, st+1, at+1) 87 | next_state = self.env.pos2state(self.env.agent_location) 88 | next_action = np.random.choice(np.arange(self.action_space_size), p=self.policy[next_state, :]) 89 | 90 | states.append(next_state) 91 | actions.append(next_action) 92 | rewards.append(reward) 93 | 94 | total_reward += reward 95 | episode_length += 1 96 | 97 | if done: 98 | T = t + 1 99 | 100 | tau = t - n + 1 101 | if tau >= 0: 102 | G = sum([self.gama ** (i - tau - 1) * rewards[i] for i in range(tau + 1, min(tau + n, T) + 1)]) 103 | if tau + n < T: 104 | G += self.gama ** n * self.qvalue[states[tau + n]][actions[tau + n]] 105 | 106 | state_tau = states[tau] 107 | action_tau = actions[tau] 108 | self.qvalue[state_tau][action_tau] += self.alpha * (G - self.qvalue[state_tau][action_tau]) 109 | 110 | # Update policy 111 | qvalue_star = self.qvalue[state_tau].max() 112 | action_star = self.qvalue[state_tau].tolist().index(qvalue_star) 113 | for a in range(self.action_space_size): 114 | if a == action_star: 115 | self.policy[state_tau, a] = 1 - epsilon + (epsilon / self.action_space_size) 116 | else: 117 | self.policy[state_tau, a] = epsilon / self.action_space_size 118 | 119 | if tau == T - 1: 120 | break 121 | 122 | t += 1 123 | state = next_state 124 | action = next_action 125 | 126 | total_rewards.append(total_reward) 127 | episode_lengths.append(episode_length) 128 | 129 | return total_rewards, episode_lengths 130 | if __name__ =="__main__": 131 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 132 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 133 | render_mode='') 134 | solver = N_step_Sarsa(alpha =0.1, env = gird_world) 135 | # solver.sarsa() 136 | # print("env.policy[0, :]:",solver.policy[0, :]) 137 | # for _ in range(20): 138 | # a0 = np.random.choice(5, p=solver.policy[0, :] ) 139 | # 140 | # print("a0:",a0) 141 | 142 | start_time = time.time() 143 | 144 | initial_location = [0,4] 145 | total_rewards, episode_lengths = solver.n_step_Sarsa_alg(initial_location = initial_location) 146 | 147 | 148 | end_time = time.time() 149 | cost_time = end_time - start_time 150 | print("cost_time:",cost_time) 151 | print(len(gird_world.render_.trajectory)) 152 | 153 | initial_state = solver.env.pos2state(initial_location) 154 | print("训练后的policy结果为:\n",solver.policy[initial_state,:]) 155 | solver.show_policy() # solver.env.render() 156 | solver.show_state_value(solver.state_value, y_offset=0.25) 157 | # gird_world.plot_title("Episode_length = " + str(i)) 158 | gird_world.render() 159 | # gird_world.render_clear() 160 | print("--------------------") 161 | print("Plot") 162 | # 绘制第一个图表 163 | plt.figure(figsize=(10, 5)) 164 | plt.plot(range(1, 
len(total_rewards) + 1), total_rewards, # 空心,设置填充色为透明 165 | markeredgecolor='blue', # 边框颜色为蓝色 166 | markersize=10, 167 | linestyle='-', color='blue',label = "total_rewards") 168 | plt.xlabel('Episode index', fontsize=12) 169 | plt.ylabel('total_rewards', fontsize=12) 170 | 171 | # 绘制第二个图表 172 | plt.figure(figsize=(10, 5)) 173 | plt.plot(range(1, len(episode_lengths) + 1), episode_lengths, # 空心,设置填充色为透明 174 | markeredgecolor='blue', # 边框颜色为蓝色 175 | markersize=10, 176 | linestyle='-', color='blue',label = "episode_length") 177 | plt.xlabel('Episode index', fontsize=12) 178 | plt.ylabel('episode_length', fontsize=12) 179 | 180 | # 添加图例 181 | plt.legend() 182 | # 显示图表 183 | plt.show() 184 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/3.Q-learning.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | 7 | # 引用上级目录 8 | import sys 9 | sys.path.append("..") 10 | import grid_env 11 | 12 | """ 13 | SARSA: State - action - reward - state - action 14 | 15 | TD learning of acton values: Sarsa -> directly estimate action values. 16 | """ 17 | class Q_learning(): 18 | def __init__(self,alpha,env = grid_env.GridEnv): 19 | self.gamma = 0.9 # discount rate 20 | self.alpha = alpha #learning rate 21 | self.env = env 22 | self.action_space_size = env.action_space_size 23 | self.state_space_size = env.size ** 2 24 | self.reward_space_size, self.reward_list = len( 25 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 26 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 27 | print("self.state_value:", self.state_value) 28 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 29 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 30 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 31 | self.policy = self.mean_policy.copy() 32 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 33 | 34 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 35 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 36 | self.qvalue.shape, 37 | self.mean_policy.shape)) 38 | 39 | print('----------------------------------------------------------------') 40 | 41 | def show_policy(self): 42 | for state in range(self.state_space_size): 43 | for action in range(self.action_space_size): 44 | policy = self.policy[state, action] 45 | self.env.render_.draw_action(pos=self.env.state2pos(state), 46 | toward=policy * 0.4 * self.env.action_to_direction[action], 47 | radius=policy * 0.1) 48 | 49 | def show_state_value(self, state_value, y_offset=0.2): 50 | for state in range(self.state_space_size): 51 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 52 | y_offset=y_offset, 53 | size_discount=0.7) 54 | 55 | ''' 56 | Learn an optimal policy that can lead the agent to the target state from an initial state s0. 
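    q_learning below is off-policy: actions are selected epsilon-greedily for exploration,
    while the TD target bootstraps on the greedy action of the next state:
        y_t = r_{t+1} + gamma * max_a q(s_{t+1}, a)
        q(s_t, a_t) <- q(s_t, a_t) - alpha * ( q(s_t, a_t) - y_t )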
57 | ''' 58 | 59 | def q_learning(self, initial_location, epsilon=0.1): 60 | total_rewards = [] 61 | episode_lengths = [] 62 | initial_state = self.env.pos2state(initial_location) 63 | print("initial_state:", initial_state) 64 | 65 | for episode_num in range(1000): # run 1000 training episodes 66 | self.env.reset() 67 | total_reward = 0 68 | episode_length = 0 69 | done = False 70 | print("episode_num:", episode_num) 71 | state = initial_state 72 | while not done: 73 | # Choose action using epsilon-greedy policy 74 | if np.random.rand() < epsilon: 75 | action = np.random.choice(self.action_space_size) # Explore: random action 76 | else: 77 | action = np.argmax(self.qvalue[state]) # Exploit: action with max Q-value 78 | 79 | # Take action and observe reward and next state 80 | _, reward, done, _, _ = self.env.step(action) 81 | next_state = self.env.pos2state(self.env.agent_location) 82 | 83 | # Update Q-value 84 | best_next_action = np.argmax(self.qvalue[next_state]) 85 | td_target = reward + self.gamma * self.qvalue[next_state][best_next_action] 86 | td_error = self.qvalue[state][action] - td_target 87 | self.qvalue[state][action] -= self.alpha * td_error 88 | 89 | # Update policy (optional, since Q-learning is off-policy) 90 | qvalue_star = self.qvalue[state].max() 91 | action_star = self.qvalue[state].tolist().index(qvalue_star) 92 | for a in range(self.action_space_size): 93 | if a == action_star: 94 | self.policy[state, a] = 1 - epsilon + (epsilon / self.action_space_size) 95 | else: 96 | self.policy[state, a] = epsilon / self.action_space_size 97 | 98 | # Update state 99 | state = next_state 100 | total_reward += reward 101 | episode_length += 1 102 | 103 | total_rewards.append(total_reward) 104 | episode_lengths.append(episode_length) 105 | 106 | return total_rewards, episode_lengths 107 | if __name__ == "__main__": 108 | grid_world = grid_env.GridEnv(size=5, target=[2, 3], 109 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 110 | render_mode='') 111 | solver = Q_learning(alpha=0.1, env=grid_world) 112 | # solver.sarsa() 113 | # print("env.policy[0, :]:",solver.policy[0, :]) 114 | # for _ in range(20): 115 | # a0 = np.random.choice(5, p=solver.policy[0, :] ) 116 | # 117 | # print("a0:",a0) 118 | 119 | start_time = time.time() 120 | 121 | initial_location = [4,0] 122 | total_rewards, episode_lengths = solver.q_learning(initial_location=initial_location) 123 | 124 | 125 | end_time = time.time() 126 | cost_time = end_time - start_time 127 | print("cost_time:",cost_time) 128 | print(len(grid_world.render_.trajectory)) 129 | 130 | initial_state = solver.env.pos2state(initial_location) 131 | print("Policy after training:\n", solver.policy[initial_state, :]) 132 | solver.show_policy() # solver.env.render() 133 | solver.show_state_value(solver.state_value, y_offset=0.25) 134 | # grid_world.plot_title("Episode_length = " + str(i)) 135 | grid_world.render() 136 | # grid_world.render_clear() 137 | print("--------------------") 138 | print("Plot") 139 | # Plot the first figure 140 | plt.figure(figsize=(10, 5)) 141 | plt.plot(range(1, len(total_rewards) + 1), total_rewards, # hollow markers, transparent fill 142 | markeredgecolor='blue', # marker edge color: blue 143 | markersize=10, 144 | linestyle='-', color='blue',label = "total_rewards") 145 | plt.xlabel('Episode index', fontsize=12) 146 | plt.ylabel('total_rewards', fontsize=12) 147 | 148 | # Plot the second figure 149 | plt.figure(figsize=(10, 5)) 150 | plt.plot(range(1, len(episode_lengths) + 1), episode_lengths, # hollow markers, transparent fill 151 | markeredgecolor='blue', # marker edge color: blue 152 | markersize=10, 153 |
linestyle='-', color='blue',label = "episode_length") 154 | plt.xlabel('Episode index', fontsize=12) 155 | plt.ylabel('episode_length', fontsize=12) 156 | 157 | # 添加图例 158 | plt.legend() 159 | # 显示图表 160 | plt.show() 161 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/4.Q-learning on policy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/4.Q-learning on policy.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter9_Policy Gradient/[Reinforce]Monte Carlo policy gradient.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | from torch.utils import data 7 | import torch 8 | import torch.nn as nn 9 | 10 | 11 | # 引用上级目录 12 | import sys 13 | sys.path.append("..") 14 | import grid_env 15 | 16 | """ 17 | REINFORCE algorithm 18 | 19 | """ 20 | 21 | "Define policy NN" 22 | class PolicyNet(nn.Module): 23 | def __init__(self, input_dim=2, output_dim=5): 24 | super(PolicyNet, self).__init__() 25 | self.fc = nn.Sequential( 26 | nn.Linear(in_features=input_dim, out_features=100), 27 | nn.ReLU(), 28 | nn.Linear(in_features=100, out_features=output_dim), 29 | nn.Softmax(dim=1) 30 | ) 31 | 32 | def forward(self, x): 33 | x = x.type(torch.float32) 34 | return self.fc(x) 35 | 36 | 37 | class REINFORCE(): 38 | def __init__(self,alpha,env = grid_env.GridEnv): 39 | self.gama = 0.9 # discount rate 40 | self.alpha = alpha #learning rate 41 | self.env = env 42 | self.action_space_size = env.action_space_size 43 | self.state_space_size = env.size ** 2 44 | self.reward_space_size, self.reward_list = len( 45 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 46 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 47 | print("self.state_value:", self.state_value) 48 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 49 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 50 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 51 | self.policy = self.mean_policy.copy() 52 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 53 | 54 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 55 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 56 | self.qvalue.shape, 57 | self.mean_policy.shape)) 58 | print('----------------------------------------------------------------') 59 | 60 | def show_policy(self): 61 | for state in range(self.state_space_size): 62 | for action in range(self.action_space_size): 63 | policy = self.policy[state, action] 64 | self.env.render_.draw_action(pos=self.env.state2pos(state), 65 | toward=policy * 0.4 * self.env.action_to_direction[action], 66 | radius=policy * 0.1) 67 | 68 | def show_state_value(self, state_value, y_offset=0.2): 69 | for state in range(self.state_space_size): 70 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 71 | 
y_offset=y_offset, 72 | size_discount=0.7) 73 | 74 | # One episode is a trajectory from the start state to the target state. 75 | def obtain_episode_net(self, policy_net, start_state, start_action): 76 | """ 77 | :param policy_net: policy network used to generate the episode 78 | :param start_state: initial state 79 | :param start_action: initial action 80 | :return: a list of dicts with keys: state, action, reward, next_state, next_action 81 | """ 82 | self.env.agent_location = self.env.state2pos(start_state) 83 | episode = [] 84 | next_action = start_action 85 | next_state = start_state 86 | terminated = False 87 | while not terminated: 88 | state = next_state 89 | action = next_action 90 | _, reward, terminated, _, _ = self.env.step(action) # take one environment step 91 | next_state = self.env.pos2state(self.env.agent_location) 92 | x, y = self.env.state2pos(next_state) / self.env.size 93 | prb = policy_net(torch.tensor((x, y)).reshape(-1, 2))[0] 94 | next_action = np.random.choice(np.arange(self.action_space_size), p = prb.detach().numpy()) 95 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 96 | "next_action": next_action}) # append one transition dict to the episode 97 | return episode 98 | 99 | def reinforce(self, epochs=20000): 100 | policy_net = PolicyNet() 101 | optimizer = torch.optim.Adam(policy_net.parameters(), lr=self.alpha) 102 | for epoch in range(epochs): 103 | prb = policy_net(torch.tensor((0, 0)).reshape(-1, 2))[0] 104 | print("epoch:{} , prb:{}".format(epoch, prb)) 105 | start_action = np.random.choice(np.arange(self.action_space_size), p=prb.detach().numpy()) 106 | episode = self.obtain_episode_net(policy_net, start_state=0, start_action=start_action) 107 | # print("episode:", episode) 108 | 109 | if len(episode) < 10: 110 | g = -100 111 | else: 112 | g = 0 113 | optimizer.zero_grad() # zero the accumulated gradients 114 | for step in reversed(range(len(episode))): 115 | reward = episode[step]['reward'] 116 | state = episode[step]['state'] 117 | action = episode[step]['action'] 118 | if len(episode) > 1000: 119 | # print(g, reward) 120 | pass 121 | g = self.gama * g + reward 122 | self.qvalue[state, action] = g 123 | x ,y = self.env.state2pos(state)/self.env.size 124 | prb = policy_net(torch.tensor((x, y)).reshape(-1, 2))[0] 125 | log_prob = torch.log(prb[action]) 126 | loss = -log_prob * g 127 | loss.backward() # backpropagate to accumulate the policy gradient 128 | self.writer.add_scalar("loss", float(loss.detach()), epoch) 129 | self.writer.add_scalar('g', g, epoch) 130 | self.writer.add_scalar('episode_length', len(episode), epoch) 131 | # print(epoch, len(episode), g) 132 | optimizer.step() 133 | for s in range(self.state_space_size): 134 | x, y = self.env.state2pos(s) / self.env.size 135 | prb = policy_net(torch.tensor((x, y)).reshape(-1, 2))[0] 136 | self.policy[s, :] = prb.detach().numpy() # detach and convert to NumPy before storing into the policy table 137 | self.writer.close() 138 | 139 | if __name__ == '__main__': 140 | grid_world = grid_env.GridEnv(size=5, target=[2, 3], 141 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 142 | render_mode='') 143 | solver = REINFORCE(alpha=0.001, env=grid_world) 144 | start_time = time.time() 145 | 146 | solver.reinforce() 147 | print("solver.state_value:", solver.state_value) 148 | 149 | 150 | end_time = time.time() 151 | cost_time = end_time - start_time 152 | print("cost_time:", cost_time) 153 | solver.show_policy() # solver.env.render() 154 | solver.show_state_value(solver.state_value, y_offset=0.25) 155 | solver.env.render() 156 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/model.py: -------------------------------------------------------------------------------- 1 |
import torch 2 | import torch.nn as nn 3 | 4 | 5 | class QNET(nn.Module): 6 | def __init__(self, input_dim=3, output_dim=1): 7 | super(QNET, self).__init__() 8 | self.fc = nn.Sequential( 9 | nn.Linear(in_features=input_dim, out_features=128), 10 | nn.ReLU(), 11 | nn.Linear(in_features=128, out_features=64), 12 | nn.ReLU(), 13 | nn.Linear(in_features=64, out_features=32), 14 | nn.ReLU(), 15 | nn.Linear(in_features=32, out_features=output_dim), 16 | ) 17 | 18 | def forward(self, x): 19 | x = x.type(torch.float32) 20 | return self.fc(x) 21 | 22 | 23 | class PolicyNet(nn.Module): 24 | def __init__(self, input_dim=2, output_dim=5): 25 | super(PolicyNet, self).__init__() 26 | self.fc = nn.Sequential( 27 | nn.Linear(in_features=input_dim, out_features=100), 28 | nn.ReLU(), 29 | nn.Linear(in_features=100, out_features=output_dim), 30 | nn.Softmax(dim=1) 31 | ) 32 | 33 | def forward(self, x): 34 | x = x.type(torch.float32) 35 | return self.fc(x) 36 | 37 | 38 | class DPolicyNet(nn.Module): 39 | def __init__(self, input_dim=2, output_dim=1): 40 | super(DPolicyNet, self).__init__() 41 | self.fc = nn.Sequential( 42 | nn.Linear(in_features=input_dim, out_features=100), 43 | nn.ReLU(), 44 | nn.Linear(in_features=100, out_features=output_dim), 45 | ) 46 | 47 | def forward(self, x): 48 | x = x.type(torch.float32) 49 | return self.fc(x) 50 | 51 | 52 | class ValueNet(torch.nn.Module): 53 | def __init__(self, input_dim=2, output_dim=1): 54 | super(ValueNet, self).__init__() 55 | self.fc = nn.Sequential( 56 | nn.Linear(in_features=input_dim, out_features=100), 57 | nn.ReLU(), 58 | nn.Linear(in_features=100, out_features=output_dim), 59 | ) 60 | 61 | def forward(self, x): 62 | x = x.type(torch.float32) 63 | return self.fc(x) 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | dqn = PolicyNet() 69 | input = torch.tensor([[2, 1], [3, 1]]) 70 | print(dqn) 71 | print(dqn(input)) 72 | -------------------------------------------------------------------------------- /img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/img.png -------------------------------------------------------------------------------- /动手学强化学习/DQN/DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | 6 | class Qnet(torch.nn.Module): 7 | ''' 只有一层隐藏层的Q网络 ''' 8 | def __init__(self, state_dim, hidden_dim, action_dim): 9 | super(Qnet, self).__init__() 10 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 11 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 12 | 13 | def forward(self, x): 14 | x = F.relu(self.fc1(x)) # 隐藏层使用ReLU激活函数 15 | return self.fc2(x) 16 | 17 | 18 | class VAnet(torch.nn.Module): 19 | def __init__(self, state_dim, hidden_dim, action_dim): 20 | super().__init__() 21 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 22 | self.fc_A = torch.nn.Linear(hidden_dim, action_dim) 23 | self.fc_V = torch.nn.Linear(hidden_dim, 1) 24 | 25 | def forward(self, x): 26 | x = F.relu(self.fc1(x)) 27 | A = self.fc_A(x) 28 | V = self.fc_V(x) 29 | Q = V + A - A.mean(1).view(-1, 1) 30 | return Q 31 | 32 | class DQN: 33 | ''' DQN算法 ''' 34 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 35 | epsilon, target_update, device): 36 | self.action_dim = action_dim 37 | self.q_net = Qnet(state_dim, hidden_dim, 38 | self.action_dim).to(device) # Q网络 39 | # 目标网络 40 | 
self.target_q_net = Qnet(state_dim, hidden_dim, 41 | self.action_dim).to(device) 42 | # 使用Adam优化器 43 | self.optimizer = torch.optim.Adam(self.q_net.parameters(), 44 | lr=learning_rate) 45 | self.gamma = gamma # 折扣因子 46 | self.epsilon = epsilon # epsilon-贪婪策略 47 | self.target_update = target_update # 目标网络更新频率 48 | self.count = 0 # 计数器,记录更新次数 49 | self.device = device 50 | 51 | def take_action(self, state): # epsilon-贪婪策略采取动作 52 | if np.random.random() < self.epsilon: 53 | action = np.random.randint(self.action_dim) 54 | else: 55 | state = torch.tensor([state], dtype=torch.float).to(self.device) 56 | action = self.q_net(state).argmax().item() 57 | return action 58 | 59 | def update(self, transition_dict): 60 | states = torch.tensor(transition_dict['states'], 61 | dtype=torch.float).to(self.device) 62 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 63 | self.device) 64 | rewards = torch.tensor(transition_dict['rewards'], 65 | dtype=torch.float).view(-1, 1).to(self.device) 66 | next_states = torch.tensor(transition_dict['next_states'], 67 | dtype=torch.float).to(self.device) 68 | dones = torch.tensor(transition_dict['dones'], 69 | dtype=torch.float).view(-1, 1).to(self.device) 70 | 71 | q_values = self.q_net(states).gather(1, actions) # Q值 72 | # 下个状态的最大Q值 73 | max_next_q_values = self.target_q_net(next_states).max(1)[0].view( 74 | -1, 1) 75 | q_targets = rewards + self.gamma * max_next_q_values * (1 - dones 76 | ) # TD误差目标 77 | dqn_loss = torch.mean(F.mse_loss(q_values, q_targets)) # 均方误差损失函数 78 | self.optimizer.zero_grad() # PyTorch中默认梯度会累积,这里需要显式将梯度置为0 79 | dqn_loss.backward() # 反向传播更新参数 80 | self.optimizer.step() 81 | 82 | if self.count % self.target_update == 0: #target_update 指C步之后更新目标网络Q_target的参数 \omega 83 | self.target_q_net.load_state_dict( 84 | self.q_net.state_dict()) # 更新目标网络 85 | self.count += 1 -------------------------------------------------------------------------------- /动手学强化学习/DQN/display.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | from DQN import DQN 5 | from time import sleep 6 | 7 | 8 | def dis_to_con(discrete_action, env, action_dim): 9 | action_lowbound = env.action_space.low[0] 10 | action_upbound = env.action_space.high[0] 11 | return np.array([discrete_action / (action_dim - 1) * (action_upbound - action_lowbound) + action_lowbound]) 12 | 13 | 14 | lr = 2e-3 15 | hidden_dim = 128 16 | gamma = 0.98 17 | epsilon = 0.0 18 | target_update = 10 19 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 20 | 21 | env = gym.make('Pendulum-v1') 22 | 23 | state_dim = env.observation_space.shape[0] 24 | action_dim = 11 25 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device) 26 | state_dict = torch.load('dqn_pendulumv1.pth') 27 | agent.q_net.load_state_dict(state_dict) 28 | agent.target_q_net.load_state_dict(state_dict) 29 | 30 | state = env.reset() 31 | done = False 32 | agent_return = 0 33 | while not done: 34 | action = agent.take_action(state) 35 | action = dis_to_con(action, env, action_dim) 36 | next_state, reward, done, _ = env.step(action) 37 | agent_return += reward 38 | env.render() 39 | state = next_state 40 | sleep(0.01) 41 | 42 | print('DQN return:', agent_return) 43 | 44 | 45 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, 'DoubleDQN') 46 | state_dict = torch.load('double_dqn_pendulumv1.pth') 47 | 
agent.q_net.load_state_dict(state_dict) 48 | agent.target_q_net.load_state_dict(state_dict) 49 | 50 | state = env.reset() 51 | done = False 52 | agent_return = 0 53 | while not done: 54 | action = agent.take_action(state) 55 | action = dis_to_con(action, env, action_dim) 56 | next_state, reward, done, _ = env.step(action) 57 | agent_return += reward 58 | env.render() 59 | state = next_state 60 | sleep(0.01) 61 | 62 | print('Double DQN return:', agent_return) 63 | 64 | 65 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, 'DuelingDQN') 66 | state_dict = torch.load('dueling_dqn_pendulumv1.pth') 67 | agent.q_net.load_state_dict(state_dict) 68 | agent.target_q_net.load_state_dict(state_dict) 69 | 70 | state = env.reset() 71 | done = False 72 | agent_return = 0 73 | while not done: 74 | action = agent.take_action(state) 75 | action = dis_to_con(action, env, action_dim) 76 | next_state, reward, done, _ = env.step(action) 77 | agent_return += reward 78 | env.render() 79 | state = next_state 80 | sleep(0.01) 81 | 82 | print('Dueling DQN return:', agent_return) -------------------------------------------------------------------------------- /动手学强化学习/DQN/main.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import torch 3 | import random 4 | import numpy as np 5 | from DQN import DQN 6 | from tqdm import tqdm 7 | import matplotlib.pyplot as plt 8 | 9 | import sys 10 | import os 11 | # 将上级目录添加到 sys.path 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | import rl_utils 14 | 15 | def dis_to_con(discrete_action, env, action_dim): 16 | action_lowbound = env.action_space.low[0] 17 | action_upbound = env.action_space.high[0] 18 | return np.array([discrete_action / (action_dim - 1) * (action_upbound - action_lowbound) + action_lowbound]) 19 | 20 | 21 | def train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size): 22 | return_list = [] 23 | max_q_value_list = [] 24 | max_q_value = 0 25 | 26 | for i in range(10): 27 | with tqdm(total=num_episodes // 10, desc='Iteration %d' % i) as pbar: 28 | for i_episode in range(num_episodes // 10): 29 | episode_return = 0 30 | state, *_ = env.reset() 31 | done = False 32 | while not done: 33 | # print("state", state) 34 | action = agent.take_action(state) 35 | # max_q_value = agent.max_q_value(state) * 0.005 + max_q_value * 0.995 36 | # max_q_value_list.append(max_q_value) 37 | 38 | # action_continuous = dis_to_con(action, env, agent.action_dim) 39 | next_state, reward, done, *_ = env.step(action) #实参或者为: action_continuous 40 | replay_buffer.add(state, action, reward, next_state, done) 41 | state = next_state 42 | episode_return += reward 43 | 44 | if replay_buffer.size() > minimal_size: 45 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 46 | transition_dict = dict( 47 | states=b_s, 48 | actions=b_a, 49 | rewards=b_r, 50 | next_states=b_ns, 51 | dones=b_d 52 | ) 53 | agent.update(transition_dict) 54 | return_list.append(episode_return) 55 | 56 | if (i_episode + 1) % 10 == 0: 57 | pbar.set_postfix({ 58 | 'episode': '%d' % (num_episodes / 10 * i + i_episode + 1), 59 | 'return': '%.3f' % np.mean(return_list[-10:]) 60 | }) 61 | pbar.update(1) 62 | return return_list, max_q_value_list 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | lr = 2e-3 68 | num_episodes = 500 69 | hidden_dim = 128 70 | gamma = 0.99 71 | epsilon = 0.01 72 | target_update = 10 73 | buffer_size = 10000 74 | minimal_size = 
500 75 | batch_size = 64 76 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 77 | print(f"Using device: {device}") 78 | # env_name = 'Pendulum-v1' 79 | env_name = "CartPole-v0" 80 | env = gym.make(env_name) 81 | print("CartPole-v0 env:",env) 82 | 83 | random.seed(0) 84 | np.random.seed(0) 85 | env.reset(seed = 0) # 新版gymnausim 86 | # env.seed(0) #旧版gym 87 | torch.manual_seed(0) 88 | 89 | 90 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 91 | print("replay_buffer建立成功!", replay_buffer) 92 | state_dim = env.observation_space.shape[0] 93 | action_dim = env.action_space.n 94 | print("state_dim:", state_dim) 95 | print("action_dim:", action_dim) 96 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device) 97 | # return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 98 | return_list, _ = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 99 | 100 | # torch.save(agent.q_net.state_dict(), 'dqn_pendulumv1.pth') 101 | episodes_list = list(range(len(return_list))) 102 | 103 | plt.plot(episodes_list, return_list) 104 | plt.xlabel('Episodes') 105 | plt.ylabel('Returns') 106 | plt.title('DQN Returns on {}'.format(env_name)) 107 | plt.show() 108 | ##将结果平滑处理 109 | mv_return = rl_utils.moving_average(return_list, 9) 110 | plt.plot(episodes_list, mv_return) 111 | plt.xlabel('Episodes') 112 | plt.ylabel('Returns') 113 | plt.title('DQN on {}'.format(env_name)) 114 | plt.show() 115 | 116 | 117 | 118 | # -------------------------------------------------------- 119 | # 120 | # print("Double DQN") 121 | # random.seed(0) 122 | # np.random.seed(0) 123 | # env.seed(0) 124 | # torch.manual_seed(0) 125 | # 126 | # replay_buffer = ReplayBuffer(buffer_size) 127 | # agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, "DoubleDQN") 128 | # return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 129 | # 130 | # torch.save(agent.q_net.state_dict(), 'double_dqn_pendulumv1.pth') 131 | # episodes_list = list(range(len(return_list))) 132 | # mv_returns = moving_average(return_list, 5) 133 | # plt.plot(episodes_list, mv_returns) 134 | # plt.xlabel('Episodes') 135 | # plt.ylabel('Returns') 136 | # plt.title('Double DQN Returns on {}'.format(env_name)) 137 | # plt.show() 138 | # -------------------------------------------------------- 139 | # frames_list = list(range(len(max_q_value_list))) 140 | # plt.plot(frames_list, max_q_value_list) 141 | # plt.axhline(0, c='orange', ls='--') 142 | # plt.axhline(10, c='red', ls='--') 143 | # plt.xlabel('Frames') 144 | # plt.ylabel('Q value') 145 | # plt.title('Double DQN Q value on {}'.format(env_name)) 146 | # plt.show() 147 | # 148 | # print("Dueling DQN") 149 | # random.seed(0) 150 | # np.random.seed(0) 151 | # env.seed(0) 152 | # torch.manual_seed(0) 153 | # 154 | # replay_buffer = ReplayBuffer(buffer_size) 155 | # agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, "DuelingDQN") 156 | # return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 157 | # 158 | # torch.save(agent.q_net.state_dict(), 'dueling_dqn_pendulumv1.pth') 159 | # episodes_list = list(range(len(return_list))) 160 | # mv_returns = moving_average(return_list, 5) 161 | # plt.plot(episodes_list, mv_returns) 162 | # plt.xlabel('Episodes') 163 | # plt.ylabel('Returns') 164 | # plt.title('Dueling 
DQN Returns on {}'.format(env_name)) 165 | # plt.show() 166 | # 167 | # frames_list = list(range(len(max_q_value_list))) 168 | # plt.plot(frames_list, max_q_value_list) 169 | # plt.axhline(0, c='orange', ls='--') 170 | # plt.axhline(10, c='red', ls='--') 171 | # plt.xlabel('Frames') 172 | # plt.ylabel('Q value') 173 | # plt.title('Dueling DQN Q value on {}'.format(env_name)) 174 | # plt.show() -------------------------------------------------------------------------------- /动手学强化学习/Hands-on-RL/README.md: -------------------------------------------------------------------------------- 1 | # 动手学强化学习 2 | 3 | Tips: 若运行gym环境的代码时遇到报错,请尝试pip install gym==0.18.3安装此版本的gym库,若仍有问题,欢迎提交issue! 4 | 5 | 欢迎来到《动手学强化学习》(Hands-on Reinforcement Learning)的地带。该系列从强化学习的定义等基础讲起,一步步由浅入深,介绍目前一些主流的强化学习算法。每一章内容都是一个Jupyter Notebook,内含详细的图文介绍和代码讲解。 6 | 7 | * 由于GitHub上渲染notebook效果有限,我们推荐读者前往[Hands-on RL主页](https://hrl.boyuai.com/)进行浏览,我们在此提供了纯代码版本的notebook,供大家下载运行。 8 | 9 | * 欢迎在[京东](https://item.jd.com/13129509.html)和[当当网](http://product.dangdang.com/29391150.html)购买《动手学强化学习》。 10 | 11 | * 如果你发现了本书的任何问题,或者有任何改善建议的,欢迎提交issue! 12 | 13 | * 本书配套的强化学习课程已上线到[伯禹学习平台](https://www.boyuai.com/elites/course/xVqhU42F5IDky94x),所有人都可以免费学习和讨论。 14 | 15 | ![](https://boyuai.oss-cn-shanghai.aliyuncs.com/disk/tmp/hrl-poster.jpeg) 16 | -------------------------------------------------------------------------------- /动手学强化学习/Hands-on-RL/rl_utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | import collections 5 | import random 6 | 7 | class ReplayBuffer: 8 | def __init__(self, capacity): 9 | self.buffer = collections.deque(maxlen=capacity) 10 | 11 | def add(self, state, action, reward, next_state, done): 12 | self.buffer.append((state, action, reward, next_state, done)) 13 | 14 | def sample(self, batch_size): 15 | transitions = random.sample(self.buffer, batch_size) 16 | state, action, reward, next_state, done = zip(*transitions) 17 | return np.array(state), action, reward, np.array(next_state), done 18 | 19 | def size(self): 20 | return len(self.buffer) 21 | 22 | def moving_average(a, window_size): 23 | cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 24 | middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size 25 | r = np.arange(1, window_size-1, 2) 26 | begin = np.cumsum(a[:window_size-1])[::2] / r 27 | end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1] 28 | return np.concatenate((begin, middle, end)) 29 | 30 | def train_on_policy_agent(env, agent, num_episodes): 31 | return_list = [] 32 | for i in range(10): 33 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 34 | for i_episode in range(int(num_episodes/10)): 35 | episode_return = 0 36 | transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []} 37 | state = env.reset() 38 | done = False 39 | while not done: 40 | action = agent.take_action(state) 41 | next_state, reward, done, _ = env.step(action) 42 | transition_dict['states'].append(state) 43 | transition_dict['actions'].append(action) 44 | transition_dict['next_states'].append(next_state) 45 | transition_dict['rewards'].append(reward) 46 | transition_dict['dones'].append(done) 47 | state = next_state 48 | episode_return += reward 49 | return_list.append(episode_return) 50 | agent.update(transition_dict) 51 | if (i_episode+1) % 10 == 0: 52 | pbar.set_postfix({'episode': '%d' % (num_episodes/10 * i + i_episode+1), 'return': '%.3f' % 
np.mean(return_list[-10:])}) 53 | pbar.update(1) 54 | return return_list 55 | 56 | def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size): 57 | return_list = [] 58 | for i in range(10): 59 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 60 | for i_episode in range(int(num_episodes/10)): 61 | episode_return = 0 62 | state = env.reset() 63 | done = False 64 | while not done: 65 | action = agent.take_action(state) 66 | next_state, reward, done, _ = env.step(action) 67 | replay_buffer.add(state, action, reward, next_state, done) 68 | state = next_state 69 | episode_return += reward 70 | if replay_buffer.size() > minimal_size: 71 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 72 | transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d} 73 | agent.update(transition_dict) 74 | return_list.append(episode_return) 75 | if (i_episode+1) % 10 == 0: 76 | pbar.set_postfix({'episode': '%d' % (num_episodes/10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])}) 77 | pbar.update(1) 78 | return return_list 79 | 80 | 81 | def compute_advantage(gamma, lmbda, td_delta): 82 | td_delta = td_delta.detach().numpy() 83 | advantage_list = [] 84 | advantage = 0.0 85 | for delta in td_delta[::-1]: 86 | advantage = gamma * lmbda * advantage + delta 87 | advantage_list.append(advantage) 88 | advantage_list.reverse() 89 | return torch.tensor(advantage_list, dtype=torch.float) 90 | -------------------------------------------------------------------------------- /动手学强化学习/README.md: -------------------------------------------------------------------------------- 1 | This file aims to reproduce https://hrl.boyuai.com/chapter/2/dqn%E7%AE%97%E6%B3%95 2 | 3 | 1. DQN 4 | 2. Policy gradient (Reinforce) 5 | 3. Actor Critic 6 | 4. 
DDPG 7 | 8 | 最后扩展到MADDPG。 9 | 10 | ##[2023]py script 形式的HandsOnRL 11 | https://github.com/peterwu4084/HandsOnRL/tree/main -------------------------------------------------------------------------------- /动手学强化学习/rl_utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | import collections 5 | import random 6 | 7 | class ReplayBuffer: 8 | ''' 经验回放池 ''' 9 | def __init__(self, capacity): 10 | self.buffer = collections.deque(maxlen=capacity) # 队列,先进先出 11 | 12 | def add(self, state, action, reward, next_state, done): # 将数据加入buffer 13 | self.buffer.append((state, action, reward, next_state, done)) 14 | 15 | def sample(self, batch_size): # 从buffer中采样数据,数量为batch_size 16 | transitions = random.sample(self.buffer, batch_size) 17 | state, action, reward, next_state, done = zip(*transitions) 18 | return np.array(state), action, reward, np.array(next_state), done 19 | 20 | def size(self): # 目前buffer中数据的数量 21 | return len(self.buffer) 22 | 23 | def moving_average(a, window_size): 24 | a = np.array(a) # 先转换为 NumPy 数组 25 | a = np.where(a > 200, 200, a) # 将大于200的值替换为200 26 | cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 27 | middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size 28 | r = np.arange(1, window_size-1, 2) 29 | begin = np.cumsum(a[:window_size-1])[::2] / r 30 | end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1] 31 | return np.concatenate((begin, middle, end)) 32 | 33 | def train_on_policy_agent(env, agent, num_episodes): 34 | return_list = [] 35 | for i in range(10): 36 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 37 | for i_episode in range(int(num_episodes/10)): 38 | episode_return = 0 39 | transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []} 40 | state = env.reset() 41 | done = False 42 | while not done: 43 | action = agent.take_action(state) 44 | next_state, reward, done, _ = env.step(action) 45 | transition_dict['states'].append(state) 46 | transition_dict['actions'].append(action) 47 | transition_dict['next_states'].append(next_state) 48 | transition_dict['rewards'].append(reward) 49 | transition_dict['dones'].append(done) 50 | state = next_state 51 | episode_return += reward 52 | return_list.append(episode_return) 53 | agent.update(transition_dict) 54 | if (i_episode+1) % 10 == 0: 55 | pbar.set_postfix({'episode': '%d' % (num_episodes/10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])}) 56 | pbar.update(1) 57 | return return_list 58 | 59 | def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size): 60 | return_list = [] 61 | for i in range(10): 62 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 63 | for i_episode in range(int(num_episodes/10)): 64 | episode_return = 0 65 | state = env.reset() 66 | done = False 67 | while not done: 68 | action = agent.take_action(state) 69 | next_state, reward, done, _ = env.step(action) 70 | replay_buffer.add(state, action, reward, next_state, done) 71 | state = next_state 72 | episode_return += reward 73 | if replay_buffer.size() > minimal_size: 74 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 75 | transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d} 76 | agent.update(transition_dict) 77 | return_list.append(episode_return) 78 | if (i_episode+1) % 10 == 0: 79 | pbar.set_postfix({'episode': '%d' % 
(num_episodes/10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])}) 80 | pbar.update(1) 81 | return return_list 82 | 83 | 84 | def compute_advantage(gamma, lmbda, td_delta): 85 | td_delta = td_delta.detach().numpy() 86 | advantage_list = [] 87 | advantage = 0.0 88 | for delta in td_delta[::-1]: 89 | advantage = gamma * lmbda * advantage + delta 90 | advantage_list.append(advantage) 91 | advantage_list.reverse() 92 | return torch.tensor(advantage_list, dtype=torch.float) 93 | -------------------------------------------------------------------------------- /动手学强化学习/策略梯度/Reinforce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | class PolicyNet(torch.nn.Module): 5 | def __init__(self, state_dim, hidden_dim, action_dim): 6 | super(PolicyNet, self).__init__() 7 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 8 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 9 | 10 | def forward(self, x): 11 | x = F.relu(self.fc1(x)) 12 | return F.softmax(self.fc2(x), dim=1) 13 | 14 | class REINFORCE: 15 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 16 | device): 17 | self.policy_net = PolicyNet(state_dim, hidden_dim, 18 | action_dim).to(device) 19 | self.optimizer = torch.optim.Adam(self.policy_net.parameters(), 20 | lr=learning_rate) # 使用Adam优化器 21 | self.gamma = gamma # 折扣因子 22 | self.device = device 23 | 24 | def take_action(self, state): # 根据动作概率分布随机采样 25 | state = torch.tensor([state], dtype=torch.float).to(self.device) 26 | probs = self.policy_net(state) 27 | action_dist = torch.distributions.Categorical(probs) 28 | action = action_dist.sample() 29 | return action.item() 30 | 31 | def update(self, transition_dict): 32 | reward_list = transition_dict['rewards'] 33 | state_list = transition_dict['states'] 34 | action_list = transition_dict['actions'] 35 | 36 | G = 0 37 | self.optimizer.zero_grad() 38 | for i in reversed(range(len(reward_list))): # 从最后一步算起 39 | reward = reward_list[i] 40 | state = torch.tensor([state_list[i]], 41 | dtype=torch.float).to(self.device) 42 | action = torch.tensor([action_list[i]]).view(-1, 1).to(self.device) 43 | log_prob = torch.log(self.policy_net(state).gather(1, action)) 44 | G = self.gamma * G + reward 45 | loss = -log_prob * G # 每一步的损失函数 46 | loss.backward() # 反向传播计算梯度 47 | self.optimizer.step() # 梯度下降 -------------------------------------------------------------------------------- /动手学强化学习/策略梯度/display.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/动手学强化学习/策略梯度/display.py -------------------------------------------------------------------------------- /动手学强化学习/策略梯度/main.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from tqdm import tqdm 7 | from Reinforce import * 8 | 9 | # 将上级目录添加到 sys.path 10 | import sys 11 | import os 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | import rl_utils 14 | 15 | learning_rate = 1e-3 16 | num_episodes = 1000 17 | hidden_dim = 128 18 | gamma = 0.98 19 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 20 | "cpu") 21 | print(f"Using device: {device}") 22 | 23 | env_name = "CartPole-v0" 24 | env = 
gym.make(env_name) 25 | env.reset(seed = 0) 26 | torch.manual_seed(0) 27 | state_dim = env.observation_space.shape[0] 28 | action_dim = env.action_space.n 29 | agent = REINFORCE(state_dim, hidden_dim, action_dim, learning_rate, gamma, 30 | device) 31 | 32 | return_list = [] 33 | for i in range(10): 34 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 35 | for i_episode in range(int(num_episodes / 10)): 36 | episode_return = 0 37 | transition_dict = { 38 | 'states': [], 39 | 'actions': [], 40 | 'next_states': [], 41 | 'rewards': [], 42 | 'dones': [] 43 | } 44 | state, *_ = env.reset() 45 | done = False 46 | while not done: 47 | action = agent.take_action(state) 48 | next_state, reward, done, *_ = env.step(action) 49 | transition_dict['states'].append(state) 50 | transition_dict['actions'].append(action) 51 | transition_dict['next_states'].append(next_state) 52 | transition_dict['rewards'].append(reward) 53 | transition_dict['dones'].append(done) 54 | state = next_state 55 | episode_return += reward 56 | return_list.append(episode_return) 57 | agent.update(transition_dict) 58 | if (i_episode + 1) % 10 == 0: 59 | pbar.set_postfix({ 60 | 'episode': 61 | '%d' % (num_episodes / 10 * i + i_episode + 1), 62 | 'return': 63 | '%.3f' % np.mean(return_list[-10:]) 64 | }) 65 | pbar.update(1) 66 | 67 | episodes_list = list(range(len(return_list))) 68 | plt.plot(episodes_list, return_list) 69 | plt.xlabel('Episodes') 70 | plt.ylabel('Returns') 71 | plt.title('REINFORCE on {}'.format(env_name)) 72 | plt.show() 73 | 74 | mv_return = rl_utils.moving_average(return_list, 9) 75 | plt.plot(episodes_list, mv_return) 76 | plt.xlabel('Episodes') 77 | plt.ylabel('Returns') 78 | plt.title('REINFORCE on {}'.format(env_name)) 79 | plt.show() --------------------------------------------------------------------------------
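A minimal evaluation sketch for the trained REINFORCE agent, in the same spirit as the DQN display.py shown earlier, follows. It is a sketch under stated assumptions rather than the repository's own 策略梯度/display.py: it assumes the policy network is saved after training (for example with torch.save(agent.policy_net.state_dict(), 'reinforce_cartpole.pth'); that checkpoint name is a hypothetical placeholder, since main.py above does not save weights) and it follows the gymnasium API used in main.py.

import gymnasium as gym
import torch
from Reinforce import REINFORCE

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Recreate the CartPole environment with on-screen rendering (gymnasium API, as in main.py).
env = gym.make("CartPole-v0", render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Rebuild the agent with the same hyperparameters used for training in main.py.
agent = REINFORCE(state_dim, hidden_dim=128, action_dim=action_dim,
                  learning_rate=1e-3, gamma=0.98, device=device)
# Hypothetical checkpoint name; add a matching torch.save(...) call to main.py first.
agent.policy_net.load_state_dict(torch.load('reinforce_cartpole.pth', map_location=device))

state, *_ = env.reset(seed=0)
done = False
episode_return = 0
while not done:
    action = agent.take_action(state)  # sample an action from the learned policy
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated     # stop on termination or time-limit truncation
    episode_return += reward

print('REINFORCE return:', episode_return)
env.close()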