├── .gitignore ├── MADDPG_Continous ├── .gitignore ├── KNOWN_ISSUES.md ├── README.md ├── README_EN.md ├── agents │ ├── Centralized │ │ └── readme.md │ ├── Independent │ │ └── readme.md │ └── maddpg │ │ ├── DDPG_agent.py │ │ ├── MADDPG_agent.py │ │ ├── NN_actor.py │ │ ├── NN_critic.py │ │ ├── buffer.py │ │ └── readme.md ├── envs │ ├── custom_agents_dynamics.py │ ├── simple_env_fixed render.py │ └── simple_tag_env.py ├── main_evaluate.py ├── main_evaluate_save_render2gif.py ├── main_parameters.py ├── main_train.py ├── plot │ ├── convert_gif_to_loop.py │ ├── demo-rewards_plot_ma.png │ ├── plot_rewards.py │ ├── simple_tag_v3_demo.gif │ └── simple_tag_v3_demo_loop.gif └── utils │ ├── conda-environment.yml │ ├── linux_environment.yml │ ├── logger.py │ ├── mac_arm_M4_environment.yml │ ├── pip-requirements.txt │ ├── pip-requirements_mac_arm_M4.txt │ ├── runner.py │ └── setupPettingzoo.py ├── MATD3_Continous ├── agents │ ├── MATD3_agent.py │ ├── MATD3_runner.py │ ├── NN_actor_td3.py │ ├── NN_critic_td3.py │ ├── TD3_agent.py │ └── buffer.py ├── envs │ ├── custom_agents_dynamics.py │ └── simple_tag_env.py ├── main │ ├── main_evaluate.py │ ├── main_parameters.py │ └── main_train.py ├── plot │ ├── README.md │ ├── plot_rewards.py │ └── training_rewards_demo.png ├── readme.md ├── readme_en.md └── utils │ ├── conda-environment.yml │ ├── linux_environment.yml │ ├── logger.py │ ├── mac_arm_M4_environment.yml │ ├── pip-requirements.txt │ ├── pip-requirements_mac_arm_M4.txt │ └── setupPettingzoo.py ├── README.md ├── README_en.md ├── RL_Learning-main ├── README.md └── scripts │ ├── Chapter10_Actor Critic │ ├── 1.[QAC]Simplest actor critic.py │ ├── 2.[A2C]Advantage actor critic.py │ ├── 3.1Importance sampling.py │ ├── 3.[Importance sampling]Off-policy actor critic.py │ └── 4.[DPG]Deterministic actor critic.py │ ├── Chapter4_Value iteration and Policy iteration │ ├── plot_figure │ │ ├── policy_iteration.png │ │ └── value_iteration.png │ ├── policy_iteration.py │ └── value iteration.py │ ├── Chapter5_Monte Carlo Methods │ ├── MC_Basic.py │ ├── MC_Exploring_Starts.py │ └── MC_epsilon_greedy.py │ ├── Chapter6_Stochastic_approximation │ └── Robbins-Monro algorithm.py │ ├── Chapter7_Temporal-Difference learning │ ├── 1.Sarsa.py │ ├── 2.n-step Sarsa.py │ ├── 3.Q-learning.py │ └── 4.Q-learning on policy.py │ ├── Chapter8_Value Function Approximaton │ ├── 1.TD-Linear.py │ ├── 2.Sarsa with function approximation.py │ ├── 3.Q-learning with function approximation.py │ └── 4.[DQN]Deep Q-Network or Q-learning.py │ ├── Chapter9_Policy Gradient │ └── [Reinforce]Monte Carlo policy gradient.py │ ├── grid_env.py │ ├── model.py │ ├── render.py │ └── solver.py ├── img.png └── 动手学强化学习 ├── DQN ├── DQN.py ├── display.py └── main.py ├── Hands-on-RL ├── README.md ├── rl_utils.py ├── 第10章-Actor-Critic算法.ipynb ├── 第11章-TRPO算法.ipynb ├── 第12章-PPO算法.ipynb ├── 第13章-DDPG算法.ipynb ├── 第14章-SAC算法.ipynb ├── 第15章-模仿学习.ipynb ├── 第16章-模型预测控制.ipynb ├── 第17章-基于模型的策略优化.ipynb ├── 第18章-离线强化学习.ipynb ├── 第19章-目标导向的强化学习.ipynb ├── 第20章-多智能体强化学习入门.ipynb ├── 第21章-多智能体强化学习进阶.ipynb ├── 第2章-多臂老虎机问题.ipynb ├── 第3章-马尔可夫决策过程.ipynb ├── 第4章-动态规划算法.ipynb ├── 第5章-时序差分算法.ipynb ├── 第6章-Dyna-Q算法.ipynb ├── 第7章-DQN算法.ipynb ├── 第8章-DQN改进算法.ipynb └── 第9章-策略梯度算法.ipynb ├── README.md ├── rl_utils.py └── 策略梯度 ├── Reinforce.py ├── display.py └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | 3 | #忽略所有的__pycache__目录以及所有的.pyc、.pyo和.pyd文件 4 | **/__pycache__ 5 | *.py[cod] 6 | 7 | 
#如果你想忽略所有名为logs的文件夹,无论它们在项目中的位置如何,你可以在.gitignore文件中添加以下内容: 8 | **/logs/ 9 | # ignore the work I'm doing now 10 | RL-basic-algorithm/Graph based Multi-agent path planning/ 11 | 12 | #忽略 .\Multi-agents-RL\Multi-agent Partial Environment\MADDPG\maddpg-pettingzoo-pytorch-master下的* 13 | **/results/ 14 | 15 | 16 | #macOS 17 | **/.DS_Store 18 | Desktop.ini 19 | 20 | # Thumbnail cache files 21 | ._* 22 | Thumbs.db 23 | 24 | # Files that might appear on external disks 25 | .Spotlight-V100 26 | .Trashes 27 | 28 | # Compiled Python files 29 | *.pyc 30 | 31 | # 32 | plot/* 33 | 34 | # Temp File 35 | *.swp 36 | *.swa 37 | *.swo 38 | 39 | # github merge file 40 | *.orig 41 | 42 | # virtualenv 43 | venv 44 | __pycache__ 45 | 46 | #vscode 47 | .vscode 48 | 49 | 50 | 51 | **/matd3_models/ 52 | # 保留 maddpg_models 文件夹本身 53 | !**/matd3_models/.gitkeep 54 | 55 | # 忽略所有名为 data 的文件夹中的内容,但保留 data 文件夹本身 56 | **/matd3_data/* 57 | !**/matd3_data/.gitkeep 58 | 59 | **/log_td3_main/* 60 | !**/log_td3_main/.gitkeep 61 | -------------------------------------------------------------------------------- /MADDPG_Continous/.gitignore: -------------------------------------------------------------------------------- 1 | # Python 编译文件 2 | **/__pycache__/ 3 | *.py[cod] 4 | *.pyo 5 | *.pyd 6 | 7 | # PyCharm 设置 8 | .idea/ 9 | 10 | # VSCode 设置 11 | .vscode/ 12 | 13 | #如果你想忽略所有名为maddpg_models的文件夹,无论它们在项目中的位置如何,你可以在.gitignore文件中添加以下内容: 14 | **/maddpg_models/ 15 | # 保留 maddpg_models 文件夹本身 16 | !**/maddpg_models/.gitkeep 17 | 18 | # 忽略所有名为 data 的文件夹中的内容,但保留 data 文件夹本身 19 | **/data/* 20 | !**/data/.gitkeep 21 | 22 | **/logs/* 23 | !**/logs/.gitkeep 24 | 25 | # 深度学习模型 26 | *.pth 27 | 28 | # 操作系统特定的文件 29 | .DS_Store # macOS 30 | Thumbs.db # Windows -------------------------------------------------------------------------------- /MADDPG_Continous/README.md: -------------------------------------------------------------------------------- 1 | [🇨🇳 中文文档](README.md) | [🇺🇸 English](README_EN.md) 2 | 3 | # 多智能体深度强化学习MADDPG算法 - Predator-Prey追逃博弈 4 | 5 | ![项目状态](https://img.shields.io/badge/状态-重构完成-green) ![MADDPG](https://img.shields.io/badge/MADDPG-已实现-success)![Python](https://img.shields.io/badge/python-3.11.8%2B-blue) 6 | 7 | >**本项目专为Predator-Prey追逃博弈任务优化!** 在`PettingZoo MPE`环境基础上重构修改,提供了完整的多智能体协作与对抗环境,适用于围捕控制、群体智能和策略博弈研究。 8 | 9 | > Pettingzoo MPE环境:https://github.com/Farama-Foundation/PettingZoo 10 | 11 | > MADDPG algorithm Reference: https://github.com/Git-123-Hub/maddpg-pettingzoo-pytorch 12 | 13 | > 2025.4.26 update: MPE环境已经拆分出PettingZoo,详情请见MPE2:https://github.com/Farama-Foundation/MPE2 14 | 15 | ## 📈 训练效果 16 |
![智能体行为](plot/simple_tag_v3_demo_loop.gif)

训练后的智能体行为展示:捕食者(红色)追逐猎物(绿色)的过程

![训练收敛结果](plot/demo-rewards_plot_ma.png)

MADDPG算法在simple_tag_v3环境中的奖励收敛曲线
23 | 24 | > **⚠️ 重要提示**:使用前请查看🔍 [**已知问题与解决方案KNOWN_ISSUES.md**](KNOWN_ISSUES.md)文档,了解常见问题的解决方法,特别是Windows系统的渲染卡死问题和PettingZoo版本兼容性问题。 25 | 26 | > **奖励函数修改**:官方的奖励配置无法训练出好的效果,需要修改追捕者的奖励函数 27 | 28 | > 当前状态:MADDPG算法已在 `/agents/maddpg/*.py` 中实现 29 | 30 | ## 🚀 实现进度 31 | | 算法 | 状态 | 位置 | 核心组件 | 32 | |----------------|--------|----------------------|----------------------------------| 33 | | MADDPG | ✅ 1.0 | `agents/maddpg/` | MADDPG_agent, DDPG_agent, buffer | 34 | | Independent RL | ⏳ 待完成 | `agents/independent/`| IndependentRL (计划中) | 35 | | Centralized RL | ⏳ 待完成 | `agents/centralized/`| CentralizedRL (计划中) | 36 | 37 | > 注意:MADDPG模块目前位于agents根目录(buffer.py, DDPG_agent.py等),但功能完整可用! 38 | 39 | ## 🏗️ 项目结构 40 | ```tree 41 | MADDPG_Continous/ 42 | ├── agents/ # 核心实现 43 | │ ├── maddpg/ # MADDPG算法实现 44 | │ │ ├── MADDPG_agent.py # 多智能体控制器 45 | │ │ ├── DDPG_agent.py # 基础DDPG实现 46 | │ │ ├── buffer.py # 经验回放缓冲区 47 | │ │ └── NN_(actor|critic).py # 神经网络模块 48 | │ ├── Independent/ # 独立RL实现(计划中) 49 | │ └── Centralized/ # 集中式RL实现(计划中) 50 | ├── envs/ # 自定义环境 51 | │ ├── custom_agents_dynamics.py # 扩展物理引擎 52 | │ └── simple_tag_env.py # 修改后的标签环境 53 | ├── utils/ # 工具模块 54 | │ ├── runner.py # 训练运行器 55 | │ ├── logger.py # 训练日志记录器 56 | │ ├── conda-environment.yml # Conda环境配置文件 57 | │ ├── linux_environment.yml # Linux环境配置文件 58 | │ ├── mac_arm_M4_environment.yml # Mac M系列芯片环境配置文件 59 | │ ├── pip-requirements.txt # 通用依赖项要求 60 | │ ├── pip-requirements_mac_arm_M4.txt # Mac M芯片特定依赖项 61 | │ └── setupPettingzoo.py # PettingZoo环境设置脚本 62 | ├── main_train.py # 统一训练入口 63 | ├── main_evaluate.py # 统一评估入口 64 | ├── main_evaluate_save_render2gif.py # 渲染并保存GIF 65 | └── main_parameters.py # 统一参数配置 66 | ``` 67 | 68 | ## 🛠️ 快速开始 69 | 70 | ### 环境配置 71 | 72 | > 相关配置需求在utils/文件夹下。 73 | 74 | ### Linux环境(ubuntu) 75 | 1. 使用linux_environment.yml创建新环境 76 | ```bash 77 | # 注意:将"MPE"替换为您喜欢的环境名称 78 | conda env create -f utils/linux_environment.yml -n MPE 79 | # 激活刚创建的环境 80 | conda activate MPE 81 | ``` 82 | 2. pip安装核心依赖 83 | ```bash 84 | pip install -r utils/pip-requirements.txt 85 | ``` 86 | ### Mac M系列芯片环境 87 | 1. 使用mac_arm_M4_environment.yml创建新conda环境 88 | ```bash 89 | # 注意:将"MPE"替换为您喜欢的环境名称 90 | conda env create -f utils/mac_arm_M4_environment.yml -n MPE 91 | # 激活刚创建的环境 92 | conda activate MPE 93 | ``` 94 | 2. pip安装Mac M芯片专用依赖 95 | ```bash 96 | pip install -r utils/pip-requirements_mac_arm_M4.txt 97 | ``` 98 | 99 | ### Windows创建并激活虚拟环境(推荐) 100 | 1. 使用conda-environment.yml创建新环境 101 | ```bash 102 | # 注意:将"MPE"替换为您喜欢的环境名称 103 | conda env create -f utils/conda-environment.yml -n MPE 104 | # 激活刚创建的环境 105 | conda activate MPE 106 | ``` 107 | 2. pip安装核心依赖 108 | ```bash 109 | pip install -r utils/pip-requirements.txt 110 | ``` 111 | ### 手动安装依赖 112 | > 上述虚拟环境创建成功后,您需要手动安装以下依赖: 113 | 3. 从PyTorch官网安装对应版本的PyTorch 114 | ```bash 115 | # 请访问 https://pytorch.org 选择适合您系统的安装命令 116 | # 例如: 117 | pip3 install torch torchvision torchaudio 118 | ``` 119 | 120 | 4. 2025.4.26 update: 安装`PettingZoo 1.25.0`版本,官方PyPI仓库最新版本更新为为1.25.0,内容与1.24.4相同。MPE被拆分出PettingZoo, **警告可忽略**,`MPE2`详情可见:https://github.com/Farama-Foundation/MPE2 121 | ```bash 122 | pip install pettingzoo==1.25.0 123 | ``` 124 | 125 | 4. 
~~安装PettingZoo 1.24.4版本~~ 126 | ```bash 127 | # 重要说明:本项目需要PettingZoo 1.24.4版本,但官方PyPI仓库最新版本仅为1.24.3 128 | # 必须从GitHub源码安装才能获取1.24.4版本,安装命令为: 129 | # pip install "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git" 130 | # 或者,您可以直接运行提供的安装脚本: 131 | # python utils/setupPettingzoo.py 132 | ``` 133 | 134 | ### 🖥️ 运行配置 135 | > **注意:** 为简化使用,当前版本已不再依赖Visdom进行可视化,您可跳过下述visdom配置,但保留相关配置供需要时参考。 136 | 137 | ```bash 138 | # 启动Visdom可视化服务器(新终端) 139 | python -m visdom.server 140 | # 或指定端口 141 | python -m visdom.server -port 8097 142 | 143 | # 访问训练仪表盘: 144 | # http://localhost:8097 145 | ``` 146 | 147 | ## 🔄 训练流程 148 | 1. **参数配置** 149 | 在 [`main_parameter.py`](main_parameters.py) 中设置环境参数: 150 | ```python 151 | env_name = 'simple_tag_v3' # 可选:simple_adversary_v3/simple_spread_v3 152 | episode_num = 5000 # 总训练回合数 153 | # 训练参数 154 | batch_size = 128 # 经验回放批次大小 155 | actor_lr = 0.01 # Actor网络学习率 156 | critic_lr = 0.01 # Critic网络学习率 157 | ``` 158 | 159 | 2. **启动Visdom服务器** 160 | ```bash 161 | # 在单独的终端中启动Visdom可视化服务器 162 | python -m visdom.server 163 | # 或指定端口 164 | python -m visdom.server -port 8097 165 | 166 | # 访问训练仪表盘: 167 | # http://localhost:8097 168 | ``` 169 | 3. **运行训练脚本** 170 | ```bash 171 | # 使用默认参数训练 172 | python main_train.py 173 | ``` 174 | 4. **在 `http://localhost:8097` 监控训练进度** 175 | 176 | 5. **评估训练模型** 177 | ```bash 178 | # 渲染训练好的模型策略 179 | python main_evaluate.py 180 | ``` 181 | 182 | ### 🌐 环境定制 183 | [`envs/simple_tag_env.py`](envs/simple_tag_env.py) 扩展了PettingZoo的MPE环境: 184 | - 在 [`envs/custom_agents_dynamics.py`](envs/custom_agents_dynamics.py) 中自定义智能体动力学 185 | - 修改的奖励函数,专为Predator-Prey任务优化 186 | - 可调节的智能体物理参数: 187 | - 世界大小:2.5单位(可根据追逃需求自定义) 188 | - 时间步长:0.1秒(影响动作响应速度) 189 | - 阻尼系数:0.2(影响智能体的惯性) 190 | - 碰撞参数: 191 | - 接触力:1e2(控制碰撞强度,影响围捕效果) 192 | - 接触边界:1e-3(控制碰撞柔软度) 193 | 194 | #### 🔄 自定义追逃场景 195 | 您可以轻松配置自己的追逃环境: 196 | - 自定义Predator数量、速度和加速度 197 | - 配置Evader的逃跑策略和敏捷度 198 | - 设计围捕奖励机制,鼓励协作或竞争行为 199 | - 实现复杂地形和障碍物(通过自定义碰撞处理) 200 | 201 | 202 | ## 📦 数据管理 203 | ### 模型存储 204 | 训练模型自动保存在: 205 | ```tree 206 | ./models/ 207 | └── maddpg_models/ # MADDPG检查点目录 208 | ├── {timestamp}_agent_0_actor.pth # Actor网络参数 209 | ├── {timestamp}_agent_0_critic.pth # Critic网络参数 210 | └── ... # 其他智能体网络 211 | ``` 212 | 213 | ### 可视化系统 214 | 训练指标可视化: 215 | ```tree 216 | plot/ 217 | ├── data/ # 序列化训练指标 218 | │ └── plot_data_20240515.pkl # PyTorch张量存储 219 | └── plot_rewards.py # 可视化工具 220 | ``` 221 | 222 | ### 日志系统 223 | 实现于 [`logger.py`](utils/logger.py): 224 | - 记录训练元数据(设备、时长) 225 | - 序列化超参数 226 | - 生成训练报告 227 | 228 | ```tree 229 | logs/ 230 | ├── training_log.json # 可读训练报告 231 | └── plot_data_20240515.pkl # 原始指标数据 232 | ``` 233 | 234 | 235 | ## 🐛 已知问题与解决方案 236 | 我们整理了一份详细的已知问题及其解决方案文档,包括: 237 | - **Windows系统渲染无响应问题**:修复PettingZoo的渲染问题 238 | - **PettingZoo版本兼容性问题**:本项目需要1.24.4版本 239 | - **Visdom服务器连接问题**:确保可视化服务正常运行 240 | - **奖励函数修改**:官方的奖励配置无法训练出好的效果,需要修改追捕者的奖励函数 241 | 👉 **[点击查看完整的已知问题与解决方案文档](KNOWN_ISSUES.md)** 242 | 243 | 如果您遇到文档中未提及的问题,请在Issues中提交,我们将尽快解决。 244 | 245 | ## 🤝 贡献 246 | 本项目的主要贡献在于: 247 | - 针对Predator-Prey追逃博弈任务的环境适配与优化 248 | - 改进的奖励函数设计,解决官方环境训练效果不佳的问题 249 | - 灵活的围捕控制参数配置,支持多种追逃场景 250 | 251 | 如遇到任何问题,欢迎提交Issue或Pull Request。若您有兴趣扩展更多追逃博弈场景,欢迎您的贡献! 
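
### 📎 附:环境物理参数调整示例

下面是一个最小示意(假设性用法,仅供参考),基于本仓库 `envs/custom_agents_dynamics.py` 中的 `CustomWorld` 类,演示如何调整上文「🌐 环境定制」一节列出的世界大小、时间步长、阻尼与碰撞参数;实际接入方式以 `envs/simple_tag_env.py` 为准:

```python
# 假设性示例:手动构造 CustomWorld 并修改物理参数(真实训练流程中由 simple_tag_env.py 构建环境)
from envs.custom_agents_dynamics import CustomWorld

world = CustomWorld(world_size=2.5)   # 世界大小:2.5单位,可按追逃需求调整
world.dt = 0.1                        # 时间步长:影响动作响应速度
world.damping = 0.2                   # 阻尼系数:影响智能体惯性
world.contact_force = 1e2             # 接触力:控制碰撞强度,影响围捕效果
world.contact_margin = 1e-3           # 接触边界:控制碰撞柔软度
```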
252 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/Centralized/readme.md: -------------------------------------------------------------------------------- 1 | # Centralized RL 实现 2 | 3 | ## 算法特点 4 | - 集中式训练和执行 5 | - 将多智能体系统视为单一控制问题 6 | - 考虑所有智能体的全局状态和联合动作 7 | 8 | ## 核心组件 9 | - `CentralizedRL.py`: 集中式学习算法的主要实现 10 | - `DDPG_agent.py`: 改进的 DDPG 算法 11 | - `NN_actor.py`: 集中式 Actor 网络 12 | - `NN_critic.py`: 集中式 Critic 网络 13 | 14 | ## 优缺点 15 | 优点: 16 | - 能获得理论上的最优策略 17 | - 完整利用全局信息 18 | 19 | 缺点: 20 | - 状态空间和动作空间随智能体数量指数增长 21 | - 实际部署时需要集中式控制 22 | 23 | 24 | | 2025.2.18 updated. 25 |
26 | 1. 共享reward函数:所有智能体使用同一个奖励函数。 27 | 2. 定义:所有智能体共享同一个全局网络,通过智能体ID或角色标识区分。
28 | 输入输出:
29 | Actor:接收自身观测+智能体ID,输出动作。(待核实)
30 | Critic:接收全局状态+所有动作+智能体ID,输出Q值。(待核实)
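
下面给出一个示意性草图(假设性代码,并非本仓库已实现的模块),用于说明上述「Critic:接收全局状态+所有动作+智能体ID」这一设计中输入的组织方式;其中智能体ID按one-hot拼接属于本草图的假设,网络结构沿用仓库中常用的64维双隐层MLP:

```python
# 示意草图(假设性代码):集中式Critic,智能体ID以one-hot形式拼入输入
import torch
import torch.nn as nn

class CentralizedCritic(nn.Module):
    """输入:全局状态 + 所有智能体动作 + 当前智能体ID(one-hot),输出:该智能体的Q值"""
    def __init__(self, global_state_dim, joint_action_dim, n_agents, hidden_dim=64):
        super().__init__()
        in_dim = global_state_dim + joint_action_dim + n_agents  # ID 的 one-hot 维度等于智能体数量
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, global_state, joint_action, agent_onehot):
        x = torch.cat([global_state, joint_action, agent_onehot], dim=-1)
        return self.net(x)  # 形状: [batch, 1]
```

Actor 部分同理:将自身观测与智能体ID拼接后送入共享网络输出动作即可,具体实现仍待核实。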
-------------------------------------------------------------------------------- /MADDPG_Continous/agents/Independent/readme.md: -------------------------------------------------------------------------------- 1 | # Independent RL 实现 2 | 3 | ## 算法特点 4 | - 每个智能体独立学习和决策 5 | - 将多智能体问题转化为多个单智能体问题 6 | - 不考虑其他智能体的行为和策略 7 | 8 | ## 核心组件 9 | - `IndependentRL.py`: 独立学习算法的主要实现 10 | - `DDPG_agent.py`: 单智能体 DDPG 算法 11 | - `NN_actor.py`: Actor 网络结构 12 | - `NN_critic.py`: Critic 网络结构 13 | 14 | ## 优缺点 15 | 优点: 16 | - 实现简单,训练稳定 17 | - 易于并行化 18 | 19 | 缺点: 20 | - 忽略智能体间的交互 21 | - 难以学习协作行为 22 | 23 | 24 | | 2025.2.18 updated. 25 |
26 | 1. reward独立. 27 | 2. 智能体独自决策,没有信息共享。 action = actor(obs); Q = critic(obs, action). (应该没错) -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/DDPG_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | from torch.optim import Adam 9 | from agents.maddpg.NN_actor import MLPNetworkActor 10 | from agents.maddpg.NN_critic import MLPNetworkCritic 11 | 12 | class DDPG(): 13 | def __init__(self, obs_dim, act_dim, global_obs_dim, actor_lr, critic_lr, device, action_bound, chkpt_dir, chkpt_name): 14 | self.actor = MLPNetworkActor(in_dim=obs_dim, out_dim=act_dim, hidden_dim = 64, action_bound=action_bound, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'actor.pth')).to(device) 15 | self.critic = MLPNetworkCritic(in_dim=global_obs_dim, out_dim=1, hidden_dim = 64, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'critic.pth')).to(device) 16 | #优化器 17 | self.actor_optimizer = Adam(self.actor.parameters(), lr = actor_lr) 18 | self.critic_optimizer = Adam(self.critic.parameters(), lr = critic_lr) 19 | # 创建相对于的target网络 20 | """ 21 | 使用 deepcopy 创建 target 网络是一个更好的选择,原因如下: 22 | 初始化一致性: 23 | - deepcopy 确保 target 网络和原网络完全相同的初始参数 24 | - 重新创建网络可能因为随机初始化导致参数不一致 25 | """ 26 | self.target_actor = deepcopy(self.actor) 27 | self.target_critic = deepcopy(self.critic) 28 | 29 | def action(self, obs, model_out = False): 30 | # 其中没有用到logi, 接受其返回值第二项为 '_' 具体地: a, _ = self.agents[agent].action(o) 31 | action, logi = self.actor(obs) 32 | return action, logi 33 | 34 | def target_action(self,obs): 35 | action, logi = self.target_actor(obs) 36 | return action, logi 37 | 38 | def critic_value(self, state_list: List[Tensor], act_list: List[Tensor]): # 包含Tensor对象的列表 39 | x = torch.cat(state_list + act_list, 1) 40 | return self.critic(x).squeeze(1) # tensor with a given length 41 | 42 | def target_critic_value(self, state_list: List[Tensor], act_list: List[Tensor]): 43 | x = torch.cat(state_list + act_list, 1) 44 | return self.target_critic(x).squeeze(1) # tensor with a given length 45 | 46 | def update_actor(self, loss): 47 | self.actor_optimizer.zero_grad() 48 | loss.backward() 49 | nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) # clip_grad_norm_ :带有下划线后缀,表示这是一个就地操作,会直接修改传入的参数梯度。 50 | self.actor_optimizer.step() 51 | 52 | def update_critic(self, loss): 53 | self.critic_optimizer.zero_grad() 54 | loss.backward() 55 | nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) # clip_grad_norm_ :带有下划线后缀,表示这是一个就地操作,会直接修改传入的参数梯度。 56 | self.critic_optimizer.step() 57 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/MADDPG_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from agents.maddpg.DDPG_agent import DDPG 7 | from agents.maddpg.buffer import BUFFER 8 | 9 | class MADDPG(): 10 | # device = 'cpu' 11 | # device = 'cuda' if torch.cuda.is_available() else 'cpu' 12 | 13 | def __init__(self, dim_info, capacity, batch_size, actor_lr, critic_lr, action_bound, _chkpt_dir, _device = 'cpu', _model_timestamp = None): 14 | # 确保模型保存路径存在 15 | if _chkpt_dir is not None: 16 | os.makedirs(_chkpt_dir, exist_ok=True) 17 | 18 | self.device = _device 19 | self.model_timestamp = 
_model_timestamp 20 | # 状态(全局观测)与所有智能体动作维度的和 即critic网络的输入维度 dim_info = [obs_dim, act_dim] 21 | global_obs_act_dim = sum(sum(val) for val in dim_info.values()) 22 | # 创建智能体与buffer,每个智能体有自己的buffer, actor, critic 23 | self.agents = {} 24 | self.buffers = {} 25 | for agent_id, (obs_dim, act_dim) in dim_info.items(): 26 | # print("dim_info -> agent_id:",agent_id) 27 | # 每一个智能体都是一个DDPG智能体 28 | 29 | self.agents[agent_id] = DDPG(obs_dim, act_dim, global_obs_act_dim, actor_lr, critic_lr, self.device, action_bound[agent_id], chkpt_name = (agent_id + '_'), chkpt_dir = _chkpt_dir) 30 | # buffer均只是存储自己的观测与动作 31 | self.buffers[agent_id] = BUFFER(capacity, obs_dim, act_dim, self.device) 32 | self.dim_info = dim_info 33 | self.batch_size = batch_size 34 | 35 | def add(self, obs, action, reward, next_obs, done): 36 | #NOTE that the experience is a dict with agent name as its key 37 | for agent_id in obs.keys(): 38 | o = obs[agent_id] 39 | a = action[agent_id] 40 | if isinstance(a, int): #返回值为True or False, 判断a是否为int类型,是,返回True。 41 | # the action from env.action_space.sample() is int, we have to convert it to onehot 42 | a = np.eye(self.dim_info[agent_id][1])[a] 43 | r = reward[agent_id] 44 | next_o = next_obs[agent_id] 45 | d = done[agent_id] 46 | self.buffers[agent_id].add(o, a, r, next_o, d) 47 | 48 | def sample(self, batch_size): 49 | """sample experience from all the agents' buffers, and collect data for network input""" 50 | # get the total num of transitions, these buffers should have same number of transitions 51 | total_num = len(self.buffers['agent_0']) 52 | indices = np.random.choice(total_num, size = batch_size, replace = False) 53 | # NOTE that in MADDPG, we need the obs and actions of all agents 54 | # but only the reward and done of the current agent is needed in the calculation 55 | obs, act, reward, next_obs, done, next_act = {}, {}, {}, {}, {}, {} 56 | for agent_id, buffer in self.buffers.items(): 57 | o, a, r, n_o, d = buffer.sample(indices) 58 | obs[agent_id] = o 59 | act[agent_id] = a 60 | reward[agent_id] = r 61 | next_obs[agent_id] = n_o 62 | done[agent_id] = d 63 | # calculate next_action using target_network and next_state 64 | next_act[agent_id], _ = self.agents[agent_id].target_action(n_o) 65 | 66 | return obs, act, reward, next_obs, done, next_act 67 | 68 | def select_action(self, obs): 69 | action = {} 70 | for agent, o in obs.items(): 71 | o = torch.from_numpy(o).unsqueeze(0).float().to(self.device) 72 | a, _ = self.agents[agent].action(o) # torch.Size([1, action_size]) #action函数: action, logi = self.actor(obs) 73 | # NOTE that the output is a tensor, convert it to int before input to the environment 74 | action[agent] = a.squeeze(0).detach().cpu().numpy() 75 | return action 76 | # 更多解释-飞书链接:https://m6tsmtxj3r.feishu.cn/docx/Kb1vdqvBholiIUxcvYxcIcBcnEg?from=from_copylink 密码:6u2257#8 77 | def learn(self, batch_size, gamma): 78 | for agent_id, agent in self.agents.items(): 79 | obs, act, reward, next_obs, done, next_act = self.sample(batch_size) 80 | # upate critic 81 | critic_value = agent.critic_value( list(obs.values()), list(act.values()) ) 82 | 83 | next_target_critic_value = agent.target_critic_value(list(next_obs.values()), 84 | list(next_act.values())) 85 | target_value = reward[agent_id] + gamma * next_target_critic_value* (1-done[agent_id]) 86 | critic_loss = F.mse_loss(critic_value, target_value.detach(), reduction = 'mean') 87 | agent.update_critic(critic_loss) 88 | 89 | #update actor 90 | action, logits = agent.action(obs[agent_id], model_out = True) 91 | 
act[agent_id] = action 92 | actor_loss = - agent.critic_value( list(obs.values()), list(act.values()) ).mean() 93 | actor_loss_pse = torch.pow(logits, 2).mean() #这个是干嘛的? 94 | agent.update_actor(actor_loss + 1e-3 *actor_loss_pse) 95 | 96 | def update_target(self, tau): # 嵌套函数定义 97 | def soft_update(from_network, to_network): 98 | """ copy the parameters of `from_network` to `to_network` with a proportion of tau """ 99 | for from_p, to_p in zip(from_network.parameters(), to_network.parameters()): 100 | to_p.data.copy_(tau * from_p.data + (1.0 - tau) * to_p.data) 101 | 102 | for agent in self.agents.values(): 103 | soft_update(agent.actor, agent.target_actor) #体现使用嵌套函数的作用! 易于维护和使用 104 | soft_update(agent.critic, agent.target_critic) 105 | 106 | @classmethod 107 | def load( cls, dim_info, file): 108 | """ init maddpg using the model saved in `file` """ 109 | instance = cls(dim_info, 0, 0, 0, 0, os.path.dirname(file)) 110 | data = torch.load(file, map_location=instance.device) 111 | for agent_id, agent in instance.agents.items(): 112 | agent.actor.load_state_dict(data[agent_id]) 113 | return instance 114 | 115 | def save_model(self): 116 | for agent_id in self.dim_info.keys(): 117 | self.agents[agent_id].actor.save_checkpoint(is_target = False, timestamp = True) 118 | self.agents[agent_id].target_actor.save_checkpoint(is_target = True, timestamp = True) 119 | self.agents[agent_id].critic.save_checkpoint(is_target = False, timestamp = True) 120 | self.agents[agent_id].target_critic.save_checkpoint(is_target = True, timestamp = True) 121 | 122 | agent_id = list(self.dim_info.keys())[0] # 获取第一个代理的 ID 123 | agent = self.agents[agent_id] 124 | for name, param in agent.actor.state_dict().items(): 125 | # 仅打印前几个值(例如前5个) 126 | print(f"Layer: {name}, Shape: {param.shape}, Values: {param.flatten()[:5]}") # flatten() 展开参数为一维数组 127 | 128 | 129 | def load_model(self): 130 | for agent_id in self.dim_info.keys(): 131 | self.agents[agent_id].actor.load_checkpoint(device = self.device, is_target = False, timestamp = self.model_timestamp) 132 | self.agents[agent_id].target_actor.load_checkpoint(device = self.device, is_target = True, timestamp = self.model_timestamp) 133 | self.agents[agent_id].critic.load_checkpoint(device = self.device, is_target = False, timestamp = self.model_timestamp) 134 | self.agents[agent_id].target_critic.load_checkpoint(device = self.device, is_target = True, timestamp = self.model_timestamp) 135 | 136 | agent_id = list(self.dim_info.keys())[0] # 获取第一个代理的 ID 137 | agent = self.agents[agent_id] 138 | for name, param in agent.actor.state_dict().items(): 139 | # 仅打印前几个值(例如前5个) 140 | print(f"Layer: {name}, Shape: {param.shape}, Values: {param.flatten()[:5]}") # flatten() 展开参数为一维数组 141 | 142 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/NN_actor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | 7 | class MLPNetworkActor(nn.Module): 8 | def __init__(self,chkpt_name, chkpt_dir, in_dim, out_dim, action_bound, hidden_dim = 64, non_linear = nn.ReLU()): 9 | super(MLPNetworkActor, self).__init__() 10 | self.chkpt_dir = chkpt_dir 11 | self.chkpt_name = chkpt_name 12 | 13 | # different ,为什么要保持这两个信息? 
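        # 补充说明:action_bound 会在 forward() 中把 tanh 输出线性缩放到 [a_min, a_max] 的动作区间;
        # out_dim 在此处仅作记录,当前文件内后续并未再使用。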
14 | self.out_dim = out_dim 15 | self.action_bound = action_bound 16 | 17 | self.net = nn.Sequential( 18 | nn.Linear(in_dim, hidden_dim), 19 | non_linear, 20 | nn.Linear(hidden_dim, hidden_dim), 21 | non_linear, 22 | nn.Linear(hidden_dim, out_dim), 23 | ).apply(self.init) 24 | 25 | @staticmethod 26 | def init(m): 27 | '''init patameters of the module''' 28 | gain = nn.init.calculate_gain('relu') 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 31 | m.bias.data.fill_(0.01) 32 | 33 | def forward(self, x): 34 | x = self.net(x) 35 | logi = x 36 | a_min = self.action_bound[0] 37 | a_max = self.action_bound[1] 38 | ''' 这三行为什么要这么处理? 引入了bias项干嘛''' 39 | k = torch.tensor( (a_max - a_min) /2 , device=x.device ) 40 | bias = torch.tensor( (a_max + a_min) /2, device=x.device ) 41 | action = k * torch.tanh(x) + bias 42 | return action, logi 43 | 44 | def save_checkpoint(self, is_target=False, timestamp = False): 45 | # 使用时间戳保存功能 46 | if timestamp is True: 47 | # 使用时间戳创建新文件夹 48 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 49 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 50 | else: 51 | # 直接保存在主目录下,不使用时间戳 52 | save_dir = self.chkpt_dir 53 | 54 | # 确保目录存在 55 | os.makedirs(save_dir, exist_ok=True) 56 | 57 | # 创建保存路径 58 | self.chkpt_file = os.path.join(save_dir, self.chkpt_name) 59 | 60 | if is_target: 61 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 62 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 63 | torch.save(self.state_dict(), target_chkpt_name) 64 | else: 65 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 66 | torch.save(self.state_dict(), self.chkpt_file) 67 | 68 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): # 默认加载target 69 | if timestamp and isinstance(timestamp, str): 70 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 71 | load_dir = os.path.join(self.chkpt_dir, timestamp) 72 | else: 73 | # 否则从主目录加载 74 | load_dir = self.chkpt_dir 75 | 76 | # 使用os.path.join确保路径分隔符的一致性 77 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 78 | 79 | if is_target: 80 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 81 | # 确保路径存在 82 | if not os.path.exists(target_chkpt_name): 83 | print(f"警告: 找不到目标模型文件: {target_chkpt_name}") 84 | return 85 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 86 | else: 87 | # 确保路径存在 88 | if not os.path.exists(self.chkpt_file): 89 | print(f"警告: 找不到模型文件: {self.chkpt_file}") 90 | return 91 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/NN_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | """ 7 | self.target_critic = CriticNetwork(*, *, 8 | chkpt_dir=chkpt_dir, 9 | name=self.agent_name+'_target_critic') 10 | """ 11 | class MLPNetworkCritic(nn.Module): 12 | def __init__(self, chkpt_name, chkpt_dir, in_dim, out_dim, hidden_dim = 64, non_linear = nn.ReLU()): 13 | super(MLPNetworkCritic, self).__init__() 14 | self.chkpt_dir = chkpt_dir 15 | self.chkpt_name = chkpt_name 16 | 17 | self.net = nn.Sequential( 18 | nn.Linear(in_dim, hidden_dim), 19 | non_linear, 20 | nn.Linear(hidden_dim, hidden_dim), 21 | non_linear, 22 | 
nn.Linear(hidden_dim, out_dim), 23 | ).apply(self.init) 24 | 25 | @staticmethod 26 | def init(m): 27 | '''init patameters of the module''' 28 | gain = nn.init.calculate_gain('relu') 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 31 | m.bias.data.fill_(0.01) 32 | 33 | def forward(self, x): 34 | return self.net(x) 35 | 36 | def save_checkpoint(self, is_target = False, timestamp = False): 37 | if timestamp is True: 38 | # 使用时间戳创建新文件夹 39 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 40 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 41 | else: 42 | # 直接保存在主目录下 43 | save_dir = self.chkpt_dir 44 | 45 | # 确保目录存在 46 | os.makedirs(save_dir, exist_ok=True) 47 | 48 | self.chkpt_file = os.path.join(save_dir, self.chkpt_name) 49 | 50 | if is_target: 51 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 52 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 53 | torch.save(self.state_dict(), target_chkpt_name) 54 | else: 55 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 56 | torch.save(self.state_dict(), self.chkpt_file) 57 | 58 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): 59 | if timestamp and isinstance(timestamp, str): 60 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 61 | load_dir = os.path.join(self.chkpt_dir, timestamp) 62 | else: 63 | # 否则从主目录加载 64 | load_dir = self.chkpt_dir 65 | 66 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 67 | 68 | if is_target: 69 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 70 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 71 | else: 72 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) 73 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class BUFFER(): 5 | 6 | def __init__(self,capacity, obs_dim, act_dim, device): 7 | self.capacity = capacity 8 | self.obs = np.zeros((capacity, obs_dim)) 9 | self.action = np.zeros((capacity, act_dim)) 10 | self.reward = np.zeros(capacity) 11 | self.next_obs = np.zeros((capacity, obs_dim)) 12 | self.done = np.zeros(capacity, dtype = bool) 13 | self._index = 0 14 | self._size = 0 15 | self.device = device 16 | 17 | def add(self,obs, action, reward, next_obs, done): 18 | self.obs[self._index] = obs 19 | self.action[self._index] = action 20 | self.reward[self._index] = reward 21 | self.next_obs[self._index] = next_obs 22 | self.done[self._index] = done 23 | 24 | self._index = (self._index +1) % self.capacity 25 | if self._size < self.capacity: 26 | self._size += 1 27 | 28 | 29 | def sample(self, indices): 30 | obs = self.obs[indices] 31 | action = self.action[indices] 32 | reward = self.reward[indices] 33 | next_obs = self.next_obs[indices] 34 | done = self.done[indices] 35 | 36 | obs = torch.from_numpy(obs).float().to(self.device) # torch.Size([batch_size, state_dim]) 37 | action = torch.from_numpy(action).float().to(self.device) # torch.Size([batch_size, action_dim]) 38 | reward = torch.from_numpy(reward).float().to(self.device) # just a tensor with length: batch_size 39 | # reward = (reward - reward.mean()) / (reward.std() + 1e-7) 40 | next_obs = torch.from_numpy(next_obs).float().to(self.device) # Size([batch_size, state_dim]) 41 | done = 
torch.from_numpy(done).float().to(self.device) # just a tensor with length: batch_size 42 | 43 | return obs, action, reward, next_obs, done 44 | 45 | def __len__(self): #保留方法 46 | return self._size 47 | -------------------------------------------------------------------------------- /MADDPG_Continous/agents/maddpg/readme.md: -------------------------------------------------------------------------------- 1 | 2025.2.18 2 | 3 | TODO: 4 | 5 | 1. 经典MADDPG:Critic是集中式的(全局输入),但每个智能体可独立更新Critic。 (需找经典代码)~ 6 | 7 | 8 | # 一、独立Critic网络的核心意义 9 | 1. 异构奖励函数支持 10 | 竞争场景:若智能体间存在利益冲突(如对抗游戏),每个Critic需学习不同的Q函数以反映各自奖励目标。 11 | 例如:足球游戏中,进攻方Critic需评估射门收益,防守方Critic需评估拦截收益。 12 | 混合协作场景:部分智能体可能有辅助性奖励(如无人机编队中的领航者与跟随者)。 13 | 14 | 2. 策略独立性 15 | 策略空间差异:即使输入相同,不同智能体的Actor网络输出动作分布不同,Critic需独立评估各自策略的全局影响。 16 | 非对称学习速率:独立Critic允许智能体以不同速度学习,避免共享网络导致的策略耦合震荡。 17 | 3. 实现灵活性 18 | 扩展性:支持未来扩展至异构观测/动作空间(如部分智能体为连续控制,其他为离散决策)。 19 | 调试便利:独立网络便于单独监控和调整特定智能体的学习过程。 20 | 21 | # 二、输入相同时的Critic差异性来源 22 | 即使Critic输入相同(所有Agent的obs+actions),以下因素仍会导致各Critic输出不同: 23 | 24 | 1. 网络参数独立性 25 | 初始随机化:独立网络参数初始值不同,导致梯度更新路径分化。 26 | 优化过程差异:不同Critic的优化器状态(如动量)独立积累。 27 | 2. 目标Q值差异 28 | 奖励函数不同:若 r_i ≠ r_j,目标Q值 target_q = r_i + γQ' 直接不同。 29 | 下一状态动作差异:不同智能体的目标Actor生成的动作策略不同(如进攻者选择突破,防守者选择拦截)。 30 | 3. 环境动力学影响 31 | 状态转移差异:不同智能体对环境的改变方式不同(如机器人推箱子任务中,不同推法导致不同后续状态)。 32 | # 三、独立Critic的代价与优化 33 | 1. 计算开销分析 34 | 训练速度:独立Critic的并行计算可通过GPU批处理缓解,实际影响有限。 35 | 内存占用:网络参数数量与智能体数量线性增长,可通过网络结构简化(如共享隐层)优化。 36 | 2. 优化策略 37 | 参数共享试探:在同构完全协作场景中,可尝试同类智能体共享Critic。 38 | ``` 39 | { 40 | # 示例:追击者共享Critic 41 | class SharedCritic(nn.Module): 42 | def __init__(self): 43 | super().__init__() 44 | self.fc1 = nn.Linear(global_input_dim, 64) 45 | } 46 | ``` 47 | 48 | # 初始化时分配共享实例 49 | chaser_critic = SharedCritic() 50 | for agent in chaser_agents: 51 | agent.critic = chaser_critic} 52 | # 初始化时分配共享实例 53 | chaser_critic = SharedCritic() 54 | for agent in chaser_agents: 55 | agent.critic = chaser_critic 56 | 分布式训练:利用多GPU或Ray框架实现并行更新。 57 | # 四、场景驱动的设计选择 58 | 59 | |场景类型|推荐架构|理由| 60 | |---|---|---| 61 | 完全协作+同构| 共享Critic(同类智能体) |减少冗余计算,利用环境对称性
62 | 竞争/混合奖励| 独立Critic| 反映不同奖励函数和策略目标 63 | 异构观测/动作空间| 独立Critic| 适应不同输入输出维度 64 | 初步算法验证| 独立Critic| 实现简单,避免共享逻辑复杂性 65 | 66 | # 五、代码实现对比解析 67 | ### 用户代码1(混合MADDPG/DDPG) 68 | https://github.com/shariqiqbal2810/maddpg-pytorch/blob/master/algorithms/maddpg.py
69 | 1. Critic输入:
70 | - MADDPG模式:全局obs+actions → 输入相同但Critic独立。
71 | - DDPG模式:仅自身obs+action → 输入不同。
72 | 2. 设计意图:兼容独立训练(DDPG)与协作训练(MADDPG),牺牲效率换取灵活性。 73 | ### 用户代码2(标准MADDPG) 74 | https://github.com/starry-sky6688/MADDPG/blob/master/maddpg/maddpg.py 75 | 1. Critic输入:强制全局obs+actions → 输入相同但Critic独立。 76 | 2. 设计意图:严格遵循CTDE范式,适合同构协作场景,扩展性较弱但结构清晰。 77 | 78 | # 六、总结 79 | 1. 必要性:独立Critic是处理异构奖励、策略差异和环境非平稳性的核心设计,即使输入相同,各Critic仍需独立更新以捕捉不同策略的全局影响。 80 | 2. 效率权衡:通过参数共享试探和分布式训练可缓解计算开销,但在多数复杂场景中,独立Critic的收益远大于其成本。 81 | 3. 实践建议:优先采用独立Critic实现,待任务明确后针对性优化(如同类共享)。 -------------------------------------------------------------------------------- /MADDPG_Continous/envs/custom_agents_dynamics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 该文件定义了自定义的环境,用于测试自定义的智能体动力学模型 3 | 4 | 继承自core.py 5 | 6 | """ 7 | import numpy as np 8 | from pettingzoo.mpe._mpe_utils.core import EntityState, AgentState, Action, Entity, Landmark, Agent 9 | from pettingzoo.mpe._mpe_utils.core import World 10 | 11 | class CustomWorld(World): 12 | def __init__(self, world_size = 2.5 ): # 13 | super().__init__() # 调用父类的构造函数 14 | self.world_size = world_size # Ronchy 添加世界大小 15 | self.dt = 0.1 # 时间步长 16 | self.damping = 0.2 # 阻尼系数 17 | # contact response parameters 18 | self.contact_force = 1e2 # 控制碰撞强度(默认1e2,值越大反弹越强) 19 | self.contact_margin = 1e-3 # 控制碰撞"柔软度"(默认1e-3,值越小越接近刚体) 20 | """ 21 | 常见问题示例 22 | 实体重叠穿透 contact_force太小 增大contact_force至1e3或更高 23 | 碰撞后震荡 damping太低 增大阻尼系数(如0.5) 24 | 微小距离抖动 contact_margin不合理 调整到1e-2~1e-4之间 25 | """ 26 | """ 27 | 重载底层动力学逻辑 28 | 主要是integrate_state()函数 29 | """ 30 | def step(self): 31 | # set actions for scripted agents 32 | # print("Using world -> step()") # 重载成功! 33 | for agent in self.scripted_agents: 34 | agent.action = agent.action_callback(agent, self) 35 | # gather forces applied to entities 36 | p_force = [None] * len(self.entities) 37 | # apply agent physical controls 38 | p_force = self.apply_action_force(p_force) # 加入噪声 39 | # apply environment forces 40 | p_force = self.apply_environment_force(p_force) # 碰撞力计算 collide为True时 41 | # integrate physical state 42 | self.integrate_state(p_force) # 动力学逻辑 43 | # update agent state 44 | for agent in self.agents: 45 | self.update_agent_state(agent) # 更新 communication action 后的状态 46 | 47 | # integrate physical state 48 | #函数功能:动力学逻辑。更新实体的位置和速度 49 | def integrate_state(self, p_force): 50 | for i, entity in enumerate(self.entities): 51 | if not entity.movable: 52 | continue 53 | # 速度阻尼衰减 54 | entity.state.p_vel *= (1 - self.damping) # 正确应用阻尼 55 | # 动力学 -> 运动学 56 | if p_force[i] is not None: 57 | acceleration = p_force[i] / entity.mass # F = ma 58 | entity.state.p_vel += acceleration * self.dt # v = v_0 + a * t 59 | # 更新位置 60 | entity.state.p_pos += entity.state.p_vel * self.dt # 更新位置 61 | # 限制位置在世界大小范围内 62 | # entity.state.p_pos = np.clip(entity.state.p_pos, -self.world_size, self.world_size) # Ronchy 添加世界大小限制 63 | 64 | # 速度限幅 65 | if entity.max_speed is not None: 66 | ######## 67 | speed = np.sqrt( 68 | np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1]) 69 | ) 70 | if speed > entity.max_speed: 71 | entity.state.p_vel = ( 72 | entity.state.p_vel 73 | / np.sqrt( 74 | np.square(entity.state.p_vel[0]) 75 | + np.square(entity.state.p_vel[1]) 76 | ) 77 | * entity.max_speed 78 | ) 79 | ########可替换为下列代码 ,效果相同 80 | # speed = np.linalg.norm(entity.state.p_vel) # 计算向量模长 81 | # if speed > entity.max_speed: 82 | # entity.state.p_vel = entity.state.p_vel * (entity.max_speed / speed) # 向量缩放 83 | 84 | 85 | # get collision forces for any contact between two entities 86 | # TODO: 碰撞逻辑待细化 87 | def 
get_collision_force(self, entity_a, entity_b): 88 | if (not entity_a.collide) or (not entity_b.collide): 89 | return [None, None] # not a collider 90 | if entity_a is entity_b: 91 | return [None, None] # don't collide against itself 92 | # compute actual distance between entities 93 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 94 | dist = np.sqrt(np.sum(np.square(delta_pos))) #用norm更简洁 95 | # minimum allowable distance 96 | dist_min = entity_a.size + entity_b.size # 两个实体的半径之和 97 | # softmax penetration 98 | k = self.contact_margin 99 | penetration = np.logaddexp(0, -(dist - dist_min) / k) * k #渗透深度, 当 dist < dist_min 时产生虚拟渗透量 100 | force = self.contact_force * delta_pos / dist * penetration 101 | force_a = +force if entity_a.movable else None 102 | force_b = -force if entity_b.movable else None 103 | return [force_a, force_b] 104 | -------------------------------------------------------------------------------- /MADDPG_Continous/main_evaluate.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from main_parameters import main_parameters 3 | from utils.runner import RUNNER 4 | from agents.maddpg.MADDPG_agent import MADDPG 5 | import torch 6 | from envs import simple_tag_env 7 | import os 8 | 9 | def get_env(env_name, ep_len=50, render_mode = "None"): 10 | """create environment and get observation and action dimension of each agent in this environment""" 11 | new_env = None 12 | if env_name == 'simple_adversary_v3': 13 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 14 | if env_name == 'simple_spread_v3': 15 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 16 | if env_name == 'simple_tag_v3': 17 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 18 | # new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 19 | new_env.reset() 20 | _dim_info = {} 21 | action_bound = {} 22 | for agent_id in new_env.agents: 23 | print("agent_id:",agent_id) 24 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 25 | action_bound[agent_id] = [] #[low action, hign action] 26 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 27 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 28 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 29 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 30 | 31 | return new_env, _dim_info, action_bound 32 | 33 | 34 | 35 | if __name__ == '__main__': 36 | device ='cpu' 37 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 38 | print("Using device:",device) 39 | # 模型存储路径 40 | current_dir = os.path.dirname(os.path.abspath(__file__)) 41 | chkpt_dir = os.path.join(current_dir, 'models/maddpg_models/') 42 | # 加载模型的时间戳 43 | load_timestamp = "" # 请输入形如:2025-04-15_15-51 -> 时间戳位置models/maddpg_models/xxxx 44 | model_timestamp = None if load_timestamp == '' else load_timestamp 45 | # 定义参数 46 | args = main_parameters() 47 | args.render_mode = "human" 48 | 49 | # 创建环境 50 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode) 51 | # print(env, dim_info, action_bound) 52 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意 53 | agent 
= MADDPG(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, _chkpt_dir = chkpt_dir, _model_timestamp = model_timestamp) 54 | print("--- Loading models ---") 55 | agent.load_model() 56 | print('---- Evaluating ----') 57 | env.reset() 58 | runner = RUNNER(agent, env, args, device, mode = 'evaluate') 59 | runner.evaluate() # 使用evaluate方法 60 | print('---- Done! ----') 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /MADDPG_Continous/main_evaluate_save_render2gif.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from main_parameters import main_parameters 3 | from utils.runner import RUNNER 4 | from agents.maddpg.MADDPG_agent import MADDPG 5 | import torch 6 | from envs import simple_tag_env 7 | import os 8 | import numpy as np 9 | import imageio # 需要安装: pip install imageio 10 | 11 | 12 | def get_env(env_name, ep_len=50, render_mode = "None"): 13 | """create environment and get observation and action dimension of each agent in this environment""" 14 | new_env = None 15 | if env_name == 'simple_adversary_v3': 16 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 17 | if env_name == 'simple_spread_v3': 18 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 19 | if env_name == 'simple_tag_v3': 20 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 21 | # new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 22 | new_env.reset() 23 | _dim_info = {} 24 | action_bound = {} 25 | for agent_id in new_env.agents: 26 | print("agent_id:",agent_id) 27 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 28 | action_bound[agent_id] = [] #[low action, hign action] 29 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 30 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 31 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 32 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 33 | 34 | return new_env, _dim_info, action_bound 35 | 36 | # 修改RUNNER类以捕获帧 37 | class RecordingRunner(RUNNER): 38 | def evaluate(self): 39 | # 记录每个episode的和奖励 用于平滑,显示平滑奖励函数 40 | self.reward_sum_record = [] 41 | # 记录每个智能体在每个episode的奖励 42 | self.episode_rewards = {agent_id: np.zeros(self.par.episode_num) for agent_id in self.env.agents} 43 | # episode循环 44 | for episode in range(self.par.episode_num): 45 | step = 0 # 每回合step重置 46 | print(f"评估第 {episode + 1} 回合") 47 | # 初始化环境 返回初始状态 48 | obs, _ = self.env.reset() # 重置环境,开始新回合 49 | self.done = {agent_id: False for agent_id in self.env_agents} 50 | # 每个智能体当前episode的奖励 51 | agent_reward = {agent_id: 0 for agent_id in self.env.agents} 52 | 53 | # 捕获初始帧 54 | frame = self.env.render() 55 | if frame is not None: 56 | frames.append(frame) 57 | 58 | # 每个智能体与环境进行交互 59 | while self.env.agents: 60 | step += 1 61 | # 使用训练好的智能体选择动作 62 | action = self.agent.select_action(obs) 63 | # 执行动作 获得下一状态 奖励 终止情况 64 | next_obs, reward, terminated, truncated, info = self.env.step(action) 65 | 66 | # 捕获当前帧 67 | frame = self.env.render() 68 | if frame is not None: 69 | frames.append(frame) 70 | 71 | self.done = {agent_id: bool(terminated[agent_id] or 
truncated[agent_id]) for agent_id in self.env_agents} 72 | # 累积每个智能体的奖励 73 | for agent_id, r in reward.items(): 74 | agent_reward[agent_id] += r 75 | obs = next_obs 76 | if step % 10 == 0: 77 | print(f"Step {step}, obs: {obs}, action: {action}, reward: {reward}, done: {self.done}") 78 | sum_reward = sum(agent_reward.values()) 79 | self.reward_sum_record.append(sum_reward) 80 | 81 | if __name__ == '__main__': 82 | device ='cpu' 83 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 84 | print("Using device:",device) 85 | # 模型存储路径 86 | current_dir = os.path.dirname(os.path.abspath(__file__)) 87 | chkpt_dir = os.path.join(current_dir, 'models/maddpg_models/') 88 | # 加载模型的时间戳 89 | load_timestamp = "2025-02-19_16-38" 90 | model_timestamp = None if load_timestamp == '' else load_timestamp 91 | # 定义参数 92 | args = main_parameters() 93 | 94 | # 设置为rgb_array模式以便捕获帧 95 | args.render_mode = "rgb_array" # 修改为rgb_array以便捕获帧 96 | args.episode_num = 5 97 | 98 | # 创建环境 99 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode) 100 | # print(env, dim_info, action_bound) 101 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意 102 | agent = MADDPG(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, _chkpt_dir = chkpt_dir, _model_timestamp = model_timestamp) 103 | print("--- Loading models ---") 104 | agent.load_model() 105 | print('---- Evaluating and Recording ----') 106 | 107 | # 准备录制 108 | frames = [] 109 | # 使用修改后的Runner 110 | runner = RecordingRunner(agent, env, args, device, mode='evaluate') 111 | runner.evaluate() 112 | 113 | # 保存为GIF 114 | gif_path = os.path.join(current_dir, 'plot', f'{args.env_name}_demo.gif') 115 | print(f"正在保存GIF到: {gif_path}") 116 | imageio.mimsave(gif_path, frames, fps=10) 117 | 118 | print(f'---- 完成! 
GIF已保存到 {gif_path} ----') -------------------------------------------------------------------------------- /MADDPG_Continous/main_parameters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def main_parameters(): 4 | parser = argparse.ArgumentParser() 5 | ############################################ 选择环境 ############################################ 6 | parser.add_argument("--env_name", type =str, default = "simple_tag_v3", help = "name of the env", 7 | choices=['simple_adversary_v3', 'simple_spread_v3', 'simple_tag_v3', 'simple_tag_env']) 8 | parser.add_argument("--render_mode", type=str, default = "None", help = "None | human | rgb_array") 9 | parser.add_argument("--episode_num", type = int, default = 5) # 5000 10 | parser.add_argument("--episode_length", type = int, default = 500) #50 11 | parser.add_argument('--learn_interval', type=int, default=10, 12 | help='steps interval between learning time') 13 | parser.add_argument('--random_steps', type=int, default=500, help='random steps before the agent start to learn') # 2e3 14 | parser.add_argument('--tau', type=float, default=0.001, help='soft update parameter') 15 | parser.add_argument('--gamma', type=float, default=0.99, help='discount factor') 16 | parser.add_argument('--buffer_capacity', type=int, default=int(1e6), help='capacity of replay buffer') 17 | parser.add_argument('--batch_size', type=int, default=128, help='batch-size of replay buffer') 18 | parser.add_argument('--actor_lr', type=float, default=0.0002, help='learning rate of actor') # .00002 19 | parser.add_argument('--critic_lr', type=float, default=0.002, help='learning rate of critic') # .002 20 | # The parameters for the communication network 21 | # TODO 22 | parser.add_argument('--visdom', type=bool, default=False, help="Open the visdom") 23 | parser.add_argument('--size_win', type=int, default=200, help="Open the visdom") # 1000 24 | 25 | 26 | args = parser.parse_args() 27 | return args -------------------------------------------------------------------------------- /MADDPG_Continous/main_train.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from envs import simple_tag_env, custom_agents_dynamics 3 | 4 | from main_parameters import main_parameters 5 | from utils.runner import RUNNER 6 | 7 | from agents.maddpg.MADDPG_agent import MADDPG 8 | import torch 9 | import os 10 | 11 | import time 12 | from datetime import timedelta 13 | from utils.logger import TrainingLogger # 添加导入 14 | 15 | def get_env(env_name, ep_len=25, render_mode ="None"): 16 | """create environment and get observation and action dimension of each agent in this environment""" 17 | new_env = None 18 | if env_name == 'simple_adversary_v3': 19 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 20 | if env_name == 'simple_spread_v3': 21 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 22 | if env_name == 'simple_tag_v3': 23 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 24 | if env_name == 'simple_tag_env': 25 | new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 26 | new_env.reset() 27 | _dim_info = {} 28 | action_bound = {} 29 | for 
agent_id in new_env.agents: 30 | print("agent_id:",agent_id) 31 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 32 | action_bound[agent_id] = [] #[low action, hign action] 33 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 34 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 35 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 36 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 37 | print("_dim_info:",_dim_info) 38 | print("action_bound:",action_bound) 39 | return new_env, _dim_info, action_bound 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 45 | # device = torch.device('mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() 46 | # else 'cuda' if torch.cuda.is_available() else 'cpu') 47 | device = "cpu" 48 | print("Using device:",device) 49 | start_time = time.time() # 记录开始时间 50 | 51 | # 模型保存路径 52 | current_dir = os.path.dirname(os.path.abspath(__file__)) 53 | chkpt_dir = os.path.join(current_dir, 'models/maddpg_models/') 54 | # 定义参数 55 | args = main_parameters() 56 | # 创建环境 57 | print("Using Env's name",args.env_name) 58 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode) 59 | # print(env, dim_info, action_bound) 60 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意。 61 | agent = MADDPG(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, _chkpt_dir = chkpt_dir, _device = device) 62 | # 创建运行对象 63 | runner = RUNNER(agent, env, args, device, mode = 'train') 64 | # 开始训练 65 | runner.train() 66 | print("agent",agent) 67 | 68 | # 计算训练时间 69 | end_time = time.time() 70 | training_time = end_time - start_time 71 | # 转换为时分秒格式 72 | training_duration = str(timedelta(seconds=int(training_time))) 73 | print(f"\n===========训练完成!===========") 74 | print(f"训练设备: {device}") 75 | print(f"训练用时: {training_duration}") 76 | 77 | # 使用logger保存训练日志 78 | # 使用logger保存训练日志 79 | logger = TrainingLogger() 80 | current_time = logger.save_training_log(args, device, training_duration, runner) 81 | print(f"完成时间: {current_time}") 82 | 83 | print("--- saving trained models ---") 84 | agent.save_model() 85 | print("--- trained models saved ---") 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /MADDPG_Continous/plot/convert_gif_to_loop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import subprocess 6 | import argparse 7 | from pathlib import Path 8 | 9 | try: 10 | from PIL import Image 11 | PIL_AVAILABLE = True 12 | except ImportError: 13 | PIL_AVAILABLE = False 14 | 15 | def convert_gif_to_loop(input_path, output_path=None, backup=True): 16 | """ 17 | 将GIF转换为循环播放的GIF 18 | 19 | 参数: 20 | input_path: 输入GIF文件路径或包含GIF文件的目录 21 | output_path: 输出GIF文件路径或目录,默认为在原文件名后添加"_loop" 22 | backup: 是否备份原始文件 23 | """ 24 | # 检查是否安装了PIL 25 | if not PIL_AVAILABLE: 26 | print("错误: 未安装PIL/Pillow库。请使用以下命令安装:") 27 | print("pip install Pillow") 28 | return 29 | 30 | input_path = Path(input_path) 31 | 32 | # 检查输入路径是文件还是目录 33 | if input_path.is_file() and input_path.suffix.lower() == '.gif': 34 | gif_files = [input_path] 35 | elif input_path.is_dir(): 36 | gif_files = list(input_path.glob('*.gif')) 37 | else: 38 | print(f"错误: {input_path} 不是有效的GIF文件或目录") 39 | return 40 | 41 | if not gif_files: 42 | 
print(f"在 {input_path} 中没有找到GIF文件") 43 | return 44 | 45 | for gif_file in gif_files: 46 | # 确定输出文件路径 47 | if output_path is None: 48 | output_file = gif_file.parent / f"{gif_file.stem}_loop{gif_file.suffix}" 49 | elif Path(output_path).is_dir(): 50 | output_file = Path(output_path) / f"{gif_file.stem}_loop{gif_file.suffix}" 51 | else: 52 | output_file = Path(output_path) 53 | 54 | # 备份原始文件 55 | if backup and gif_file.exists(): 56 | backup_file = gif_file.parent / f"{gif_file.stem}_original{gif_file.suffix}" 57 | if not backup_file.exists(): 58 | print(f"备份 {gif_file} 到 {backup_file}") 59 | subprocess.run(['cp', str(gif_file), str(backup_file)]) 60 | 61 | # 使用PIL/Pillow转换GIF为循环播放 62 | print(f"转换 {gif_file} 为循环播放GIF: {output_file}") 63 | 64 | try: 65 | # 打开GIF文件 66 | img = Image.open(gif_file) 67 | 68 | # 提取所有帧 69 | frames = [] 70 | durations = [] 71 | 72 | try: 73 | while True: 74 | # 记录当前帧的持续时间 75 | durations.append(img.info.get('duration', 100)) # 默认100ms 76 | # 复制当前帧 77 | frames.append(img.copy()) 78 | # 尝试移动到下一帧 79 | img.seek(img.tell() + 1) 80 | except EOFError: 81 | pass # 到达文件末尾 82 | 83 | # 保存为循环播放的GIF 84 | if frames: 85 | frames[0].save( 86 | str(output_file), 87 | save_all=True, 88 | append_images=frames[1:], 89 | optimize=False, 90 | duration=durations, 91 | loop=0 # 0表示无限循环 92 | ) 93 | print(f"成功创建循环播放GIF: {output_file}") 94 | else: 95 | print(f"警告: {gif_file} 似乎不是有效的GIF动画") 96 | 97 | except Exception as e: 98 | print(f"处理 {gif_file} 时出错: {e}") 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser(description='将GIF转换为循环播放的GIF') 102 | parser.add_argument('input', help='输入GIF文件路径或包含GIF文件的目录') 103 | parser.add_argument('-o', '--output', help='输出GIF文件路径或目录,默认为在原文件名后添加"_loop"') 104 | parser.add_argument('--no-backup', action='store_false', dest='backup', 105 | help='不备份原始文件') 106 | 107 | args = parser.parse_args() 108 | convert_gif_to_loop(args.input, args.output, args.backup) -------------------------------------------------------------------------------- /MADDPG_Continous/plot/demo-rewards_plot_ma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MADDPG_Continous/plot/demo-rewards_plot_ma.png -------------------------------------------------------------------------------- /MADDPG_Continous/plot/plot_rewards.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import os 4 | from datetime import datetime 5 | import numpy as np 6 | import platform 7 | ''' 8 | 注意: 9 | 作者用pands==2.2.3出错了。 10 | pip install pandas==2.2.1 没问题。 11 | ''' 12 | 13 | def moving_average(data, window_size=50): 14 | """简单移动平均""" 15 | weights = np.ones(window_size) / window_size 16 | return np.convolve(data, weights, mode='valid') 17 | 18 | def exponential_moving_average(data, alpha=0.1): 19 | """指数移动平均""" 20 | ema = np.zeros_like(data) 21 | ema[0] = data[0] 22 | for i in range(1, len(data)): 23 | ema[i] = alpha * data[i] + (1 - alpha) * ema[i-1] 24 | return ema 25 | 26 | # def plot_rewards(csv_file, window_size=50, alpha=0.1): 27 | # # 读取CSV文件,不指定数据类型 28 | # df = pd.read_csv(csv_file) 29 | # # 设置中文字体(如果需要显示中文) 30 | # plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] # MacOS 31 | # plt.rcParams['axes.unicode_minus'] = False # 正常显示负号 32 | 33 | 34 | # # 计算平滑后的数据 35 | # adv_ma = moving_average(df['Adversary Average Reward'].values) 36 | # adv_ema = 
exponential_moving_average(df['Adversary Average Reward'].values) 37 | # sum_ma = moving_average(df['Sum Reward of All Agents'].values) 38 | # sum_ema = exponential_moving_average(df['Sum Reward of All Agents'].values) 39 | 40 | # # 创建图形 41 | # plt.figure(figsize=(15, 10)) 42 | 43 | # # 绘制追捕者平均奖励 44 | # plt.subplot(2, 1, 1) 45 | # plt.plot(df['Episode'], df['Adversary Average Reward'], 'lightgray', alpha=0.3, label='原始数据') 46 | # plt.plot(df['Episode'][window_size-1:], adv_ma, 'b-', linewidth=2, label='移动平均') 47 | # plt.plot(df['Episode'], adv_ema, 'r-', linewidth=2, label='指数移动平均') 48 | # plt.title('追捕者平均奖励随回合数变化') 49 | # plt.xlabel('回合数') 50 | # plt.ylabel('平均奖励') 51 | # plt.grid(True, linestyle='--', alpha=0.7) 52 | # plt.legend() 53 | 54 | # # 绘制所有智能体总奖励 55 | # plt.subplot(2, 1, 2) 56 | # plt.plot(df['Episode'], df['Sum Reward of All Agents'], 'lightgray', alpha=0.3, label='原始数据') 57 | # plt.plot(df['Episode'][window_size-1:], sum_ma, 'b-', linewidth=2, label='移动平均') 58 | # plt.plot(df['Episode'], sum_ema, 'r-', linewidth=2, label='指数移动平均') 59 | # plt.title('所有智能体总奖励随回合数变化') 60 | # plt.xlabel('回合数') 61 | # plt.ylabel('总奖励') 62 | # plt.grid(True, linestyle='--', alpha=0.7) 63 | # plt.legend() 64 | 65 | # # 调整子图之间的间距 66 | # plt.tight_layout() 67 | 68 | # # 保存图片 69 | # save_path = os.path.join(os.path.dirname(csv_file), f'rewards_plot.png') 70 | # plt.savefig(save_path, dpi=300, bbox_inches='tight') 71 | # print(f"图片已保存至 {save_path}") 72 | 73 | # # 显示图形 74 | # plt.show() 75 | 76 | def set_font_for_plot(): 77 | """根据平台动态设置字体""" 78 | system_platform = platform.system() 79 | print("system_platform:", system_platform) 80 | if system_platform == "Darwin": # MacOS 81 | font = 'Arial Unicode MS' 82 | elif system_platform == "Windows": # Windows 83 | font = 'SimHei' 84 | else: # Linux 85 | font = 'DejaVu Sans' 86 | 87 | # 设置matplotlib的字体 88 | plt.rcParams['font.sans-serif'] = [font] 89 | plt.rcParams['axes.unicode_minus'] = False # 正常显示负号 90 | 91 | def different_plot_rewards(csv_file, window_size=50, alpha=0.1): 92 | df = pd.read_csv(csv_file) 93 | 94 | # plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 95 | # plt.rcParams['axes.unicode_minus'] = False 96 | set_font_for_plot() # 设置字体 97 | 98 | # 计算平滑后的数据 99 | adv_ma = moving_average(df['Adversary Average Reward'].values, window_size) 100 | adv_ema = exponential_moving_average(df['Adversary Average Reward'].values, alpha) 101 | sum_ma = moving_average(df['Sum Reward of All Agents'].values, window_size) 102 | sum_ema = exponential_moving_average(df['Sum Reward of All Agents'].values, alpha) 103 | 104 | # 创建两个图形 105 | # 1. 
移动平均对比图 106 | plt.figure(figsize=(15, 10)) 107 | # 追捕者奖励 108 | plt.subplot(2, 1, 1) 109 | plt.plot(df['Episode'], df['Adversary Average Reward'], 'lightgray', alpha=0.3, label='原始数据') 110 | plt.plot(df['Episode'][window_size-1:], adv_ma, 'b-', linewidth=2, label='移动平均') 111 | plt.title('追捕者平均奖励 - 移动平均对比') 112 | plt.xlabel('回合数') 113 | plt.ylabel('平均奖励') 114 | plt.grid(True, linestyle='--', alpha=0.7) 115 | plt.legend() 116 | 117 | # 总奖励 118 | plt.subplot(2, 1, 2) 119 | plt.plot(df['Episode'], df['Sum Reward of All Agents'], 'lightgray', alpha=0.3, label='原始数据') 120 | plt.plot(df['Episode'][window_size-1:], sum_ma, 'b-', linewidth=2, label='移动平均') 121 | plt.title('所有智能体总奖励 - 移动平均对比') 122 | plt.xlabel('回合数') 123 | plt.ylabel('总奖励') 124 | plt.grid(True, linestyle='--', alpha=0.7) 125 | plt.legend() 126 | plt.tight_layout() 127 | 128 | # 保存移动平均对比图 129 | save_path_ma = os.path.join(os.path.dirname(csv_file), f'rewards_plot_ma.png') 130 | plt.savefig(save_path_ma, dpi=300, bbox_inches='tight') 131 | 132 | # 2. 指数移动平均对比图 133 | plt.figure(figsize=(15, 10)) 134 | # 追捕者奖励 135 | plt.subplot(2, 1, 1) 136 | plt.plot(df['Episode'], df['Adversary Average Reward'], 'lightgray', alpha=0.3, label='原始数据') 137 | plt.plot(df['Episode'], adv_ema, 'r-', linewidth=2, label='指数移动平均') 138 | plt.title('追捕者平均奖励 - 指数移动平均对比') 139 | plt.xlabel('回合数') 140 | plt.ylabel('平均奖励') 141 | plt.grid(True, linestyle='--', alpha=0.7) 142 | plt.legend() 143 | 144 | # 总奖励 145 | plt.subplot(2, 1, 2) 146 | plt.plot(df['Episode'], df['Sum Reward of All Agents'], 'lightgray', alpha=0.3, label='原始数据') 147 | plt.plot(df['Episode'], sum_ema, 'r-', linewidth=2, label='指数移动平均') 148 | plt.title('所有智能体总奖励 - 指数移动平均对比') 149 | plt.xlabel('回合数') 150 | plt.ylabel('总奖励') 151 | plt.grid(True, linestyle='--', alpha=0.7) 152 | plt.legend() 153 | plt.tight_layout() 154 | 155 | # 保存指数移动平均对比图 156 | save_path_ema = os.path.join(os.path.dirname(csv_file), f'rewards_plot_ema.png') 157 | plt.savefig(save_path_ema, dpi=300, bbox_inches='tight') 158 | 159 | print(f"移动平均对比图已保存至 {save_path_ma}") 160 | print(f"指数移动平均对比图已保存至 {save_path_ema}") 161 | 162 | plt.show() 163 | 164 | if __name__ == "__main__": 165 | # CSV文件路径(相对于当前脚本的路径) 166 | csv_file = os.path.join(os.path.dirname(__file__), 'data', 'data_rewards_2025-02-25_04-39.csv') 167 | print("csv_file name:",csv_file) 168 | 169 | if os.path.exists(csv_file): 170 | df = pd.read_csv(csv_file) 171 | # print(df.head()) 172 | # plot_rewards(csv_file) 173 | different_plot_rewards(csv_file) 174 | else: 175 | print(f"错误:未找到CSV文件:{csv_file}") -------------------------------------------------------------------------------- /MADDPG_Continous/plot/simple_tag_v3_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MADDPG_Continous/plot/simple_tag_v3_demo.gif -------------------------------------------------------------------------------- /MADDPG_Continous/plot/simple_tag_v3_demo_loop.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MADDPG_Continous/plot/simple_tag_v3_demo_loop.gif -------------------------------------------------------------------------------- /MADDPG_Continous/utils/conda-environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - 
https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmpy2=2.1.2 16 | - intel-openmp=2023.1.0 17 | - jinja2=3.1.4 18 | - jpeg=9e 19 | - lcms2=2.16 20 | - lerc=4.0.0 21 | - libdeflate=1.22 22 | - libffi=3.4.4 23 | - libjpeg-turbo=2.0.0 24 | - libpng=1.6.39 25 | - libtiff=4.5.1 26 | - libwebp-base=1.3.2 27 | - llvm-openmp=14.0.6 28 | - lz4-c=1.9.4 29 | - markupsafe=2.1.3 30 | - mkl=2023.1.0 31 | - mkl-service=2.4.0 32 | - mkl_fft=1.3.8 33 | - mkl_random=1.2.4 34 | - mpc=1.1.0 35 | - mpfr=4.0.2 36 | - mpmath=1.3.0 37 | - numpy=1.26.4 38 | - numpy-base=1.26.4 39 | - openjpeg=2.5.2 40 | - openssl=3.0.16 41 | - pip=24.2 42 | - pybind11-abi=4 43 | - pysocks=1.7.1 44 | - python=3.11.8 45 | - pyyaml=6.0.2 46 | - requests=2.32.3 47 | - setuptools=75.1.0 48 | - sqlite=3.45.3 49 | - sympy=1.13.3 50 | - tbb=2021.8.0 51 | - tk=8.6.14 52 | - typing_extensions=4.12.2 53 | - wheel=0.44.0 54 | - xz=5.4.6 55 | - yaml=0.2.5 56 | - zlib=1.2.13 57 | - zstd=1.5.6 58 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 59 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/linux_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | - mkl=2023.1.0 35 | - mkl-service=2.4.0 36 | - mkl_fft=1.3.8 37 | - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | - pip: 67 | - charset-normalizer==3.4.1 68 | - cloudpickle==3.1.0 69 | - contourpy==1.3.1 70 | - cycler==0.12.1 71 | - farama-notifications==0.0.4 72 | - fonttools==4.55.3 73 | - gymnasium==1.0.0 74 | - idna==3.10 75 | - jsonpatch==1.33 76 | - jsonpointer==3.0.0 77 | - kiwisolver==1.4.8 78 | - matplotlib==3.8.3 79 | - networkx==3.4.2 80 | - numpy==2.2.1 81 | - packaging==24.2 82 | - pandas==2.2.1 83 | - pettingzoo==1.24.4 84 | - pillow==11.1.0 85 | - pip-chill==1.0.3 86 | - pygame==2.6.1 87 | - pyparsing==3.2.1 88 | - python-dateutil==2.9.0.post0 89 | - pytz==2025.1 90 | - scipy==1.15.0 91 | - six==1.17.0 92 | - tornado==6.4.2 93 | - tzdata==2025.1 94 | - urllib3==2.3.0 95 | - 
visdom==0.2.4 96 | - websocket-client==1.8.0 97 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 98 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from datetime import datetime 5 | 6 | 7 | 8 | """ 9 | 1. 在../logs/下保存了一个 training_log.json 文件,它包含了训练的所有参数和日志信息。 10 | 2. 保存的 plot_data_{current_time.replace(':', '-')}.pkl 是一个 PyTorch 保存的文件,它并不包含模型本身,而是 训练过程中的奖励数据。 11 | 12 | """ 13 | class TrainingLogger: 14 | def __init__(self, log_dir="../logs"): 15 | # 使用绝对路径 16 | current_dir = os.path.dirname(os.path.abspath(__file__)) 17 | self.log_dir = os.path.join(current_dir,'..', 'logs') 18 | 19 | # 确保目录存在 20 | if not os.path.exists(self.log_dir): 21 | os.makedirs(self.log_dir) 22 | 23 | def save_training_log(self, args, device, training_duration, runner): 24 | current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 25 | 26 | # 准备训练日志信息 27 | log_info = { 28 | "训练时间": current_time, 29 | "训练设备": str(device), 30 | "训练用时": training_duration, 31 | "环境名称": args.env_name, 32 | "渲染模式": args.render_mode, 33 | "总回合数": args.episode_num, 34 | "每回合步数": args.episode_length, 35 | "学习间隔": args.learn_interval, 36 | "随机步数": args.random_steps, 37 | "tau": args.tau, 38 | "gamma": args.gamma, 39 | "buffer容量": args.buffer_capacity, 40 | "batch_size": args.batch_size, 41 | "actor学习率": args.actor_lr, 42 | "critic学习率": args.critic_lr, 43 | "是否使用visdom": args.visdom, 44 | "visdom窗口大小": args.size_win 45 | } 46 | 47 | # 保存训练日志 48 | log_file = os.path.join(self.log_dir, "training_log.json") 49 | 50 | # 打印当前目录和目标目录 51 | print(f"Current directory: {os.getcwd()}") 52 | print(f"Saving training log to: {log_file}") 53 | 54 | # 确保目录存在并且具有写权限 55 | if os.path.exists(self.log_dir): 56 | print(f"Log directory exists: {self.log_dir}") 57 | else: 58 | print(f"Log directory does not exist. 
Trying to create it...") 59 | os.makedirs(self.log_dir, exist_ok=True) 60 | 61 | # 读取现有的日志文件,如果存在的话 62 | existing_logs = [] 63 | if os.path.exists(log_file): 64 | with open(log_file, 'r', encoding='utf-8') as f: 65 | existing_logs = json.load(f) 66 | 67 | existing_logs.append(log_info) 68 | 69 | # 保存更新后的日志文件 70 | with open(log_file, 'w', encoding='utf-8') as f: 71 | json.dump(existing_logs, f, ensure_ascii=False, indent=4) 72 | 73 | # 保存训练曲线数据 74 | plot_data = { 75 | "all_sum_rewards": runner.all_sum_rewards, # 所有智能体的总奖励 76 | "all_adversary_avg_rewards": runner.all_adversary_avg_rewards, # 追捕者的平均奖励 77 | "episode_rewards": runner.episode_rewards, # 每个智能体的奖励历史 78 | "running_rewards": runner.get_running_reward(runner.reward_sum_record), # 平滑后的奖励 79 | "timestamps": current_time 80 | } 81 | 82 | plot_file = os.path.join(self.log_dir, f"plot_data_{current_time.replace(':', '-')}.pkl") 83 | torch.save(plot_data, plot_file) 84 | 85 | return current_time 86 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/mac_arm_M4_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | # - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | # - mkl=2023.1.0 35 | # - mkl-service=2.4.0 36 | # - mkl_fft=1.3.8 37 | # - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | 67 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/pip-requirements.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.0.0 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | mkl-fft==1.3.8 9 | mkl-random==1.2.4 10 | pandas==2.2.1 11 | pip-chill==1.0.3 12 | platformdirs==4.2.2 13 | pygame==2.6.1 14 | pysocks==1.7.1 15 | pyyaml==6.0.2 16 | tomli==2.0.1 17 | visdom==0.2.4 18 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/pip-requirements_mac_arm_M4.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.1.1 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | 
jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | pandas==2.2.1 9 | pip-chill==1.0.3 10 | platformdirs==4.2.2 11 | pygame==2.6.1 12 | pysocks==1.7.1 13 | pyyaml==6.0.2 14 | tomli==2.0.1 15 | visdom==0.2.4 16 | -------------------------------------------------------------------------------- /MADDPG_Continous/utils/setupPettingzoo.py: -------------------------------------------------------------------------------- 1 | # 使用 sys.executable 获取当前虚拟环境的 pip,这样它会始终使用当前虚拟环境的 pip 安装包,而不是系统环境的 pip 2 | import pkg_resources 3 | import sys 4 | import platform 5 | import os 6 | from subprocess import call 7 | 8 | def check_and_install_pettingzoo(): 9 | # 打印当前虚拟环境的相关信息 10 | print("================================") 11 | print(f"Current Python executable: {sys.executable}") 12 | print(f"Python version: {sys.version}") 13 | print(f"Current virtual environment: {sys.prefix}") 14 | print(f"Platform: {platform.system()} {platform.release()}") 15 | print("================================") 16 | 17 | try: 18 | # 检查 pettingzoo 是否已经安装 19 | pkg_resources.get_distribution("pettingzoo") 20 | print("================================") 21 | print("pettingzoo is already installed.") 22 | print("================================") 23 | except pkg_resources.DistributionNotFound: 24 | # 如果 pettingzoo 没有安装,执行安装操作 25 | print("================================") 26 | print("pettingzoo is not installed. Installing pettingzoo...") 27 | print("================================") 28 | 29 | # 获取当前虚拟环境的 Python 解释器路径 30 | python_executable = sys.executable 31 | 32 | # 根据操作系统确定 pip 路径 33 | if platform.system() == "Windows": 34 | # Windows 系统下,pip 通常在 Scripts 目录下 35 | pip_executable = os.path.join(os.path.dirname(python_executable), "Scripts", "pip.exe") 36 | else: 37 | # macOS/Linux 系统下 38 | pip_dir = os.path.dirname(python_executable) 39 | pip_executable = os.path.join(pip_dir, "pip") 40 | if not os.path.exists(pip_executable): 41 | pip_executable = python_executable.replace("python", "pip") 42 | 43 | print(f"Using pip executable: {pip_executable}") 44 | 45 | # 尝试安装 pettingzoo==1.24.4 46 | try: 47 | print("Attempting to install pettingzoo==1.24.4...") 48 | result = call([pip_executable, "install", "pettingzoo==1.24.4"]) 49 | if result == 0: 50 | print("================================") 51 | print("Successfully installed pettingzoo==1.24.4") 52 | print("================================") 53 | else: 54 | print("Installation of pettingzoo==1.24.4 failed. Trying GitHub installation...") 55 | # 如果安装失败,尝试从 GitHub 安装 56 | try: 57 | # 根据操作系统调整命令格式 58 | if platform.system() == "Windows": 59 | # Windows 下不使用引号 60 | result = call([pip_executable, "install", "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git"]) 61 | else: 62 | # macOS/Linux 下使用引号 63 | result = call([pip_executable, "install", "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git"]) 64 | 65 | if result == 0: 66 | print("================================") 67 | print("Successfully installed pettingzoo from GitHub.") 68 | print("================================") 69 | else: 70 | print("GitHub installation failed. 
Please check the error above.") 71 | except Exception as e: 72 | print(f"Failed to install pettingzoo from GitHub: {e}") 73 | print("================================") 74 | print("Please manually install pettingzoo or check the error above.") 75 | except Exception as e: 76 | print(f"Failed to install pettingzoo==1.24.4: {e}") 77 | print("Attempting to install pettingzoo from GitHub...") 78 | 79 | if __name__ == "__main__": 80 | check_and_install_pettingzoo() -------------------------------------------------------------------------------- /MATD3_Continous/agents/NN_actor_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | 7 | """ 8 | 和MADDPG中的actor网络相同. 9 | """ 10 | 11 | class MLPNetworkActor_td3(nn.Module): 12 | def __init__(self,chkpt_name, chkpt_dir, in_dim, out_dim, action_bound, hidden_dim = 128, non_linear = nn.ReLU()): 13 | super(MLPNetworkActor_td3, self).__init__() 14 | self.chkpt_dir = chkpt_dir 15 | self.chkpt_name = chkpt_name 16 | 17 | # different ,为什么要保持这两个信息? 18 | self.out_dim = out_dim 19 | self.action_bound = action_bound 20 | 21 | self.net = nn.Sequential( 22 | nn.Linear(in_dim, hidden_dim), 23 | non_linear, 24 | nn.Linear(hidden_dim, hidden_dim), 25 | non_linear, 26 | nn.Linear(hidden_dim, out_dim), 27 | ).apply(self.init) 28 | 29 | @staticmethod 30 | def init(m): 31 | '''init patameters of the module''' 32 | gain = nn.init.calculate_gain('relu') 33 | if isinstance(m, nn.Linear): 34 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 35 | m.bias.data.fill_(0.01) 36 | 37 | def forward(self, x): 38 | x = self.net(x) 39 | # logi = x 40 | # a_min = self.action_bound[0] 41 | # a_max = self.action_bound[1] 42 | # ''' 这三行为什么要这么处理? 
引入了bias项干嘛''' 43 | # k = torch.tensor( (a_max - a_min) /2 , device=x.device ) 44 | # bias = torch.tensor( (a_max + a_min) /2, device=x.device ) 45 | # action = k * torch.tanh(x) + bias 46 | # return action, logi 47 | x = torch.tanh(x) 48 | return x 49 | 50 | def save_checkpoint(self, is_target=False, timestamp = False): 51 | # 使用时间戳保存功能 52 | if timestamp is True: 53 | # 使用时间戳创建新文件夹 54 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 55 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 56 | else: 57 | # 直接保存在主目录下,不使用时间戳 58 | save_dir = self.chkpt_dir 59 | # 创建保存路径 60 | self.chkpt_file = os.path.join(save_dir, self.chkpt_name) 61 | 62 | if is_target: 63 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 64 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 65 | torch.save(self.state_dict(), target_chkpt_name) 66 | else: 67 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 68 | torch.save(self.state_dict(), self.chkpt_file) 69 | 70 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): # 默认加载target 71 | if timestamp and isinstance(timestamp, str): 72 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 73 | load_dir = os.path.join(self.chkpt_dir, timestamp) 74 | else: 75 | # 否则从主目录加载 76 | load_dir = self.chkpt_dir 77 | 78 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 79 | 80 | if is_target: 81 | target_chkpt_name = self.chkpt_file.replace('actor', 'target_actor') 82 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 83 | else: 84 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) -------------------------------------------------------------------------------- /MATD3_Continous/agents/NN_critic_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.functional as F 4 | import os 5 | from datetime import datetime 6 | 7 | 8 | ''' 创新点1:双截断Q网络''' 9 | class MLPNetworkCritic_td3(nn.Module): 10 | def __init__(self, chkpt_name, chkpt_dir, in_dim, out_dim = 1, hidden_dim = 128, non_linear = nn.ReLU()): 11 | super(MLPNetworkCritic_td3, self).__init__() 12 | self.chkpt_dir = chkpt_dir 13 | self.chkpt_name = chkpt_name 14 | # Q1网络 15 | self.net1 = nn.Sequential( 16 | nn.Linear(in_dim, hidden_dim), 17 | non_linear, 18 | nn.Linear(hidden_dim, hidden_dim), 19 | non_linear, 20 | nn.Linear(hidden_dim, out_dim), 21 | ).apply(self.init) 22 | # Q2网络 23 | self.net2 = nn.Sequential( 24 | nn.Linear(in_dim, hidden_dim), 25 | non_linear, 26 | nn.Linear(hidden_dim, hidden_dim), 27 | non_linear, 28 | nn.Linear(hidden_dim, out_dim), 29 | ).apply(self.init) 30 | @staticmethod 31 | def init(m): 32 | '''init patameters of the module''' 33 | gain = nn.init.calculate_gain('relu') 34 | if isinstance(m, nn.Linear): 35 | nn.init.xavier_uniform_(m.weight, gain = gain) #使用了 Xavier 均匀分布初始化(也叫 Glorot 初始化) 36 | m.bias.data.fill_(0.01) 37 | 38 | def forward(self, x): 39 | # 返回两个Q值 40 | q1 = self.net1(x) 41 | q2 = self.net2(x) 42 | return q1, q2 43 | 44 | def Q1(self, x): 45 | # 只使用Q1网络进行评估 46 | return self.net1(x) 47 | 48 | 49 | def save_checkpoint(self, is_target = False, timestamp = False): 50 | if timestamp is True: 51 | # 使用时间戳创建新文件夹 52 | current_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M') 53 | save_dir = os.path.join(self.chkpt_dir, current_timestamp) 54 | else: 55 | # 直接保存在主目录下 56 | save_dir = self.chkpt_dir 57 | 58 | self.chkpt_file = os.path.join(save_dir, 
self.chkpt_name) 59 | 60 | if is_target: 61 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 62 | os.makedirs(os.path.dirname(target_chkpt_name), exist_ok=True) 63 | torch.save(self.state_dict(), target_chkpt_name) 64 | else: 65 | os.makedirs(os.path.dirname(self.chkpt_file), exist_ok=True) 66 | torch.save(self.state_dict(), self.chkpt_file) 67 | 68 | def load_checkpoint(self, device = 'cpu', is_target = False, timestamp = None): 69 | if timestamp and isinstance(timestamp, str): 70 | # 如果提供了有效的时间戳字符串,从对应文件夹加载 71 | load_dir = os.path.join(self.chkpt_dir, timestamp) 72 | else: 73 | # 否则从主目录加载 74 | load_dir = self.chkpt_dir 75 | 76 | self.chkpt_file = os.path.join(load_dir, self.chkpt_name) 77 | 78 | if is_target: 79 | target_chkpt_name = self.chkpt_file.replace('critic', 'target_critic') 80 | self.load_state_dict(torch.load(target_chkpt_name, map_location=torch.device(device))) 81 | else: 82 | self.load_state_dict(torch.load(self.chkpt_file, map_location=torch.device(device))) 83 | -------------------------------------------------------------------------------- /MATD3_Continous/agents/TD3_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | from torch.optim import Adam 9 | from .NN_actor_td3 import MLPNetworkActor_td3 10 | from .NN_critic_td3 import MLPNetworkCritic_td3 11 | 12 | 13 | class TD3: 14 | def __init__(self, obs_dim, act_dim, global_obs_dim, actor_lr, critic_lr, device, action_bound, chkpt_dir, chkpt_name): 15 | self.actor = MLPNetworkActor_td3(in_dim=obs_dim, out_dim=act_dim, hidden_dim = 64, action_bound=action_bound, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'actor_td3.pth')).to(device) 16 | self.critic = MLPNetworkCritic_td3(in_dim=global_obs_dim, out_dim=1, hidden_dim = 64, chkpt_dir = chkpt_dir, chkpt_name = (chkpt_name + 'critic_td3.pth')).to(device) 17 | #优化器 18 | self.actor_optimizer = Adam(self.actor.parameters(), lr = actor_lr) 19 | self.critic_optimizer = Adam(self.critic.parameters(), lr = critic_lr) 20 | # 创建相对于的target网络 21 | self.actor_target = deepcopy(self.actor) 22 | self.critic_target = deepcopy(self.critic) 23 | """ 24 | 使用 deepcopy 创建 target 网络是一个更好的选择,原因如下: 25 | 初始化一致性: 26 | - deepcopy 确保 target 网络和原网络完全相同的初始参数 27 | - 重新创建网络可能因为随机初始化导致参数不一致 28 | """ 29 | 30 | def actor_action(self, obs): 31 | # 如果是list,先合并为单个tensor 32 | # if isinstance(obs, list): 33 | # obs = torch.cat(obs, dim=1) 34 | action = self.actor(obs) 35 | return action 36 | 37 | def actor_target_action(self, obs): 38 | # 如果是list,先合并为单个tensor 39 | if isinstance(obs, list): 40 | obs = torch.cat(obs, dim=1) 41 | action = self.actor_target(obs) 42 | return action 43 | 44 | def critic_qvalue(self, obs, action): 45 | """获取 critic网络 的Q值""" 46 | # 合并观测和动作 47 | if isinstance(obs, list) and isinstance(action, list): 48 | sa = torch.cat(list(obs) + list(action), dim=1) 49 | else: 50 | sa = torch.cat([obs, action], dim=1) 51 | q1, q2 = self.critic(sa)# 返回两个Q值 52 | return q1.squeeze(1), q2.squeeze(1) 53 | 54 | def critic_target_q(self, obs, action): 55 | """获取 critic目标网络 的Q值""" 56 | # 合并观测和动作 57 | if isinstance(obs, list) and isinstance(action, list): 58 | sa = torch.cat(list(obs) + list(action), dim=1) 59 | else: 60 | sa = torch.cat([obs, action], dim=1) 61 | q1, q2 = self.critic_target(sa)# 返回两个Q值 62 | return q1.squeeze(1), q2.squeeze(1) 63 | 64 | def critic_q1(self, obs, 
action): 65 | """只获取 critic网络的 第一个Q值 ,用于策略更新""" 66 | # 合并观测和动作 67 | if isinstance(obs, list) and isinstance(action, list): 68 | sa = torch.cat(list(obs) + list(action), dim=1) 69 | else: 70 | sa = torch.cat([obs, action], dim=1) 71 | return self.critic.Q1(sa).squeeze(1) # 只返回Q1 72 | 73 | 74 | def update_actor(self, loss): 75 | self.actor_optimizer.zero_grad() 76 | loss.backward() 77 | ''' 78 | 在较新版本的PyTorch中, clip_grad_norm 已被弃用,推荐使用 clip_grad_norm_ 79 | clip_grad_norm_ 是 clip_grad_norm 的原地版本,不会创建新的张量,而是直接在输入张量上进行修改. 80 | ''' 81 | nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) # 与clip_grad_norm的不同? 82 | self.actor_optimizer.step() 83 | 84 | def update_critic(self, loss): 85 | self.critic_optimizer.zero_grad() 86 | loss.backward() 87 | nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 88 | self.critic_optimizer.step() 89 | -------------------------------------------------------------------------------- /MATD3_Continous/agents/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class BUFFER(): 5 | 6 | def __init__(self,capacity, obs_dim, act_dim, device): 7 | # 使用连续内存布局 8 | self.capacity = capacity 9 | self.obs = np.zeros((capacity, obs_dim), dtype=np.float32) # 指定dtype 10 | self.action = np.zeros((capacity, act_dim), dtype=np.float32) 11 | self.reward = np.zeros(capacity, dtype=np.float32) 12 | self.next_obs = np.zeros((capacity, obs_dim), dtype=np.float32) 13 | self.done = np.zeros(capacity, dtype=np.float32) # 使用bool_ 14 | self._index = 0 15 | self._size = 0 16 | self.device = device 17 | 18 | def add(self,obs, action, reward, next_obs, done): 19 | # 确保输入数据类型一致 20 | self.obs[self._index] = np.asarray(obs, dtype=np.float32) 21 | self.action[self._index] = np.asarray(action, dtype=np.float32) 22 | self.reward[self._index] = np.float32(reward) 23 | self.next_obs[self._index] = np.asarray(next_obs, dtype=np.float32) 24 | self.done[self._index] = np.float32(done) 25 | 26 | self._index = (self._index +1) % self.capacity 27 | if self._size < self.capacity: 28 | self._size += 1 29 | 30 | 31 | def sample(self, indices): 32 | # 一次性批量处理 33 | batch = ( 34 | self.obs[indices], 35 | self.action[indices], 36 | self.reward[indices], 37 | self.next_obs[indices], 38 | self.done[indices] 39 | ) 40 | # 批量转换为tensor并移动到设备 41 | return tuple( 42 | torch.as_tensor(data, device=self.device) 43 | for data in batch 44 | ) 45 | 46 | # obs = torch.from_numpy(obs).float().to(self.device) # torch.Size([batch_size, state_dim]) 47 | # action = torch.from_numpy(action).float().to(self.device) # torch.Size([batch_size, action_dim]) 48 | # reward = torch.from_numpy(reward).float().to(self.device) # just a tensor with length: batch_size 49 | # # reward = (reward - reward.mean()) / (reward.std() + 1e-7) # 暂不使用 50 | # next_obs = torch.from_numpy(next_obs).float().to(self.device) # Size([batch_size, state_dim]) 51 | # done = torch.from_numpy(done).float().to(self.device) # just a tensor with length: batch_size 52 | 53 | # return obs, action, reward, next_obs, done 54 | 55 | def __len__(self): #保留方法 56 | return self._size 57 | -------------------------------------------------------------------------------- /MATD3_Continous/envs/custom_agents_dynamics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 该文件定义了自定义的环境,用于测试自定义的智能体动力学模型 3 | 4 | 继承自core.py 5 | 6 | """ 7 | import numpy as np 8 | from pettingzoo.mpe._mpe_utils.core import EntityState, AgentState, Action, Entity, Landmark, Agent 
9 | from pettingzoo.mpe._mpe_utils.core import World 10 | 11 | class CustomWorld(World): 12 | def __init__(self, world_size = 2.5 ): # 13 | super().__init__() # 调用父类的构造函数 14 | self.world_size = world_size # Ronchy 添加世界大小 15 | self.dt = 0.1 # 时间步长 16 | self.damping = 0.2 # 阻尼系数 17 | # contact response parameters 18 | self.contact_force = 1e2 # 控制碰撞强度(默认1e2,值越大反弹越强) 19 | self.contact_margin = 1e-3 # 控制碰撞"柔软度"(默认1e-3,值越小越接近刚体) 20 | """ 21 | 常见问题示例 22 | 实体重叠穿透 contact_force太小 增大contact_force至1e3或更高 23 | 碰撞后震荡 damping太低 增大阻尼系数(如0.5) 24 | 微小距离抖动 contact_margin不合理 调整到1e-2~1e-4之间 25 | """ 26 | """ 27 | 重载底层动力学逻辑 28 | 主要是integrate_state()函数 29 | """ 30 | def step(self): 31 | # set actions for scripted agents 32 | # print("Using world -> step()") # 重载成功! 33 | for agent in self.scripted_agents: 34 | agent.action = agent.action_callback(agent, self) 35 | # gather forces applied to entities 36 | p_force = [None] * len(self.entities) 37 | # apply agent physical controls 38 | p_force = self.apply_action_force(p_force) # 加入噪声 39 | # apply environment forces 40 | p_force = self.apply_environment_force(p_force) # 碰撞力计算 collide为True时 41 | # integrate physical state 42 | self.integrate_state(p_force) # 动力学逻辑 43 | # update agent state 44 | for agent in self.agents: 45 | self.update_agent_state(agent) # 更新 communication action 后的状态 46 | 47 | # integrate physical state 48 | #函数功能:动力学逻辑。更新实体的位置和速度 49 | def integrate_state(self, p_force): 50 | for i, entity in enumerate(self.entities): 51 | if not entity.movable: 52 | continue 53 | # 速度阻尼衰减 54 | entity.state.p_vel *= (1 - self.damping) # 正确应用阻尼 55 | # 动力学 -> 运动学 56 | if p_force[i] is not None: 57 | acceleration = p_force[i] / entity.mass # F = ma 58 | entity.state.p_vel += acceleration * self.dt # v = v_0 + a * t 59 | 60 | # 速度限幅 61 | if entity.max_speed is not None: 62 | speed = np.linalg.norm(entity.state.p_vel) # 计算向量模长 63 | if speed > entity.max_speed: 64 | entity.state.p_vel = entity.state.p_vel * (entity.max_speed / speed) # 向量缩放 65 | 66 | # 更新位置 67 | entity.state.p_pos += entity.state.p_vel * self.dt # 更新位置 68 | # 限制位置在世界大小范围内 69 | # entity.state.p_pos = np.clip(entity.state.p_pos, -self.world_size, self.world_size) # Ronchy 添加世界大小限制 70 | 71 | 72 | # get collision forces for any contact between two entities 73 | # TODO: 碰撞逻辑待细化 74 | def get_collision_force(self, entity_a, entity_b): 75 | if (not entity_a.collide) or (not entity_b.collide): 76 | return [None, None] # not a collider 77 | if entity_a is entity_b: 78 | return [None, None] # don't collide against itself 79 | # compute actual distance between entities 80 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 81 | dist = np.sqrt(np.sum(np.square(delta_pos))) #用norm更简洁 82 | # minimum allowable distance 83 | dist_min = entity_a.size + entity_b.size # 两个实体的半径之和 84 | # softmax penetration 85 | k = self.contact_margin 86 | penetration = np.logaddexp(0, -(dist - dist_min) / k) * k #渗透深度, 当 dist < dist_min 时产生虚拟渗透量 87 | force = self.contact_force * delta_pos / dist * penetration 88 | force_a = +force if entity_a.movable else None 89 | force_b = -force if entity_b.movable else None 90 | return [force_a, force_b] 91 | -------------------------------------------------------------------------------- /MATD3_Continous/main/main_evaluate.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 2 | from main_parameters import main_parameters 3 | 4 | # 修改导入路径 5 | import sys 6 | import os 7 | # 
将项目根目录添加到Python路径 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | 11 | from agents.MATD3_runner import RUNNER 12 | from agents.MATD3_agent import MATD3 13 | import torch 14 | import random 15 | import numpy as np 16 | from envs import simple_tag_env 17 | 18 | def setup_seed(seed): 19 | torch.manual_seed(seed) 20 | if torch.cuda.is_available(): 21 | torch.cuda.manual_seed(seed) 22 | torch.cuda.manual_seed_all(seed) 23 | np.random.seed(seed) 24 | random.seed(seed) 25 | torch.backends.cudnn.deterministic = True 26 | torch.backends.cudnn.benchmark = False 27 | 28 | 29 | def get_env(env_name, ep_len=50, render_mode = "None", seed = None): 30 | """create environment and get observation and action dimension of each agent in this environment""" 31 | new_env = None 32 | if env_name == 'simple_adversary_v3': 33 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 34 | if env_name == 'simple_spread_v3': 35 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 36 | if env_name == 'simple_tag_v3': 37 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 38 | if env_name == 'simple_tag_env': 39 | new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 40 | 41 | new_env.reset(seed) 42 | _dim_info = {} 43 | action_bound = {} 44 | for agent_id in new_env.agents: 45 | print("agent_id:",agent_id) 46 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 47 | action_bound[agent_id] = [] #[low action, hign action] 48 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 49 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 50 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 51 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 52 | 53 | return new_env, _dim_info, action_bound 54 | 55 | 56 | 57 | if __name__ == '__main__': 58 | device ='cpu' 59 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 60 | print("Using device:",device) 61 | # 模型存储路径 62 | current_dir = os.path.dirname(os.path.abspath(__file__)) 63 | chkpt_dir = os.path.join(current_dir, 'models', 'matd3_models') 64 | load_timestamp = "2025-04-15_22-23" 65 | model_timestamp = None if load_timestamp == '' else load_timestamp 66 | # 定义参数 67 | args = main_parameters() 68 | args.render_mode = "human" 69 | # args.episode_num = 1 70 | 71 | # 创建环境 72 | print("Using Env's name",args.env_name) 73 | # 判断是否使用固定种子 74 | if args.seed is None: 75 | print("使用随机种子 (不固定)") 76 | else: 77 | print(f"使用固定种子: {args.seed}") 78 | setup_seed(args.seed) 79 | 80 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode, seed = args.seed) 81 | # print(env, dim_info, action_bound) 82 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意 83 | agent = MATD3(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, args.tau, _chkpt_dir = chkpt_dir, _model_timestamp = model_timestamp) 84 | print("--- Loading models ---") 85 | agent.load_model() 86 | print('---- Evaluating ----') 87 | env.reset(args.seed) 88 | runner = RUNNER(agent, env, args, device, mode = 'evaluate') 89 | runner.evaluate() # 使用evaluate方法 90 | print('---- Done! 
----') 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /MATD3_Continous/main/main_parameters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def main_parameters(): 4 | parser = argparse.ArgumentParser("MADDPG legacy") 5 | ############################################ 选择环境 ############################################ 6 | parser.add_argument("--seed", type=int, default=-1, help='随机种子 (使用-1表示不使用固定种子)') 7 | parser.add_argument("--use_variable_seeds", type=bool, default=False, help="使用可变随机种子") 8 | 9 | parser.add_argument("--env_name", type=str, default="simple_tag_v3", help="name of the env", 10 | choices=['simple_adversary_v3', 'simple_spread_v3', 'simple_tag_v3', 'simple_tag_env']) 11 | parser.add_argument("--render_mode", type=str, default="None", help="None | human | rgb_array") 12 | parser.add_argument("--episode_num", type=int, default=3, help="训练轮数") 13 | parser.add_argument("--episode_length", type=int, default=100, help="每轮最大步数") 14 | parser.add_argument("--evaluate_episode_num", type=int, default=100, help="评估轮数") 15 | parser.add_argument('--learn_interval', type=int, default=10, 16 | help='学习间隔步数') 17 | 18 | parser.add_argument('--random_steps', type=int, default=500, help='初始随机探索步数') 19 | parser.add_argument('--tau', type=float, default=0.01, help='软更新参数') 20 | parser.add_argument('--gamma', type=float, default=0.9, help='折扣因子') 21 | parser.add_argument('--buffer_capacity', type=int, default=int(1e6), help='经验回放缓冲区容量') 22 | parser.add_argument('--batch_size', type=int, default=128, help='批次大小') 23 | parser.add_argument('--actor_lr', type=float, default=0.0001, help='Actor学习率') 24 | parser.add_argument('--critic_lr', type=float, default=0.003, help='Critic学习率') 25 | parser.add_argument('--comm_lr', type=float, default=0.00001, help='Comm学习率') 26 | # 通信网络参数 27 | parser.add_argument('--message_dim', type=int, default=3, help='通信消息维度') 28 | 29 | parser.add_argument('--best_score', type=int, default= -20, help='最佳分数_初始值') 30 | 31 | # 可视化参数 32 | parser.add_argument('--visdom', action="store_true", help="是否使用visdom可视化") 33 | parser.add_argument('--size_win', type=int, default=200, help="平滑窗口大小") 34 | 35 | # 训练设备 36 | parser.add_argument("--device", type=str, default='cpu', help="训练设备,默认自动选择cpu") 37 | 38 | args = parser.parse_args() 39 | 40 | # 如果seed为-1,则设置为None 41 | if args.seed == -1: 42 | args.seed = None 43 | 44 | return args -------------------------------------------------------------------------------- /MATD3_Continous/main/main_train.py: -------------------------------------------------------------------------------- 1 | MODULE_NAME = "log_td3_main" # 使用logger保存训练日志 2 | 3 | from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3 4 | 5 | # 添加项目根目录到Python路径 6 | import sys 7 | import os 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from envs import simple_tag_env, custom_agents_dynamics 11 | 12 | from main_parameters import main_parameters 13 | from agents.MATD3_runner import RUNNER 14 | 15 | from agents.MATD3_agent import MATD3 16 | import torch 17 | import random 18 | import numpy as np 19 | 20 | import time 21 | from datetime import datetime, timedelta 22 | from utils.logger import TrainingLogger # 添加导入 23 | 24 | 25 | def setup_seed(seed): 26 | torch.manual_seed(seed) 27 | if torch.cuda.is_available(): 28 | torch.cuda.manual_seed(seed) 29 | torch.cuda.manual_seed_all(seed) 30 | 
np.random.seed(seed) 31 | random.seed(seed) 32 | torch.backends.cudnn.deterministic = True 33 | torch.backends.cudnn.benchmark = False 34 | 35 | def get_env(env_name, ep_len=25, render_mode ="None", seed = None): 36 | """create environment and get observation and action dimension of each agent in this environment""" 37 | new_env = None 38 | if env_name == 'simple_adversary_v3': 39 | new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len, continuous_actions=True) 40 | if env_name == 'simple_spread_v3': 41 | new_env = simple_spread_v3.parallel_env(max_cycles=ep_len, render_mode="rgb_array") 42 | if env_name == 'simple_tag_v3': 43 | new_env = simple_tag_v3.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 44 | if env_name == 'simple_tag_env': 45 | new_env = simple_tag_env.parallel_env(render_mode = render_mode, num_good=1, num_adversaries=3, num_obstacles=0, max_cycles=ep_len, continuous_actions=True) 46 | 47 | # 使用reset时处理None种子 48 | if seed is not None: 49 | new_env.reset(seed=seed) # 指定种子值 50 | else: 51 | new_env.reset() # 不指定种子,使用随机种子 52 | 53 | _dim_info = {} 54 | action_bound = {} 55 | for agent_id in new_env.agents: 56 | print("agent_id:",agent_id) 57 | _dim_info[agent_id] = [] # [obs_dim, act_dim] 58 | action_bound[agent_id] = [] #[low action, hign action] 59 | _dim_info[agent_id].append(new_env.observation_space(agent_id).shape[0]) 60 | _dim_info[agent_id].append(new_env.action_space(agent_id).shape[0]) 61 | action_bound[agent_id].append(new_env.action_space(agent_id).low) 62 | action_bound[agent_id].append(new_env.action_space(agent_id).high) 63 | print("_dim_info:",_dim_info) 64 | print("action_bound:",action_bound) 65 | return new_env, _dim_info, action_bound 66 | 67 | 68 | if __name__ == '__main__': 69 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 70 | device = torch.device('mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() 71 | else 'cuda' if torch.cuda.is_available() else 'cpu') 72 | device = "cpu" 73 | print("Using device:",device) 74 | start_time = time.time() # 记录开始时间 75 | # 模型保存路径 76 | current_dir = os.path.dirname(os.path.abspath(__file__)) 77 | chkpt_dir = os.path.join(current_dir, 'models', 'matd3_models') 78 | # 定义参数 79 | args = main_parameters() 80 | # 创建环境 81 | print("Using Env's name",args.env_name) 82 | 83 | # 判断是否使用固定种子 84 | if args.seed is None: 85 | print("使用随机种子 (不固定)") 86 | else: 87 | print(f"使用固定种子: {args.seed}") 88 | setup_seed(args.seed) 89 | 90 | env, dim_info, action_bound = get_env(args.env_name, args.episode_length, args.render_mode, seed = args.seed) 91 | # print(env, dim_info, action_bound) 92 | # 创建MA-DDPG智能体 dim_info: 字典,键为智能体名字 内容为二维数组 分别表示观测维度和动作维度 是观测不是状态 需要注意。 93 | agent = MATD3(dim_info, args.buffer_capacity, args.batch_size, args.actor_lr, args.critic_lr, action_bound, args.tau, _chkpt_dir = chkpt_dir, _device = device) 94 | # 创建运行对象 95 | runner = RUNNER(agent, env, args, device, mode = 'train') 96 | 97 | # 记录训练开始时间 98 | start_time = datetime.now() 99 | start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S") 100 | print(f"训练开始时间: {start_time_str}") 101 | 102 | # 开始训练 103 | runner.train() 104 | 105 | # 记录训练结束时间和计算训练用时 106 | end_time = datetime.now() 107 | end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S") 108 | duration = end_time - start_time 109 | training_duration = str(timedelta(seconds=int(duration.total_seconds()))) 110 | 111 | print(f"\n===========训练完成!===========") 112 | print(f"训练开始时间: 
{start_time_str}") 113 | print(f"训练结束时间: {end_time_str}") 114 | print(f"训练用时: {training_duration}") 115 | print(f"训练设备: {device}") 116 | 117 | # 使用logger保存训练日志 118 | logger = TrainingLogger(module_name = MODULE_NAME) 119 | logger.save_training_log(args, device, start_time_str, end_time_str, training_duration, runner) 120 | 121 | print("--- saving trained models ---") 122 | agent.save_model(timestamp = True) 123 | print("--- trained models saved ---") 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /MATD3_Continous/plot/README.md: -------------------------------------------------------------------------------- 1 | # 使用说明 2 | 3 | 请将plot_rewards.py放置在对应的文件夹内,而非在根目录下使用! 4 | 5 | 如: 6 | 7 | cp plot_rewards.py ./maddpg_scripted_prey/plot_rewards.py -------------------------------------------------------------------------------- /MATD3_Continous/plot/plot_rewards.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import os 4 | from datetime import datetime 5 | import numpy as np 6 | import platform 7 | ''' 8 | 注意: 9 | 作者用pands==2.2.3出错了。 10 | pip install pandas==2.2.1 没问题。 11 | ''' 12 | 13 | def moving_average(data, window_size=50): 14 | """简单移动平均""" 15 | weights = np.ones(window_size) / window_size 16 | return np.convolve(data, weights, mode='valid') 17 | 18 | def exponential_moving_average(data, alpha=0.1): 19 | """指数移动平均""" 20 | ema = np.zeros_like(data) 21 | ema[0] = data[0] 22 | for i in range(1, len(data)): 23 | ema[i] = alpha * data[i] + (1 - alpha) * ema[i-1] 24 | return ema 25 | 26 | def set_font_for_plot(): 27 | """根据平台动态设置字体""" 28 | system_platform = platform.system() 29 | print("system_platform:", system_platform) 30 | if system_platform == "Darwin": # MacOS 31 | font = 'Arial Unicode MS' 32 | elif system_platform == "Windows": # Windows 33 | font = 'SimHei' 34 | else: # Linux 35 | # 中文字体需要手动安装 36 | # 参考:https://blog.csdn.net/takedachia/article/details/131017286 https://blog.csdn.net/weixin_45707277/article/details/118631442 37 | font = 'SimHei' 38 | 39 | plt.rcParams['font.sans-serif'] = [font] 40 | plt.rcParams['axes.unicode_minus'] = False 41 | 42 | def plot_all_rewards(csv_file, window_size=50): 43 | """在一张图上绘制所有智能体的奖励曲线(包括追捕者和逃避者)""" 44 | df = pd.read_csv(csv_file) 45 | set_font_for_plot() 46 | 47 | # 打印CSV文件的列名,帮助调试 48 | print(f"CSV文件列名: {df.columns.tolist()}") 49 | 50 | # 获取数据点数量,动态调整窗口大小 51 | data_points = len(df) 52 | print(f"数据点数量: {data_points}") 53 | 54 | # 如果数据点数量小于窗口大小,则调整窗口大小为数据点数量的一半 55 | if data_points < window_size: 56 | window_size = max(2, data_points // 2) # 确保窗口大小至少为2 57 | print(f"数据点不足,调整窗口大小为: {window_size}") 58 | 59 | # 从CSV文件名中提取时间戳 60 | base_name = os.path.basename(csv_file) 61 | if 'rewards_' in base_name and '.csv' in base_name: 62 | timestamp = base_name.replace('rewards_', '').replace('.csv', '') 63 | else: 64 | timestamp = '' 65 | 66 | # 创建一个包含两个子图的图形 67 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) 68 | 69 | # 第一个子图:所有智能体的奖励曲线 70 | # 修改:适配CSV文件列名 71 | adversary_col = 'Adversary_Mean' if 'Adversary_Mean' in df.columns else 'Adversary_Mean_Reward' 72 | agent_columns = [col for col in df.columns if col not in ['Episode', adversary_col]] 73 | colors = plt.cm.tab10(np.linspace(0, 1, len(agent_columns))) 74 | 75 | # 绘制每个智能体的奖励曲线 76 | for agent, color in zip(agent_columns, colors): 77 | # 原始数据(半透明) 78 | ax1.plot(df['Episode'], df[agent], color=color, alpha=0.2, label=f'{agent} (原始)') 79 | # 移动平均 
80 | ma_data = moving_average(df[agent].values, window_size) 81 | # 确保x轴和y轴数据长度匹配 82 | x_data = df['Episode'][window_size-1:window_size-1+len(ma_data)] 83 | ax1.plot(x_data, ma_data, 84 | color=color, linewidth=2, label=f'{agent} (移动平均)') 85 | 86 | ax1.set_title('所有智能体奖励曲线') 87 | ax1.set_xlabel('回合数') 88 | ax1.set_ylabel('奖励') 89 | ax1.grid(True, linestyle='--', alpha=0.7) 90 | ax1.legend() 91 | 92 | # 第二个子图:追捕者平均奖励 93 | # 修改:适配CSV文件列名 94 | # 原始数据(半透明) 95 | ax2.plot(df['Episode'], df[adversary_col], 96 | 'gray', alpha=0.2, label='原始数据') 97 | # 移动平均 98 | adv_ma = moving_average(df[adversary_col].values, window_size) 99 | # 确保x轴和y轴数据长度匹配 100 | x_data = df['Episode'][window_size-1:window_size-1+len(adv_ma)] 101 | ax2.plot(x_data, adv_ma, 102 | 'r-', linewidth=2, label='移动平均') 103 | 104 | ax2.set_title('追捕者平均奖励趋势') 105 | ax2.set_xlabel('回合数') 106 | ax2.set_ylabel('平均奖励') 107 | ax2.grid(True, linestyle='--', alpha=0.7) 108 | ax2.legend() 109 | 110 | # 调整子图之间的间距 111 | plt.tight_layout() 112 | 113 | # 保存图片 114 | if timestamp: 115 | save_path = os.path.join(os.path.dirname(csv_file), f'training_rewards_{timestamp}.png') 116 | else: 117 | save_path = os.path.join(os.path.dirname(csv_file), 'training_rewards.png') 118 | plt.savefig(save_path, dpi=300, bbox_inches='tight') 119 | print(f"训练奖励图像已保存至: {save_path}") 120 | plt.close() 121 | 122 | if __name__ == "__main__": 123 | # 修改:指定具体的CSV文件名 124 | csv_file = os.path.join(os.path.dirname(__file__), 'xxxx.csv') # 替换为你的CSV文件名 125 | print("csv_file name:", csv_file) 126 | 127 | if os.path.exists(csv_file): 128 | plot_all_rewards(csv_file) 129 | else: 130 | print(f"错误:未找到CSV文件:{csv_file},请检查路径及文件名是否正确!") -------------------------------------------------------------------------------- /MATD3_Continous/plot/training_rewards_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/MATD3_Continous/plot/training_rewards_demo.png -------------------------------------------------------------------------------- /MATD3_Continous/readme.md: -------------------------------------------------------------------------------- 1 | [🇨🇳 中文文档](readme.md) | [🇺🇸 English](readme_en.md) 2 | 3 | # 多智能体深度强化学习MATD3算法 - Predator-Prey追逃博弈 4 | 5 | >**本项目专为Predator-Prey追逃博弈任务优化!** 基于TD3算法的多智能体扩展版本(MATD3:Twin Delayed Deep Deterministic Policy Gradient),在`PettingZoo MPE`环境基础上重构修改,提供了完整的多智能体协作与对抗环境,专注于连续动作空间的多智能体协作与对抗任务;适用于围捕控制、群体智能和策略博弈研究. 6 | 7 | > MATD3算法优势:相比MADDPG,通过双Q网络和目标策略平滑机制有效解决过估计问题,提供更稳定的训练和更优的策略。 8 | 9 | > Reference: https://github.com/wild-firefox/FreeRL/blob/main/MADDPG_file/MATD3_simple.py 10 | 11 | ## 📈 训练效果 12 |
13 | ![训练收敛结果](plot/training_rewards_demo.png)
14 | *MATD3算法在simple_tag_v3环境中的奖励收敛曲线*
16 | 17 | > **⚠️ 重要提示**:使用前请查看🔍 [**已知问题与解决方案KNOWN_ISSUES.md**](KNOWN_ISSUES.md)文档,了解常见问题的解决方法,特别是Windows系统的渲染卡死问题和PettingZoo版本兼容性问题。 18 | 19 | > **奖励函数优化**:官方的奖励配置无法训练出良好的围捕行为,本项目专门优化了追捕者的奖励函数,实现更高效的协作围捕 20 | 21 | ## 🚀 实现进度 22 | | 算法 | 状态 | 位置 | 核心组件 | 23 | |--------------|--------|-------------------|----------------------------------| 24 | | MATD3 | ✅ 1.0 | `agents/` | MATD3_agent, buffer, networks | 25 | 26 | 27 | ## 项目结构 28 | 29 | ```tree 30 | MATD3_Continous/ 31 | ├── agents/ # 智能体算法实现 32 | │ ├── buffer.py # 经验回放缓冲区 33 | │ ├── MATD3_agent.py # MATD3智能体控制器 34 | │ ├── MATD3_runner.py # 训练与评估运行器 35 | │ ├── NN_actor_td3.py # Actor网络结构 36 | │ ├── NN_critic_td3.py # Critic网络结构(双Q网络) 37 | │ └── TD3_agent.py # 基础TD3实现 38 | ├── envs/ # 环境实现 39 | │ ├── custom_agents_dynamics.py # 自定义智能体动力学 40 | │ └── simple_tag_env.py # 修改版追逃环境 41 | ├── main/ # 主程序脚本 42 | │ ├── main_evaluate.py # 评估脚本 43 | │ ├── main_parameters.py # 参数配置 44 | │ └── main_train.py # 训练入口 45 | ├── plot/ # 数据可视化 46 | │ ├── matd3_data/ # 训练数据存储 47 | │ ├── plot_rewards.py # 奖励绘图脚本 48 | │ ├── README.md # 绘图说明 49 | │ └── training_rewards_demo.png # 样例训练曲线 50 | ├── logs/ # 日志文件 51 | │ └── log_td3_main/ # TD3训练日志 52 | └── utils/ # 工具函数 53 | ├── conda-environment.yml # Conda环境配置(Windows和Intel芯片的macOS) 54 | ├── linux_environment.yml # Linux环境配置 55 | ├── logger.py # 日志工具 56 | ├── mac_arm_M4_environment.yml # Mac M系列芯片环境配置 57 | ├── pip-requirements.txt # pip依赖 58 | ├── pip-requirements_mac_arm_M4.txt # Mac M系列芯片专用依赖 59 | └── setupPettingzoo.py # PettingZoo环境设置 60 | ``` 61 | 62 | ## 环境说明 63 | 64 | 本项目基于 PettingZoo 的 MPE (Multi-Particle Environment) 环境,主要实现了 simple_tag 追逐逃避任务: 65 | 66 | - **追捕者 (Adversaries)**: 多个追捕者协作追捕逃避者 67 | - **逃避者 (Good Agents)**: 尝试逃离追捕者 68 | 69 | 环境特点: 70 | - 连续动作空间 71 | - 部分可观测状态 72 | - 多智能体协作与对抗 73 | 74 | ## 算法实现 75 | 76 | 项目实现了 MATD3 (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) 算法,这是 TD3 算法的多智能体扩展版本,主要特点: 77 | 78 | - 双重 Q 网络减少过估计 79 | - 延迟策略更新 80 | - 目标策略平滑正则化 81 | - 集中式训练,分布式执行 (CTDE) 范式 82 | 83 | 84 | ## 🛠️ 快速开始 85 | 86 | ### 环境配置 87 | 88 | > 相关配置需求在utils/文件夹下。 89 | 90 | ### Linux环境(ubuntu) 91 | 1. 使用linux_environment.yml创建新环境 92 | ```bash 93 | # 注意:将"MPE"替换为您喜欢的环境名称 94 | conda env create -f utils/linux_environment.yml -n MPE 95 | # 激活刚创建的环境 96 | conda activate MPE 97 | ``` 98 | 2. pip安装核心依赖 99 | ```bash 100 | pip install -r utils/pip-requirements.txt 101 | ``` 102 | ### Mac M系列芯片环境 103 | 1. 使用mac_arm_M4_environment.yml创建新conda环境 104 | ```bash 105 | # 注意:将"MPE"替换为您喜欢的环境名称 106 | conda env create -f utils/mac_arm_M4_environment.yml -n MPE 107 | # 激活刚创建的环境 108 | conda activate MPE 109 | ``` 110 | 2. pip安装Mac M芯片专用依赖 111 | ```bash 112 | pip install -r utils/pip-requirements_mac_arm_M4.txt 113 | ``` 114 | 115 | ### Windows创建并激活虚拟环境(推荐) 116 | 1. 使用conda-environment.yml创建新环境 117 | ```bash 118 | # 注意:将"MPE"替换为您喜欢的环境名称 119 | conda env create -f utils/conda-environment.yml -n MPE 120 | # 激活刚创建的环境 121 | conda activate MPE 122 | ``` 123 | 2. pip安装核心依赖 124 | ```bash 125 | pip install -r utils/pip-requirements.txt 126 | ``` 127 | ### 手动安装依赖 128 | > 上述虚拟环境创建成功后,您需要手动安装以下依赖: 129 | 3. 从PyTorch官网安装对应版本的PyTorch 130 | ```bash 131 | # 请访问 https://pytorch.org 选择适合您系统的安装命令 132 | # 例如: 133 | pip3 install torch torchvision torchaudio 134 | ``` 135 | 136 | 4. 2025.4.26 update: 安装`PettingZoo 1.25.0`版本,官方PyPI仓库最新版本更新为为1.25.0,内容与1.24.4相同。MPE被拆分出PettingZoo, **警告可忽略**,`MPE2`详情可见:https://github.com/Farama-Foundation/MPE2 137 | ```bash 138 | pip install pettingzoo==1.25.0 139 | ``` 140 | 141 | 4. 
~~安装PettingZoo 1.24.4版本~~ 142 | ```bash 143 | # 重要说明:本项目需要PettingZoo 1.24.4版本,但官方PyPI仓库最新版本仅为1.24.3 144 | # 必须从GitHub源码安装才能获取1.24.4版本,安装命令为: 145 | # pip install "pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git" 146 | 或者,您可以直接运行提供的安装脚本安装pettingzoo1.25.0: 147 | python utils/setupPettingzoo.py 148 | ``` 149 | 150 | ### 🖥️ 运行配置 151 | > **注意:** 当前版本采用本地数据存储模式,无需额外配置可视化服务器。训练数据将保存在plot/matd3_data/目录下。 152 | 153 | ## 🔄 训练流程 154 | 1. **参数配置** 155 | 在 `main_parameters.py` 中设置环境和算法参数: 156 | ```python 157 | env_name = 'simple_tag_v3' # 可选:simple_adversary_v3/simple_spread_v3 158 | episode_num = 5000 # 总训练回合数 159 | # 训练参数 160 | batch_size = 128 # 经验回放批次大小 161 | actor_lr = 0.0002 # Actor网络学习率 162 | critic_lr = 0.002 # Critic网络学习率 163 | ``` 164 | 165 | 2. **运行训练脚本** 166 | ```bash 167 | # 使用默认参数训练 168 | cd main 169 | python main_train.py 170 | ``` 171 | 172 | 3. **查看训练进度** 173 | 训练数据将实时保存到CSV文件中,可使用plot_rewards.py脚本进行可视化: 174 | ```bash 175 | python plot/plot_rewards.py 176 | ``` 177 | 178 | 4. **评估训练模型** 179 | ```bash 180 | # 渲染训练好的模型策略 181 | cd main 182 | python main_evaluate.py 183 | ``` 184 | 185 | ### 🌐 环境特性与优化 186 | 本项目基于PettingZoo的MPE环境进行了大量优化: 187 | 188 | - **TD3增强的策略稳定性**: 相比MADDPG,MATD3通过双Q网络和目标策略平滑有效解决过估计问题 189 | - **围捕行为的奖励优化**: 通过精心设计的奖励函数,实现更具协作性的围捕策略 190 | - **物理参数优化**: 191 | - 世界大小:2.5单位(可根据追逃需求自定义) 192 | - 时间步长:0.1秒(影响动作响应速度) 193 | - 阻尼系数:0.2(影响智能体的惯性) 194 | 195 | #### 🌟 MATD3 vs MADDPG 196 | MATD3对标准MADDPG进行了以下关键增强: 197 | 198 | 1. **双Q网络设计**: 减少对动作值的过估计 199 | 2. **延迟策略更新**: 提高训练稳定性 200 | 3. **目标策略平滑**: 通过在目标动作中加入噪声防止过拟合 201 | 4. **自适应噪声调整**: 根据训练进度动态调整探索噪声 202 | 203 | 这些优化使MATD3在追逃博弈场景中展现出更强大的性能和更快的收敛速度。 204 | 205 | ## 📦 数据管理 206 | ### 模型存储 207 | 训练模型自动保存在: 208 | ```tree 209 | ./main/models/ 210 | └── matd3_models/ # MATD3检查点目录 211 | ├── {timestamp}_agent_0_actor.pth # Actor网络参数 212 | ├── {timestamp}_agent_0_critic_1.pth # 第一个Critic网络参数 213 | ├── {timestamp}_agent_0_critic_2.pth # 第二个Critic网络参数 214 | └── ... # 其他智能体网络 215 | ``` 216 | 217 | ### 可视化系统 218 | 训练指标可视化: 219 | ```tree 220 | plot/ 221 | ├── matd3_data/ # 训练数据存储 222 | │ └── rewards_{timestamp}.csv # CSV格式奖励记录 223 | └── plot_rewards.py # 可视化工具 224 | ``` 225 | 226 | ## 🤝 贡献 227 | 本项目的主要贡献在于: 228 | - TD3算法在多智能体场景下的扩展与优化 229 | - 针对Predator-Prey追逃博弈任务的环境适配与优化 230 | - 改进的奖励函数设计,实现高效的围捕协作行为 231 | - 稳定的训练框架,支持各种复杂追逃场景 232 | 233 | 如遇到任何问题,欢迎提交Issue或Pull Request。若您有兴趣扩展更多追逃博弈场景或改进算法,我们欢迎您的贡献! 
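## 📝 附录:MATD3 核心更新示意

为帮助理解上文提到的"双Q网络 + 目标策略平滑 + 延迟策略更新",下面给出一段最小化的示意代码:以单智能体 TD3 的形式演示目标Q值的计算流程。其中的函数名、噪声超参数(`policy_noise`、`noise_clip`)以及 Dummy 网络均为假设,并非 `agents/MATD3_agent.py` 的实际实现;在 MATD3 中 critic 的输入是全局观测与所有智能体动作的拼接,此处为简洁起见省略。

```python
import torch

def td3_target(critic_target, actor_target, reward, next_obs, done,
               gamma=0.9, policy_noise=0.2, noise_clip=0.5):
    """计算带目标策略平滑的截断双Q目标值(TD3核心思想示意,非仓库源码)"""
    with torch.no_grad():
        # 1) 目标策略平滑:在目标动作上加入截断噪声,防止critic过拟合到尖锐的Q值峰
        #    policy_noise、noise_clip 为示意用的假设超参数
        next_action = actor_target(next_obs)
        noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-1.0, 1.0)

        # 2) 截断双Q:取两个目标critic输出的较小者,缓解过估计
        sa = torch.cat([next_obs, next_action], dim=1)
        q1_next, q2_next = critic_target(sa)
        q_next = torch.min(q1_next, q2_next).squeeze(-1)

        # 3) TD目标:r + gamma * (1 - done) * min(Q1', Q2')
        target_q = reward + gamma * (1.0 - done) * q_next
    return target_q


if __name__ == "__main__":
    # 仅用随机张量演示调用方式(obs_dim=10、act_dim=5、batch=4 均为假设值)
    class DummyActor(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(10, 5)

        def forward(self, x):
            return torch.tanh(self.fc(x))  # 与仓库actor一致:tanh将动作压到[-1, 1]

    class DummyCritic(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.q1 = torch.nn.Linear(15, 1)
            self.q2 = torch.nn.Linear(15, 1)

        def forward(self, sa):
            return self.q1(sa), self.q2(sa)  # 返回两个Q值,对应双Q网络结构

    y = td3_target(DummyCritic(), DummyActor(),
                   reward=torch.zeros(4), next_obs=torch.randn(4, 10), done=torch.zeros(4))
    print(y.shape)  # torch.Size([4])
```

延迟策略更新与软更新未在上述示意中展开:实际训练时通常每进行若干次 critic 更新后才更新一次 actor,并用 `--tau`(见 `main/main_parameters.py`)对目标网络做软更新。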
-------------------------------------------------------------------------------- /MATD3_Continous/utils/conda-environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmpy2=2.1.2 16 | - intel-openmp=2023.1.0 17 | - jinja2=3.1.4 18 | - jpeg=9e 19 | - lcms2=2.16 20 | - lerc=4.0.0 21 | - libdeflate=1.22 22 | - libffi=3.4.4 23 | - libjpeg-turbo=2.0.0 24 | - libpng=1.6.39 25 | - libtiff=4.5.1 26 | - libwebp-base=1.3.2 27 | - llvm-openmp=14.0.6 28 | - lz4-c=1.9.4 29 | - markupsafe=2.1.3 30 | - mkl=2023.1.0 31 | - mkl-service=2.4.0 32 | - mkl_fft=1.3.8 33 | - mkl_random=1.2.4 34 | - mpc=1.1.0 35 | - mpfr=4.0.2 36 | - mpmath=1.3.0 37 | - numpy=1.26.4 38 | - numpy-base=1.26.4 39 | - openjpeg=2.5.2 40 | - openssl=3.0.16 41 | - pip=24.2 42 | - pybind11-abi=4 43 | - pysocks=1.7.1 44 | - python=3.11.8 45 | - pyyaml=6.0.2 46 | - requests=2.32.3 47 | - setuptools=75.1.0 48 | - sqlite=3.45.3 49 | - sympy=1.13.3 50 | - tbb=2021.8.0 51 | - tk=8.6.14 52 | - typing_extensions=4.12.2 53 | - wheel=0.44.0 54 | - xz=5.4.6 55 | - yaml=0.2.5 56 | - zlib=1.2.13 57 | - zstd=1.5.6 58 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 59 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/linux_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | - mkl=2023.1.0 35 | - mkl-service=2.4.0 36 | - mkl_fft=1.3.8 37 | - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | - pip: 67 | - charset-normalizer==3.4.1 68 | - cloudpickle==3.1.0 69 | - contourpy==1.3.1 70 | - cycler==0.12.1 71 | - farama-notifications==0.0.4 72 | - fonttools==4.55.3 73 | - gymnasium==1.0.0 74 | - idna==3.10 75 | - jsonpatch==1.33 76 | - jsonpointer==3.0.0 77 | - kiwisolver==1.4.8 78 | - matplotlib==3.8.3 79 | - networkx==3.4.2 80 | - numpy==2.2.1 81 | - packaging==24.2 82 | - pandas==2.2.1 83 | - pettingzoo==1.24.4 
84 | - pillow==11.1.0 85 | - pip-chill==1.0.3 86 | - pygame==2.6.1 87 | - pyparsing==3.2.1 88 | - python-dateutil==2.9.0.post0 89 | - pytz==2025.1 90 | - scipy==1.15.0 91 | - six==1.17.0 92 | - tornado==6.4.2 93 | - tzdata==2025.1 94 | - urllib3==2.3.0 95 | - visdom==0.2.4 96 | - websocket-client==1.8.0 97 | prefix: /Users/ronchy2000/DevelopEnv/anaconda3/envs/MARL 98 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/mac_arm_M4_environment.yml: -------------------------------------------------------------------------------- 1 | name: MARL 2 | channels: 3 | - pytorch 4 | - defaults 5 | - https://repo.anaconda.com/pkgs/main 6 | - https://repo.anaconda.com/pkgs/r 7 | dependencies: 8 | - blas=1.0 9 | - brotli-python=1.0.9 10 | - bzip2=1.0.8 11 | - ca-certificates=2025.2.25 12 | - certifi=2025.1.31 13 | - filelock=3.13.1 14 | - freetype=2.12.1 15 | - gmp=6.2.1 16 | - gmpy2=2.1.2 17 | # - intel-openmp=2023.1.0 18 | - jinja2=3.1.4 19 | - jpeg=9e 20 | - lcms2=2.16 21 | - lerc=4.0.0 22 | - libcxx=14.0.6 23 | - libdeflate=1.22 24 | - libffi=3.4.4 25 | - libgfortran=5.0.0 26 | - libgfortran5=11.3.0 27 | - libjpeg-turbo=2.0.0 28 | - libpng=1.6.39 29 | - libtiff=4.5.1 30 | - libwebp-base=1.3.2 31 | - llvm-openmp=14.0.6 32 | - lz4-c=1.9.4 33 | - markupsafe=2.1.3 34 | # - mkl=2023.1.0 35 | # - mkl-service=2.4.0 36 | # - mkl_fft=1.3.8 37 | # - mkl_random=1.2.4 38 | - mpc=1.1.0 39 | - mpfr=4.0.2 40 | - mpmath=1.3.0 41 | - ncurses=6.4 42 | - numpy-base=1.26.4 43 | - openjpeg=2.5.2 44 | - openssl=3.0.16 45 | - pip=24.2 46 | - pybind11-abi=4 47 | - pysocks=1.7.1 48 | - python=3.11.8 49 | - pytorch=2.2.2 50 | - pyyaml=6.0.2 51 | - readline=8.2 52 | - requests=2.32.3 53 | - setuptools=75.1.0 54 | - sqlite=3.45.3 55 | - sympy=1.13.3 56 | - tbb=2021.8.0 57 | - tk=8.6.14 58 | - torchaudio=2.2.2 59 | - torchvision=0.17.2 60 | - typing_extensions=4.12.2 61 | - wheel=0.44.0 62 | - xz=5.4.6 63 | - yaml=0.2.5 64 | - zlib=1.2.13 65 | - zstd=1.5.6 66 | 67 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/pip-requirements.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.0.0 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | mkl-fft==1.3.8 9 | mkl-random==1.2.4 10 | pandas==2.2.1 11 | pip-chill==1.0.3 12 | platformdirs==4.2.2 13 | pygame==2.6.1 14 | pysocks==1.7.1 15 | pyyaml==6.0.2 16 | tomli==2.0.1 17 | visdom==0.2.4 18 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/pip-requirements_mac_arm_M4.txt: -------------------------------------------------------------------------------- 1 | brotli==1.0.9 2 | gmpy2==2.1.2 3 | gymnasium==1.1.1 4 | importlib-metadata==8.0.0 5 | importlib-resources==6.4.0 6 | jaraco.collections==5.1.0 7 | matplotlib==3.8.3 8 | pandas==2.2.1 9 | pip-chill==1.0.3 10 | platformdirs==4.2.2 11 | pygame==2.6.1 12 | pysocks==1.7.1 13 | pyyaml==6.0.2 14 | tomli==2.0.1 15 | visdom==0.2.4 16 | -------------------------------------------------------------------------------- /MATD3_Continous/utils/setupPettingzoo.py: -------------------------------------------------------------------------------- 1 | # 使用 sys.executable 获取当前虚拟环境的 pip,这样它会始终使用当前虚拟环境的 pip 安装包,而不是系统环境的 pip 2 | import pkg_resources 3 | import sys 4 | import platform 5 | from subprocess import call 6 | 7 | def 
check_and_install_pettingzoo(): 8 | # 打印当前虚拟环境的相关信息 9 | print("================================") 10 | print(f"Current Python executable: {sys.executable}") 11 | print(f"Python version: {sys.version}") 12 | print(f"Current virtual environment: {sys.prefix}") 13 | print(f"Platform: {platform.system()} {platform.release()}") 14 | print("================================") 15 | 16 | try: 17 | # 检查 pettingzoo 是否已经安装 18 | pkg_resources.get_distribution("pettingzoo") 19 | print("================================") 20 | print("pettingzoo is already installed.") 21 | print("================================") 22 | except pkg_resources.DistributionNotFound: 23 | # 如果 pettingzoo 没有安装,执行安装操作 24 | print("================================") 25 | print("pettingzoo is not installed. Installing pettingzoo...") 26 | print("================================") 27 | 28 | # 获取当前虚拟环境的 Python 解释器路径 29 | python_executable = sys.executable 30 | pip_executable = python_executable.replace("python", "pip") # 获取 pip 路径 31 | 32 | # 尝试安装 pettingzoo==1.24.4 33 | try: 34 | print("Attempting to install pettingzoo==1.24.4...") 35 | result = call([pip_executable, "install", "pettingzoo==1.24.4"]) 36 | if result == 0: 37 | print("================================") 38 | print("Successfully installed pettingzoo==1.24.4") 39 | print("================================") 40 | else: 41 | print("Installation of pettingzoo==1.24.4 failed. Trying GitHub installation...") 42 | # 如果安装失败,尝试从 GitHub 安装 43 | try: 44 | result = call([pip_executable, "install", "\"pettingzoo[mpe] @ git+https://github.com/Farama-Foundation/PettingZoo.git\""]) 45 | if result == 0: 46 | print("================================") 47 | print("Successfully installed pettingzoo from GitHub.") 48 | print("================================") 49 | else: 50 | print("GitHub installation failed. 
Please check the error above.") 51 | except Exception as e: 52 | print(f"Failed to install pettingzoo from GitHub: {e}") 53 | print("================================") 54 | print("Please manually install pettingzoo or check the error above.") 55 | except Exception as e: 56 | print(f"Failed to install pettingzoo==1.24.4: {e}") 57 | print("Attempting to install pettingzoo from GitHub...") 58 | 59 | if __name__ == "__main__": 60 | check_and_install_pettingzoo() 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 强化学习与多智能体强化学习项目集 2 | [ 🇺🇸 English](./README_en.md) | 🇨🇳 中文文档 3 | 4 | ![项目总状态](https://img.shields.io/badge/状态-维护模式-blue) ![Python](https://img.shields.io/badge/Python-3.11.8%2B-blue) ![强化学习](https://img.shields.io/badge/强化学习-基础到高级-orange) ![多智能体](https://img.shields.io/badge/多智能体-MADDPG实现-success) 5 | 6 | 本仓库包含强化学习(RL)和多智能体强化学习(MARL)相关的多个项目,既有经典算法的复现,也有个人的研究实现。通过这些项目,我希望构建从基础强化学习到多智能体强化学习的完整学习路径。 7 | 8 | | 项目 | 状态 | 完成度 | 技术栈 | 文档索引 | 9 | |------|------|--------|--------|----------| 10 | | [RL_Learning-main](./RL_Learning-main/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-90%25-green) | ![技术](https://img.shields.io/badge/技术-基础RL算法-blue) | [已实现算法](./RL_Learning-main/README.md#已实现算法) | 11 | | [动手学强化学习](./动手学强化学习/) | ![状态](https://img.shields.io/badge/状态-参考实现-informational) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-DQN到DDPG-blue) | [README](./动手学强化学习/README.md) | 12 | | [MADDPG_Continous](./MADDPG_Continous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-连续MADDPG-blue) | [中文文档](./MADDPG_Continous/README.md#项目特色) | 13 | | [MATD3_Continous](./MATD3_Continous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-连续MATD3-blue) | [中文文档](./MATD3_Continous/readme.md) | 14 | 15 | 16 | ## 学习路径与项目关联 17 | 本仓库中的项目构成了一条从基础强化学习到多智能体强化学习的完整学习路径: 18 | 19 | 1. **基础理论与算法** (RL_Learning-main):掌握强化学习的数学基础和基本算法 20 | 2. **基础算法实现** (动手学强化学习):动手实现基础强化学习算法 21 | 4. **多智能体扩展** (MADDPG_Continous, MATD3_Continous):将单智能体算法扩展到多智能体场景 22 | 23 | ## 项目结构 24 | ### RL_Learning-main:强化学习基础代码复现 25 | 26 | 复现西湖大学**赵世钰老师**的强化学习课程代码,包括值迭代、策略迭代、蒙特卡洛、时序差分、DQN、Reinforce等算法实现。这部分是理解强化学习基础算法的最佳起点。 27 | 28 |
29 | 策略迭代可视化 30 | 值迭代可视化 31 |

从左到右: 策略迭代算法、值迭代算法可视化

32 |
33 | 34 | #### 参考资源 35 | - [赵老师强化学习课程](https://www.bilibili.com/video/BV1sd4y167NS) 36 | - [强化学习的数学原理](https://github.com/MathFoundationRL/Book-Mathematical-Foundation-of-Reinforcement-Learning) 37 | #### 代码位置 [`赵老师强化学习代码仓库: ./RL_Learning-main`](./RL_Learning-main/scripts) 38 | 39 | #### 更新日志 40 | 41 | **2024.6.7** 42 | 重大更新! 原作者render坐标与state设置不一致。坐标已统一修改为: 43 | ![img.png](img.png) 44 | > 原始代码来源: https://github.com/jwk1rose/RL_Learning 45 | > 本人正在重构代码,尽量分解为更多独立模块并添加详细注释。 46 | >Refactoring the code of jwk1rose,I'm trying to divide it into as many sections as possible and write comments. 47 | 48 | --- 49 | ### 二、动手学强化学习 50 | 《动手学强化学习》书籍代码的复现与扩展,最终目标是扩展到MADDPG。这部分是我系统学习强化学习的记录,从基础算法到高级算法的实现。 51 | #### 实现算法 52 | - DQN (Deep Q-Network) 53 | - Policy Gradient (REINFORCE) 54 | - Actor-Critic 55 | - DDPG (Deep Deterministic Policy Gradient) 56 | #### 学习路径 57 | 这部分展示了从基础DQN到DDPG,再到MADDPG的学习路径,是理解多智能体强化学习的基础铺垫。 58 | #### 代码位置 [`./动手学强化学习`](./动手学强化学习/) 59 | 60 | #### 参考资源 61 | - [动手学强化学习](https://hrl.boyuai.com/chapter/2/dqn%E7%AE%97%E6%B3%95) 62 | - [HandsOnRL GitHub](https://github.com/peterwu4084/HandsOnRL/tree/main) 63 | 64 | --- 65 | ### 三、多智能体强化学习实现 66 | > **本项目专为Predator-Prey追逃博弈任务优化!** 在`PettingZoo MPE`环境基础上重构修改,提供了完整的多智能体协作与对抗环境,适用于围捕控制、群体智能和策略博弈研究。 67 | 68 | 在掌握了基础强化学习算法后,我们自然会思考:如何将这些方法扩展到多个智能体同时学习的场景?多智能体强化学习(MARL)正是解决这一问题的关键技术。以下是我在MARL领域的两个主要实现。 69 | 70 | #### 3.1 MADDPG_Continous:多智能体深度确定性策略梯度算法 71 | 72 | 73 | 个人基于最新版**Pettingzoo**`(pettingzoo==1.25.0)`中的MPE环境,实现的连续状态,连续动作下的MADDPG算法,支持连续动作空间的多智能体协作与竞争。 74 | 75 | > MADDPG algorithm Reference: https://github.com/Git-123-Hub/maddpg-pettingzoo-pytorch 76 | 77 |
78 | 智能体行为 79 |

训练后的智能体行为展示:捕食者(红色)追逐猎物(绿色)的过程

80 | 81 | 训练收敛结果 82 |

MADDPG算法在simple_tag_v3环境中的奖励收敛曲线

83 |
84 | 85 | 86 | #### 实现进度 87 | | 算法 | 状态 | 位置 | 核心组件 | 88 | |----------------|--------|----------------------|----------------------------------| 89 | | MADDPG | ✅ 1.0 | `agents/maddpg/` | MADDPG_agent, DDPG_agent, buffer | 90 | | Independent RL | ⏳ 待完成 | `agents/independent/`| IndependentRL (计划中) | 91 | | Centralized RL | ⏳ 待完成 | `agents/centralized/`| CentralizedRL (计划中) | 92 | #### 代码位置 [`./MADDPG_Continous`](./MADDPG_Continous) 93 | 94 | 95 | #### 3.2 MATD3_Continous:多智能体双延迟深度确定性策略梯度算法 96 | 97 | 基于TD3算法的多智能体扩展版本(MATD3: Twin Delayed Deep Deterministic Policy Gradient),相比MADDPG,通过双Q网络和目标策略平滑机制有效解决过估计问题,提供更稳定的训练和更优的策略。 98 | 99 | > MATD3 algorithm Reference: https://github.com/wild-firefox/FreeRL/blob/main/MADDPG_file/MATD3_simple.py 100 | 101 |
102 | 训练收敛结果 103 |

MATD3算法在simple_tag_env环境中的奖励收敛曲线

104 |
105 | 106 | #### MATD3 vs MADDPG 107 | MATD3对标准MADDPG进行了以下关键增强: 108 | 109 | 1. **双Q网络设计**: 减少对动作值的过估计 110 | 2. **延迟策略更新**: 提高训练稳定性 111 | 3. **目标策略平滑**: 通过在目标动作中加入噪声防止过拟合 112 | 4. **自适应噪声调整**: 根据训练进度动态调整探索噪声 113 | 114 | #### 代码位置 [`./MATD3_Continous`](./MATD3_Continous) 115 | 116 | 117 | 118 | ## 进行中的项目 119 | - **MARL**: 基于深度强化学习的多智能体协作与协调 120 | - 探索不同通信机制对多智能体协作的影响 121 | - 研究异构智能体在复杂环境中的协作策略 122 | 123 | - **图上的多智能体协调与决策** 124 | - 将多智能体强化学习与图神经网络结合 125 | - 研究大规模图结构上的多智能体协调问题 126 | - **多智能体强化学习的应用** 127 | - 探索多智能体强化学习在工业、医疗等领域的应用 128 | - 研究多智能体强化学习在不同场景下的性能优化 129 | 130 | ## 联系方式 131 | 如有任何问题,请随时联系我。 132 | ronchy_lu AT 163 dot com 133 | 134 | Fight for MARL. 135 | 136 | 137 | 138 | ## Star History 139 | 140 | 141 | 142 | 143 | 144 | 145 | Star History Chart 146 | 147 | 148 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/1.[QAC]Simplest actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/1.[QAC]Simplest actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/2.[A2C]Advantage actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/2.[A2C]Advantage actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/3.1Importance sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | ''' 6 | There is no need sampling too much, because you can't tell the samples if you collect more than 1000 samples. 7 | That's catastrophe for ploting! 8 | 200 samples are enough. 9 | 2024.10.4 10 | ''' 11 | class Importance_sampling: 12 | def __init__(self,p0_probability, p1_probability): 13 | self.seed = 42 14 | self.p0_probability = p0_probability 15 | self.p1_probability = p1_probability 16 | self.p1_values = [1, -1] 17 | self.p1_samples = np.array([]) 18 | def sampling(self,sampling_size): 19 | # generate samples 20 | np.random.seed(self.seed) 21 | self.p1_samples = np.random.choice(self.p1_values, size=sampling_size, p=self.p1_probability) 22 | 23 | print("p1_samples::", self.p1_samples.shape) # 采样结果 向量。 24 | def calculate(self): 25 | if not self.p1_samples.size: #if p1_samples is empty, raise error. 
self.sample返回的不是一个bool值 26 | raise ValueError("Please generate p1_samples first") 27 | # 计算累积和 28 | cumulative_sum = np.cumsum(self.p1_samples) 29 | # 计算累积平均值 30 | p1_samples_average = cumulative_sum / np.arange(1, len(self.p1_samples) + 1) 31 | 32 | p_xi1 = np.where(self.p1_samples == -1, self.p1_probability[1], self.p1_probability[0]) 33 | p_xi0 = np.where(self.p1_samples == -1, self.p0_probability[1], self.p0_probability[0]) 34 | # 计算 35 | importance_p0_samples = (p_xi0/p_xi1) * self.p1_samples #Core,importance sampling 的体现 36 | 37 | 38 | cumulative_sum = np.cumsum(importance_p0_samples) 39 | cumulative_importance_p0_average = cumulative_sum / np.arange(1, len(importance_p0_samples) + 1) 40 | 41 | return p1_samples_average, cumulative_importance_p0_average 42 | def render(self,average_result, importance_sampling_result): 43 | plt.figure(figsize=(10, 6)) # set size of figure 44 | 45 | 46 | x1 = np.arange(len(self.p1_samples[:200])) 47 | y1 = self.p1_samples[:200] 48 | # plt.xlim(0, x.shape[0]) # adaptive is fine. 49 | plt.ylim(-2, 2) # set x,y range 50 | plt.plot(x1, y1, 'ro', markerfacecolor='none', label='p0_samples') 51 | 52 | y0 = average_result[:200] 53 | plt.plot(x1, y0, 'b.', label='average') 54 | 55 | y2 = importance_sampling_result[:200] 56 | plt.plot(x1, y2, 'g-', label='importance sampling') 57 | 58 | 59 | plt.xlabel('Sample index') 60 | # plt.ylabel() 61 | plt.legend() #图中带标签 62 | plt.show() 63 | 64 | 65 | 66 | if __name__ == '__main__': 67 | p0_probability = [0.5, 0.5] 68 | p1_probability = [0.8, 0.2] 69 | importance_sampling = Importance_sampling(p0_probability, p1_probability) #实例化 70 | 71 | importance_sampling.sampling(200) 72 | average_result, importance_sampling_result = importance_sampling.calculate() 73 | 74 | importance_sampling.render(average_result, importance_sampling_result) 75 | 76 | print("Done!") 77 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/3.[Importance sampling]Off-policy actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/3.[Importance sampling]Off-policy actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter10_Actor Critic/4.[DPG]Deterministic actor critic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter10_Actor Critic/4.[DPG]Deterministic actor critic.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/policy_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/policy_iteration.png -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/value_iteration.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/plot_figure/value_iteration.png -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter4_Value iteration and Policy iteration/value iteration.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from torch.utils import data 7 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 8 | 9 | # 引用上级目录 10 | import sys 11 | sys.path.append("..") 12 | import grid_env 13 | 14 | 15 | class class_value_iteration(): 16 | def __init__(self, env: grid_env.GridEnv): 17 | self.gama = 0.9 #discount rate 18 | self.env = env 19 | self.action_space_size = env.action_space_size 20 | self.state_space_size = env.size**2 #幂运算,grid world的尺寸 如 5 ** 2 = 25的网格世界。 21 | self.reward_space_size, self.reward_list = len(self.env.reward_list), self.env.reward_list #父类中:self.reward_list = [0, 1, -10, -10] 22 | #state_value 23 | self.state_value = np.zeros(shape=self.state_space_size) # 1维数组 24 | #action value -> Q-table 25 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 25 x 5 26 | 27 | self.mean_policy = np.ones(shape=(self.state_space_size, self.action_space_size)) / self.action_space_size 28 | self.policy = self.mean_policy.copy() 29 | self.writer = SummaryWriter("../logs") # 实例化SummaryWriter对象 30 | 31 | print("action_space_size: {} state_space_size:{}" .format(self.action_space_size ,self.state_space_size) ) 32 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape,self.qvalue.shape, self.mean_policy.shape)) 33 | print("\n分别是non-forbidden area, target area, forbidden area 以及撞墙:") 34 | print("self.reward_space_size:{},self.reward_list:{}".format(self.reward_space_size,self.reward_list)) 35 | print('----------------------------------------------------------------') 36 | 37 | def value_iteration_new(self, tolerance=0.001, steps=100): 38 | """ 39 | 迭代求解最优贝尔曼公式 得到 最优state value tolerance 和 steps 满足其一即可 40 | :param tolerance: 当 前后 state_value 的范数小于tolerance 则认为state_value 已经收敛 41 | :param steps: 当迭代次数大于step时 停止 建议将此变量设置大一些 42 | :return: 剩余迭代次数 43 | """ 44 | # 初始化 V0 为 1 45 | state_value_k = np.ones(self.state_space_size) 46 | while np.linalg.norm(state_value_k - self.state_value, ord=1)>tolerance and steps>0: 47 | steps -= 1 48 | self.state_value = state_value_k.copy() 49 | """ 50 | 是普通 policy_improvement 的变种 相当于是值迭代算法 也可以 供策略迭代使用 做策略迭代时不需要 接收第二个返回值 51 | 更新 qvalue ;qvalue[state,action]=reward+value[next_state] 52 | 找到 state 处的 action*:action* = arg max(qvalue[state,action]) 即最优action即最大qvalue对应的action 53 | 更新 policy :将 action*的概率设为1 其他action的概率设为0 这是一个greedy policy 54 | :param: state_value: policy对应的state value 55 | :return: improved policy, 以及迭代下一步的state_value 56 | """ 57 | # 方法初始化了一个新的策略 policy,所有状态的所有动作的概率都被设置为0 58 | policy = np.zeros(shape=(self.state_space_size, self.action_space_size)) 59 | #state_value_k = state_value_k.copy() 60 | #遍历所有的 state 61 | q_table = np.zeros(shape=(self.state_space_size, self.action_space_size)) 62 | for state in range(self.state_space_size): 63 | qvalue_list = [] 64 | #遍历所有的 action 65 | for action in range(self.action_space_size): 66 | # 计算qvalue,即acton value. 
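                    # 即 elementwise 公式: q_k(s,a) = Σ_i p(r_i|s,a)·r_i + γ·Σ_{s'} p(s'|s,a)·v_k(s')
                    # 其中 self.env.Rsa[state, action, i] 存储 p(r_i|s,a),self.env.Psa[state, action, next_state] 存储 p(s'|s,a)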
67 | """ 68 | 计算qvalue elementwise形式 69 | :param state: 对应的state 70 | :param action: 对应的action 71 | :param state_value: 状态值 72 | :return: 计算出的结果 73 | """ 74 | qvalue = 0 75 | for i in range(self.reward_space_size): 76 | # print("self.reward_list[i] * self.env.Rsa[state, action, i]:{}x{}={}".format(self.reward_list[i], self.env.Rsa[state, action, i],self.reward_list[i] * self.env.Rsa[state, action, i])) 77 | qvalue += self.reward_list[i] * self.env.Rsa[state, action, i] 78 | 79 | for next_state in range(self.state_space_size): 80 | qvalue += self.gama * self.env.Psa[state, action, next_state] * state_value_k[next_state] 81 | qvalue_list.append(qvalue) 82 | # print("qvalue_list:",qvalue_list) 83 | q_table[state,:] = qvalue_list.copy() 84 | 85 | state_value_k[state] = max(qvalue_list) #取该state 的最大state value 86 | action_star = qvalue_list.index(max(qvalue_list)) #取该state 的最大state value对应的action 87 | policy[state, action_star] = 1 #更新策略,贪婪算法 88 | print("q_table:{}".format(q_table)) 89 | self.policy = policy 90 | return steps 91 | 92 | 93 | def show_policy(self): 94 | for state in range(self.state_space_size): 95 | for action in range(self.action_space_size): 96 | policy = self.policy[state, action] 97 | self.env.render_.draw_action(pos=self.env.state2pos(state), 98 | toward=policy * 0.4 * self.env.action_to_direction[action], 99 | radius=policy * 0.1) 100 | 101 | def show_state_value(self, state_value, y_offset=0.2): 102 | for state in range(self.state_space_size): 103 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 104 | y_offset=y_offset, 105 | size_discount=0.7) 106 | 107 | def obtain_episode(self, policy, start_state, start_action, length): 108 | """ 109 | 110 | :param policy: 由指定策略产生episode 111 | :param start_state: 起始state 112 | :param start_action: 起始action 113 | :param length: episode 长度 114 | :return: 一个 state,action,reward,next_state,next_action 序列 115 | """ 116 | self.env.agent_location = self.env.state2pos(start_state) 117 | episode = [] 118 | next_action = start_action 119 | next_state = start_state 120 | while length > 0: 121 | length -= 1 122 | state = next_state 123 | action = next_action 124 | _, reward, done, _, _ = self.env.step(action) 125 | next_state = self.env.pos2state(self.env.agent_location) 126 | next_action = np.random.choice(np.arange(len(policy[next_state])), 127 | p=policy[next_state]) 128 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 129 | "next_action": next_action}) 130 | return episode 131 | 132 | 133 | 134 | if __name__ == "__main__": 135 | print("-----Begin!-----") 136 | gird_world2x2 = grid_env.GridEnv(size=3, target=[2, 2], 137 | forbidden=[[1, 0],[2,1]], 138 | render_mode='') 139 | 140 | solver = class_value_iteration(gird_world2x2) 141 | start_time = time.time() 142 | 143 | # 执行值迭代算法 144 | demand_step = 1000 145 | remaining_steps = solver.value_iteration_new(tolerance=0.1, steps=demand_step) 146 | if remaining_steps > 0: 147 | print("Value iteration converged in {} steps.".format(demand_step - remaining_steps)) 148 | else: 149 | print("Value iteration did not converge in 100 steps.") 150 | 151 | end_time = time.time() 152 | 153 | cost_time = end_time - start_time 154 | print("cost_time:{}".format(round(cost_time, 2))) 155 | print(len(gird_world2x2.render_.trajectory)) 156 | 157 | solver.show_policy() # solver.env.render() 158 | solver.show_state_value(solver.state_value, y_offset=0.25) 159 | 160 | 161 | gird_world2x2.render() 
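
# ----------------------------------------------------------------------
# 补充示例(假设性最小草图,非原仓库代码):value_iteration_new 的每一轮迭代等价于
# 对 Bellman 最优算子执行一次:
#     q_k(s,a)   = Σ_i p(r_i|s,a)·r_i + γ·Σ_{s'} p(s'|s,a)·v_k(s')
#     v_{k+1}(s) = max_a q_k(s,a),  π_{k+1}(a*|s) = 1 (贪婪策略)
# 下面用 numpy 向量化地演示一次完整 sweep;函数名与参数均为示意,复用文件顶部已导入的 np。
def _toy_value_iteration_sweep(Psa, Rsa, reward_list, v_k, gamma=0.9):
    """对给定模型执行一次值迭代 sweep,返回 (v_{k+1}, 每个状态的贪婪动作索引)。

    Psa: (S, A, S) 状态转移概率;Rsa: (S, A, R) 奖励概率;
    reward_list: 长度为 R 的奖励取值;v_k: (S,) 当前状态值向量。
    """
    rewards = np.asarray(reward_list, dtype=float)
    q = Rsa @ rewards + gamma * (Psa @ v_k)    # q[s, a],与类中双重循环的计算结果一致
    return q.max(axis=1), q.argmax(axis=1)     # v_{k+1}(s) 与贪婪动作 a*(s)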
-------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter5_Monte Carlo Methods/MC_Exploring_Starts.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 5 | 6 | # 引用上级目录 7 | import sys 8 | sys.path.append("..") 9 | import grid_env 10 | 11 | 12 | ''' 13 | MC Basic 是个Model free 的方法,与value iteration和 Policy iteration对比,数据是MC的必需品。 14 | 15 | 16 | ''' 17 | class MC_Exploring_Starts: 18 | def __init__(self, env = grid_env.GridEnv): 19 | self.gama = 0.9 #discount rate 20 | self.env = env 21 | self.action_space_size = env.action_space_size 22 | self.state_space_size = env.size ** 2 23 | self.reward_space_size, self.reward_list = len(self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 24 | self.state_value = np.zeros(shape=self.state_space_size) #一维列表 25 | print("self.state_value:",self.state_value) 26 | #Q表和policy 维数一样 27 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 28 | self.mean_policy = np.ones(shape=(self.state_space_size, self.action_space_size)) / self.action_space_size #平均策略,即取每个动作的概率均等 29 | self.policy = self.mean_policy.copy() 30 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 31 | 32 | print("action_space_size: {} state_space_size:{}" .format(self.action_space_size ,self.state_space_size) ) 33 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape,self.qvalue.shape, self.mean_policy.shape)) 34 | 35 | print('----------------------------------------------------------------') 36 | ''' 37 | 定义可视化grid world所需的函数 38 | def show_policy(self) 39 | def show_state_value(self, state_value, y_offset=0.2): 40 | def obtain_episode(self, policy, start_state, start_action, length): 41 | ''' 42 | def show_policy(self): 43 | for state in range(self.state_space_size): 44 | for action in range(self.action_space_size): 45 | policy = self.policy[state, action] 46 | self.env.render_.draw_action(pos=self.env.state2pos(state), 47 | toward=policy * 0.4 * self.env.action_to_direction[action], 48 | radius=policy * 0.1) 49 | 50 | def show_state_value(self, state_value, y_offset=0.2): 51 | for state in range(self.state_space_size): 52 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 53 | y_offset=y_offset, 54 | size_discount=0.7) 55 | 56 | def obtain_episode(self, policy, start_state, start_action, length): 57 | """ 58 | :param policy: 由指定策略产生episode 59 | :param start_state: 起始state 60 | :param start_action: 起始action 61 | :param length: 一个episode 长度 62 | :return: 一个 state,action,reward,next_state,next_action 列表,其中是字典格式 63 | """ 64 | self.env.agent_location = self.env.state2pos(start_state) 65 | episode = [] 66 | next_action = start_action 67 | next_state = start_state 68 | while length > 0: 69 | length -= 1 70 | state = next_state 71 | action = next_action 72 | _, reward, done, _, _ = self.env.step(action) 73 | next_state = self.env.pos2state(self.env.agent_location) 74 | next_action = np.random.choice(np.arange(len(policy[next_state])), #[0, len(policy[next_state]) 中随机抽一个随机数 75 | p=policy[next_state]) #p参数的例子: p=[0.1, 0.2, 0.3, 0.1, 0.3]的概率从 [0,1,2,3,4]这四个数中选取3个数 76 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 77 | "next_action": next_action}) #向列表中添加一个字典 78 | return 
episode 79 | 80 | def mc_exploring_starts_simple(self, length=50, epochs=10): 81 | """ 82 | :param length: 每一个 state-action 对的长度 83 | :return: 84 | """ 85 | for epoch in range(epochs): 86 | episode = self.obtain_episode(self.policy, state, action, length) # policy is mean policy 87 | 88 | for state in range(self.state_space_size): 89 | for action in range(self.action_space_size): 90 | episode = self.obtain_episode(self.policy, state, action, length) # policy is mean policy 91 | print("obtain_episode,type:,{}; {}".format(type(episode[0]), episode)) 92 | # Policy evaluation: 93 | sum_qvalue = 0 94 | for i in range(len(episode) - 1): 95 | sum_qvalue += self.gama**i * episode[i]['reward'] 96 | self.qvalue[state][action] = sum_qvalue 97 | 98 | # Policy improvement: 99 | max_index = np.argmax(self.qvalue[state]) # qvalue_star 100 | max_qvalue = np.max(self.qvalue[state]) #action_star 101 | 102 | 103 | def mc_exploring_starts_first_visit(self, length=10): 104 | time_start = time.time() 105 | # policy = self.mean_policy.copy() 106 | # policy = np.zeros(shape=(self.state_space_size, self.action_space_size)) 107 | policy = np.random.dirichlet(alpha=[1] * self.action_space_size, size = self.state_space_size) 108 | print("policy:",policy) 109 | # policy /= policy.sum(1) 110 | 111 | qvalue = self.qvalue.copy() 112 | returns = [[[0] for col in range(5)] for block in range(25)] 113 | # returns = [[]] 114 | print("returns:", returns) 115 | print("np.linalg.norm(policy - self.policy, ord=1) :",np.linalg.norm(policy - self.policy, ord=1) ) 116 | while np.linalg.norm(policy - self.policy, ord=1) > 0.001: 117 | print("开始运行:") 118 | policy = self.policy.copy() 119 | for state in range(self.state_space_size): 120 | for action in range(self.action_space_size): 121 | visit_list = [] 122 | g = 0 123 | # Following the current policy, generate an episode of length T ;生成一个episode 124 | episode = self.obtain_episode(policy=self.policy, start_state=state, start_action=action, 125 | length=length) 126 | for step in range(len(episode)-1, -1, -1): #从末尾开始截取 127 | reward = episode[step]['reward'] 128 | state = episode[step]['state'] 129 | action = episode[step]['action'] 130 | g = self.gama * g + reward 131 | # first visit 132 | # print("[state, action] :",[state, action] ) 133 | if [state, action] not in visit_list: 134 | visit_list.append([state, action]) 135 | # print("visit_list:",visit_list) 136 | returns[state][action].append(g) 137 | qvalue[state, action] = np.array(returns[state][action]).mean() 138 | qvalue_star = qvalue[state].max() 139 | action_star = qvalue[state].tolist().index(qvalue_star) 140 | self.policy[state] = np.zeros(shape=self.action_space_size).copy() 141 | self.policy[state, action_star] = 1 142 | # self.state_value[state] = qvalue_star 143 | print(np.linalg.norm(policy - self.policy, ord=1)) 144 | 145 | time_end = time.time() 146 | print("mc_exploring_starts cost time:" + str(time_end - time_start)) 147 | 148 | if __name__ == "__main__": 149 | episode_length = 2000 150 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 151 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 152 | render_mode='') 153 | solver = MC_Exploring_Starts(gird_world) 154 | start_time = time.time() 155 | 156 | # solver.state_value = solver.mc_exploring_starts_first_visit(length=episode_length) 157 | solver.mc_exploring_starts_first_visit(length=episode_length) # 修改后,利用tqdm显示epoch进度 158 | 159 | end_time = time.time() 160 | cost_time = end_time - start_time 161 | print("episode_length:{} that the cost_time 
is:{}".format(episode_length, round(cost_time, 2))) 162 | 163 | solver.show_policy() # solver.env.render() 164 | solver.show_state_value(solver.state_value, y_offset=0.25) 165 | gird_world.plot_title("Episode_length = " + str(episode_length)) 166 | gird_world.render() 167 | # gird_world.render_clear() 168 | print("--------------------") -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter5_Monte Carlo Methods/MC_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 5 | 6 | # 引用上级目录 7 | import sys 8 | sys.path.append("..") 9 | import grid_env 10 | 11 | class MC_epsilon_greedy: 12 | def __init__(self, env = grid_env.GridEnv): 13 | self.gama = 0.9 #discount rate 14 | self.env = env 15 | self.action_space_size = env.action_space_size 16 | self.state_space_size = env.size ** 2 17 | self.reward_space_size, self.reward_list = len(self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 18 | self.state_value = np.zeros(shape=self.state_space_size) #一维列表 19 | print("self.state_value:",self.state_value) 20 | #Q表和policy 维数一样 21 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 22 | self.mean_policy = np.ones(shape=(self.state_space_size, self.action_space_size)) / self.action_space_size #平均策略,即取每个动作的概率均等 23 | self.policy = self.mean_policy.copy() 24 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 25 | 26 | print("action_space_size: {} state_space_size:{}" .format(self.action_space_size ,self.state_space_size) ) 27 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape,self.qvalue.shape, self.mean_policy.shape)) 28 | 29 | print('----------------------------------------------------------------') 30 | ''' 31 | 定义可视化grid world所需的函数 32 | def show_policy(self) 33 | def show_state_value(self, state_value, y_offset=0.2): 34 | def obtain_episode(self, policy, start_state, start_action, length): 35 | ''' 36 | def show_policy(self): 37 | for state in range(self.state_space_size): 38 | for action in range(self.action_space_size): 39 | policy = self.policy[state, action] 40 | self.env.render_.draw_action(pos=self.env.state2pos(state), 41 | toward=policy * 0.4 * self.env.action_to_direction[action], 42 | radius=policy * 0.1) 43 | 44 | def show_state_value(self, state_value, y_offset=0.2): 45 | for state in range(self.state_space_size): 46 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 47 | y_offset=y_offset, 48 | size_discount=0.7) 49 | 50 | def obtain_episode(self, policy, start_state, start_action, length): 51 | """ 52 | :param policy: 由指定策略产生episode 53 | :param start_state: 起始state 54 | :param start_action: 起始action 55 | :param length: 一个episode 长度 56 | :return: 一个 state,action,reward,next_state,next_action 列表,其中是字典格式 57 | """ 58 | self.env.agent_location = self.env.state2pos(start_state) 59 | episode = [] 60 | next_action = start_action 61 | next_state = start_state 62 | while length > 0: 63 | length -= 1 64 | state = next_state 65 | action = next_action 66 | _, reward, done, _, _ = self.env.step(action) 67 | next_state = self.env.pos2state(self.env.agent_location) 68 | next_action = np.random.choice(np.arange(len(policy[next_state])), #[0, len(policy[next_state]) 中随机抽一个随机数 69 | p=policy[next_state]) 
#p参数的例子: p=[0.1, 0.2, 0.3, 0.1, 0.3]的概率从 [0,1,2,3,4]这四个数中选取3个数 70 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 71 | "next_action": next_action}) #向列表中添加一个字典 72 | return episode 73 | 74 | def mc_epsilon_greedy(self, episodes, episode_length, epsilon = 0.5 ): 75 | # 初始化Returns和Num计数器 76 | returns = np.zeros(self.qvalue.shape) # 初始化回报累计 77 | num_visits = np.zeros(self.qvalue.shape, dtype=int) # 初始化访问次数 78 | 79 | for _ in range(episodes): 80 | # Episode generation 81 | start_state = np.random.randint(self.state_space_size) # 随机选择起始状态 82 | start_action = np.random.choice(np.arange(self.action_space_size), # 随机选择起始动作 83 | p=self.policy[start_state]) 84 | 85 | episode = self.obtain_episode(self.policy, start_state, start_action, 86 | episode_length) # 获取一个episode 87 | 88 | # 对于每个step的回报累积和访问次数更新 89 | for step in reversed(episode): # 逆序遍历,从T-1到0 90 | state, action, reward = step["state"], step["action"], step["reward"] 91 | G = reward # 当前步的即时奖励 92 | for rt in episode[::-1][episode.index(step):]: # 从当前步开始反向累加未来奖励 93 | G = self.gama * G + rt["reward"] # 累积折扣回报 94 | returns[state, action] += G # 更新累积回报 95 | num_visits[state, action] += 1 # 更新状态动作对的访问次数 96 | 97 | # Policy evaluation 98 | self.qvalue = np.divide(returns, num_visits, where=num_visits != 0) # 避免除以零错误 99 | # Policy improvement 100 | best_actions = np.argmax(self.qvalue, axis=1) # 找到每个状态下最优的动作 101 | for state in range(self.state_space_size): 102 | for action in range(self.action_space_size): 103 | # self.policy[state, action] = (1 - epsilon + epsilon / self.action_space_size) * ( 104 | # action == best_actions[state]) + \ 105 | # (epsilon / self.action_space_size) * (action != best_actions[state]) 106 | self.policy[state, :] = 0 # 先将所有动作概率设为0 107 | self.policy[state, best_actions[state]] = 1 # 最优动作概率设为1 108 | 109 | 110 | if __name__ == "__main__": 111 | episodes = 1000 112 | episode_length = 2000 113 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 114 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 115 | render_mode='') 116 | solver = MC_epsilon_greedy(gird_world) 117 | start_time = time.time() 118 | 119 | # solver.state_value = solver.mc_exploring_starts_first_visit(length=episode_length) 120 | solver.mc_epsilon_greedy(episodes, episode_length) # 修改后,利用tqdm显示epoch进度 121 | 122 | end_time = time.time() 123 | cost_time = end_time - start_time 124 | print("episode_length:{} that the cost_time is:{}".format(episode_length, round(cost_time, 2))) 125 | 126 | solver.show_policy() # solver.env.render() 127 | solver.show_state_value(solver.state_value, y_offset=0.25) 128 | gird_world.plot_title("Episode_length = " + str(episode_length)) 129 | gird_world.render() 130 | # gird_world.render_clear() 131 | print("--------------------") -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter6_Stochastic_approximation/Robbins-Monro algorithm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | # import decimal #用numpy计算,弃用decimal 4 | # decimal.getcontext().prec = 50 5 | 6 | import matplotlib.pyplot as plt 7 | """ 8 | Consider an example: g(w) = w**3 - 5 9 | analytical solution: g(w) = 0; w**3 = 5; 5^(1/3) ≈ 1.71. 
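The loop below applies the Robbins-Monro iteration
    w_{k+1} = w_k - a_k * g_tilde(w_k),  with a_k = 1/k  (k = 2, 3, ...),
so the estimates w_k converge to the true root 5^(1/3) even though only noisy observations
g_tilde are available. (In the code, the loop variable named a_k actually holds k, and the
coefficient used is 1/a_k.)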
10 | 11 | Now, suppose that we can only observe the input w and the output g̃(w) = g(w) + η, 12 | 13 | """ 14 | 15 | w_k = [0] # w_1 = 0 16 | g_tilde = [] 17 | 18 | # eta = np.random.normal(size=10) # η 高斯噪声 19 | # print("eta:",eta) 20 | eta_list = [] #plot用 21 | def calculate_g_tilde(w): 22 | eta = np.random.normal() 23 | eta_list.append(eta) 24 | g_tilde =np.array(w**3) - 5 + eta 25 | 26 | # g_tilde = decimal.Decimal(w ** 3) - 5 27 | return (g_tilde) 28 | 29 | for a_k in range(2,550): # a_k 要从2开始 30 | g_tilde.append( calculate_g_tilde(w_k[-1]) ) # g_k 31 | # print("g_tilde",g_tilde) 32 | w_k.append( w_k[-1] - np.array(1/a_k) * g_tilde[-1] ) 33 | # print("w_k" ,w_k) 34 | print("w_k",w_k) #w_k[-1]是结果 35 | print('---------------------------') 36 | print("实际结果:",np.cbrt(5)) #numpy开立方 37 | print("迭代最后结果:",w_k[-1]) 38 | 39 | 40 | 41 | # 绘制第一个图表 42 | plt.figure(figsize=(10, 5)) 43 | plt.plot(range(1, len(w_k)+1), w_k, marker='o',markerfacecolor='none', # 空心,设置填充色为透明 44 | markeredgecolor='blue', # 边框颜色为蓝色 45 | markersize=10, 46 | linestyle='-', color='blue', label='Estimated root w_k') 47 | plt.xlabel('Iteration index k', fontsize = 12) 48 | plt.ylabel('Estimated root w_k', fontsize = 12) 49 | 50 | # 绘制第二个图表 51 | plt.figure(figsize=(8, 5)) 52 | plt.plot(range(len(eta_list)), eta_list, marker='o',markerfacecolor='none', # 空心,设置填充色为透明 53 | markeredgecolor='green', # 边框颜色为蓝色 54 | markersize=10, 55 | linestyle='-', color='green', label='Observation noise') 56 | plt.xlabel('Iteration index k', fontsize = 12) 57 | plt.ylabel('Observation noise', fontsize = 12) 58 | 59 | # 添加图例 60 | plt.legend() 61 | 62 | # 显示图表 63 | plt.show() -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/1.Sarsa.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | 7 | # 引用上级目录 8 | import sys 9 | sys.path.append("..") 10 | import grid_env 11 | 12 | """ 13 | SARSA: State - action - reward - state - action 14 | 15 | TD learning of acton values: Sarsa -> directly estimate action values. 
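The update rule implemented in Sarsa_alg below is
    q(s_t, a_t) <- q(s_t, a_t) - alpha * [ q(s_t, a_t) - ( r_{t+1} + gamma * q(s_{t+1}, a_{t+1}) ) ],
followed by an epsilon-greedy improvement of the policy at the visited state s_t.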
16 | """ 17 | class Sarsa(): 18 | def __init__(self,alpha,env = grid_env.GridEnv): 19 | self.gama = 0.9 # discount rate 20 | self.alpha = alpha #learning rate 21 | self.env = env 22 | self.action_space_size = env.action_space_size 23 | self.state_space_size = env.size ** 2 24 | self.reward_space_size, self.reward_list = len( 25 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 26 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 27 | print("self.state_value:", self.state_value) 28 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 29 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 30 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 31 | self.policy = self.mean_policy.copy() 32 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 33 | 34 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 35 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 36 | self.qvalue.shape, 37 | self.mean_policy.shape)) 38 | 39 | print('----------------------------------------------------------------') 40 | 41 | def show_policy(self): 42 | for state in range(self.state_space_size): 43 | for action in range(self.action_space_size): 44 | policy = self.policy[state, action] 45 | self.env.render_.draw_action(pos=self.env.state2pos(state), 46 | toward=policy * 0.4 * self.env.action_to_direction[action], 47 | radius=policy * 0.1) 48 | 49 | def show_state_value(self, state_value, y_offset=0.2): 50 | for state in range(self.state_space_size): 51 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 52 | y_offset=y_offset, 53 | size_discount=0.7) 54 | 55 | def obtain_episode(self, policy, start_state, start_action, length): 56 | """ 57 | :param policy: 由指定策略产生episode 58 | :param start_state: 起始state 59 | :param start_action: 起始action 60 | :param length: 一个episode 长度 61 | :return: 一个列表,其中是字典格式: state,action,reward,next_state,next_action 62 | """ 63 | self.env.agent_location = self.env.state2pos(start_state) 64 | episode = [] 65 | next_action = start_action 66 | next_state = start_state 67 | while length > 0: 68 | length -= 1 69 | state = next_state 70 | action = next_action 71 | _, reward, done, _, _ = self.env.step(action) # 一步动作 72 | next_state = self.env.pos2state(self.env.agent_location) 73 | next_action = np.random.choice(np.arange(len(policy[next_state])), 74 | p=policy[next_state]) 75 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 76 | "next_action": next_action}) #向列表中添加一个字典 77 | return episode #返回列表,其中的元素为字典 78 | 79 | ''' 80 | Learn an optimal policy that can lead the agent to the target state from an initial state s0. 
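    After each q-value update, the behavior policy at s_t is improved epsilon-greedily:
        pi(a|s_t) = 1 - epsilon + epsilon/|A|   if a = argmax_a q(s_t, a)
        pi(a|s_t) = epsilon/|A|                 otherwise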
81 | ''' 82 | def Sarsa_alg(self,initial_location, epsilon = 0.1): 83 | total_rewards = [] 84 | episode_lengths = [] 85 | initial_state = self.env.pos2state(initial_location) 86 | print("initial_state:", initial_state) 87 | for episode_num in range(1000): # episode_num 88 | self.env.reset() 89 | total_reward = 0 90 | episode_length = 0 91 | done = False 92 | print("episode_num:",episode_num) 93 | 94 | state = initial_state 95 | action = np.random.choice(a=np.arange(self.action_space_size), 96 | p=self.policy[state, :]) # Generate a0 at s0 following π0(s0) 97 | #initialize buffers 98 | states = [state] 99 | aciton = [action] 100 | rewards = [0] 101 | while not done: #If s_t is not the target state, do 102 | episode_length += 1 103 | _, reward, done, _, _ = self.env.step(action) #Collect an experience sample (rt+1, st+1, at+1) 104 | #S 105 | next_state = self.env.pos2state(self.env.agent_location) 106 | # print("next_state:",next_state, "self.env.agent_location:",self.env.agent_location) 107 | #A 108 | next_action = np.random.choice(np.arange(self.action_space_size), 109 | p=self.policy[next_state,:]) 110 | total_reward += reward 111 | #Update q-value for (st, at): 112 | self.qvalue[state][action] = self.qvalue[state, action] - self.alpha * (self.qvalue[state, action] - (reward + self.gama * self.qvalue[next_state, next_action]) ) 113 | #update policy 114 | qvalue_star = self.qvalue[state].max() 115 | action_star = self.qvalue[state].tolist().index(qvalue_star) 116 | for a in range(self.action_space_size): 117 | if a == action_star: 118 | self.policy[state, a] = 1 - epsilon + (epsilon / self.action_space_size) 119 | 120 | else: 121 | self.policy[state, a] = epsilon / self.action_space_size 122 | 123 | action = next_action 124 | state = next_state 125 | total_rewards.append(total_reward) 126 | episode_lengths.append(episode_length) 127 | 128 | return total_rewards,episode_lengths 129 | 130 | if __name__ =="__main__": 131 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 132 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 133 | render_mode='') 134 | solver = Sarsa(alpha =0.1, env = gird_world) 135 | # solver.sarsa() 136 | # print("env.policy[0, :]:",solver.policy[0, :]) 137 | # for _ in range(20): 138 | # a0 = np.random.choice(5, p=solver.policy[0, :] ) 139 | # 140 | # print("a0:",a0) 141 | 142 | start_time = time.time() 143 | 144 | initial_location = [0,0] 145 | total_rewards, episode_lengths = solver.Sarsa_alg(initial_location = initial_location) 146 | 147 | 148 | end_time = time.time() 149 | cost_time = end_time - start_time 150 | print("cost_time:",cost_time) 151 | print(len(gird_world.render_.trajectory)) 152 | 153 | initial_state = solver.env.pos2state(initial_location) 154 | print("训练后的policy结果为:\n",solver.policy[initial_state,:]) 155 | solver.show_policy() # solver.env.render() 156 | solver.show_state_value(solver.state_value, y_offset=0.25) 157 | # gird_world.plot_title("Episode_length = " + str(i)) 158 | gird_world.render() 159 | # gird_world.render_clear() 160 | print("--------------------") 161 | print("Plot") 162 | # 绘制第一个图表 163 | plt.figure(figsize=(10, 5)) 164 | plt.plot(range(1, len(total_rewards) + 1), total_rewards, # 空心,设置填充色为透明 165 | markeredgecolor='blue', # 边框颜色为蓝色 166 | markersize=10, 167 | linestyle='-', color='blue',label = "total_rewards") 168 | plt.xlabel('Episode index', fontsize=12) 169 | plt.ylabel('total_rewards', fontsize=12) 170 | 171 | # 绘制第二个图表 172 | plt.figure(figsize=(10, 5)) 173 | plt.plot(range(1, len(episode_lengths) + 1), 
episode_lengths, # 空心,设置填充色为透明 174 | markeredgecolor='blue', # 边框颜色为蓝色 175 | markersize=10, 176 | linestyle='-', color='blue',label = "episode_length") 177 | plt.xlabel('Episode index', fontsize=12) 178 | plt.ylabel('episode_length', fontsize=12) 179 | 180 | # 添加图例 181 | plt.legend() 182 | # 显示图表 183 | plt.show() 184 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/2.n-step Sarsa.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | 7 | # 引用上级目录 8 | import sys 9 | sys.path.append("..") 10 | import grid_env 11 | 12 | """ 13 | SARSA: State - action - reward - state - action 14 | 15 | TD learning of acton values: Sarsa -> directly estimate action values. 16 | """ 17 | class N_step_Sarsa(): 18 | def __init__(self,alpha,env = grid_env.GridEnv): 19 | self.gama = 0.9 # discount rate 20 | self.alpha = alpha #learning rate 21 | self.env = env 22 | self.action_space_size = env.action_space_size 23 | self.state_space_size = env.size ** 2 24 | self.reward_space_size, self.reward_list = len( 25 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 26 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 27 | print("self.state_value:", self.state_value) 28 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 29 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 30 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 31 | self.policy = self.mean_policy.copy() 32 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 33 | 34 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 35 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 36 | self.qvalue.shape, 37 | self.mean_policy.shape)) 38 | 39 | print('----------------------------------------------------------------') 40 | 41 | def show_policy(self): 42 | for state in range(self.state_space_size): 43 | for action in range(self.action_space_size): 44 | policy = self.policy[state, action] 45 | self.env.render_.draw_action(pos=self.env.state2pos(state), 46 | toward=policy * 0.4 * self.env.action_to_direction[action], 47 | radius=policy * 0.1) 48 | 49 | def show_state_value(self, state_value, y_offset=0.2): 50 | for state in range(self.state_space_size): 51 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 52 | y_offset=y_offset, 53 | size_discount=0.7) 54 | 55 | ''' 56 | Learn an optimal policy that can lead the agent to the target state from an initial state s0. 
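    n_step_Sarsa_alg below uses the truncated n-step return as the TD target:
        G_{t:t+n} = r_{t+1} + gamma*r_{t+2} + ... + gamma^(n-1)*r_{t+n} + gamma^n * q(s_{t+n}, a_{t+n})
        q(s_t, a_t) <- q(s_t, a_t) + alpha * ( G_{t:t+n} - q(s_t, a_t) )
    The bootstrap term gamma^n * q(.) is dropped when the episode terminates before step t+n,
    and each update is followed by an epsilon-greedy policy improvement at s_t.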
57 | ''' 58 | 59 | def n_step_Sarsa_alg(self, initial_location, epsilon=0.1, n=3): 60 | total_rewards = [] 61 | episode_lengths = [] 62 | initial_state = self.env.pos2state(initial_location) 63 | print("initial_state:", initial_state) 64 | 65 | for episode_num in range(1000): # episode_num 66 | self.env.reset() 67 | total_reward = 0 68 | episode_length = 0 69 | done = False 70 | print("episode_num:", episode_num) 71 | 72 | state = initial_state 73 | action = np.random.choice(a=np.arange(self.action_space_size), 74 | p=self.policy[state, :]) # Generate a0 at s0 following π0(s0) 75 | 76 | # Initialize buffers 77 | states = [state] 78 | actions = [action] 79 | rewards = [0] # Reward at time 0 is 0 80 | 81 | T = float('inf') 82 | t = 0 83 | 84 | while True: 85 | if t < T: 86 | _, reward, done, _, _ = self.env.step(action) # Collect an experience sample (rt+1, st+1, at+1) 87 | next_state = self.env.pos2state(self.env.agent_location) 88 | next_action = np.random.choice(np.arange(self.action_space_size), p=self.policy[next_state, :]) 89 | 90 | states.append(next_state) 91 | actions.append(next_action) 92 | rewards.append(reward) 93 | 94 | total_reward += reward 95 | episode_length += 1 96 | 97 | if done: 98 | T = t + 1 99 | 100 | tau = t - n + 1 101 | if tau >= 0: 102 | G = sum([self.gama ** (i - tau - 1) * rewards[i] for i in range(tau + 1, min(tau + n, T) + 1)]) 103 | if tau + n < T: 104 | G += self.gama ** n * self.qvalue[states[tau + n]][actions[tau + n]] 105 | 106 | state_tau = states[tau] 107 | action_tau = actions[tau] 108 | self.qvalue[state_tau][action_tau] += self.alpha * (G - self.qvalue[state_tau][action_tau]) 109 | 110 | # Update policy 111 | qvalue_star = self.qvalue[state_tau].max() 112 | action_star = self.qvalue[state_tau].tolist().index(qvalue_star) 113 | for a in range(self.action_space_size): 114 | if a == action_star: 115 | self.policy[state_tau, a] = 1 - epsilon + (epsilon / self.action_space_size) 116 | else: 117 | self.policy[state_tau, a] = epsilon / self.action_space_size 118 | 119 | if tau == T - 1: 120 | break 121 | 122 | t += 1 123 | state = next_state 124 | action = next_action 125 | 126 | total_rewards.append(total_reward) 127 | episode_lengths.append(episode_length) 128 | 129 | return total_rewards, episode_lengths 130 | if __name__ =="__main__": 131 | gird_world = grid_env.GridEnv(size=5, target=[2, 3], 132 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 133 | render_mode='') 134 | solver = N_step_Sarsa(alpha =0.1, env = gird_world) 135 | # solver.sarsa() 136 | # print("env.policy[0, :]:",solver.policy[0, :]) 137 | # for _ in range(20): 138 | # a0 = np.random.choice(5, p=solver.policy[0, :] ) 139 | # 140 | # print("a0:",a0) 141 | 142 | start_time = time.time() 143 | 144 | initial_location = [0,4] 145 | total_rewards, episode_lengths = solver.n_step_Sarsa_alg(initial_location = initial_location) 146 | 147 | 148 | end_time = time.time() 149 | cost_time = end_time - start_time 150 | print("cost_time:",cost_time) 151 | print(len(gird_world.render_.trajectory)) 152 | 153 | initial_state = solver.env.pos2state(initial_location) 154 | print("训练后的policy结果为:\n",solver.policy[initial_state,:]) 155 | solver.show_policy() # solver.env.render() 156 | solver.show_state_value(solver.state_value, y_offset=0.25) 157 | # gird_world.plot_title("Episode_length = " + str(i)) 158 | gird_world.render() 159 | # gird_world.render_clear() 160 | print("--------------------") 161 | print("Plot") 162 | # 绘制第一个图表 163 | plt.figure(figsize=(10, 5)) 164 | plt.plot(range(1, 
len(total_rewards) + 1), total_rewards, # 空心,设置填充色为透明 165 | markeredgecolor='blue', # 边框颜色为蓝色 166 | markersize=10, 167 | linestyle='-', color='blue',label = "total_rewards") 168 | plt.xlabel('Episode index', fontsize=12) 169 | plt.ylabel('total_rewards', fontsize=12) 170 | 171 | # 绘制第二个图表 172 | plt.figure(figsize=(10, 5)) 173 | plt.plot(range(1, len(episode_lengths) + 1), episode_lengths, # 空心,设置填充色为透明 174 | markeredgecolor='blue', # 边框颜色为蓝色 175 | markersize=10, 176 | linestyle='-', color='blue',label = "episode_length") 177 | plt.xlabel('Episode index', fontsize=12) 178 | plt.ylabel('episode_length', fontsize=12) 179 | 180 | # 添加图例 181 | plt.legend() 182 | # 显示图表 183 | plt.show() 184 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/3.Q-learning.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | 7 | # 引用上级目录 8 | import sys 9 | sys.path.append("..") 10 | import grid_env 11 | 12 | """ 13 | SARSA: State - action - reward - state - action 14 | 15 | TD learning of acton values: Sarsa -> directly estimate action values. 16 | """ 17 | class Q_learning(): 18 | def __init__(self,alpha,env = grid_env.GridEnv): 19 | self.gamma = 0.9 # discount rate 20 | self.alpha = alpha #learning rate 21 | self.env = env 22 | self.action_space_size = env.action_space_size 23 | self.state_space_size = env.size ** 2 24 | self.reward_space_size, self.reward_list = len( 25 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 26 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 27 | print("self.state_value:", self.state_value) 28 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 29 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 30 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 31 | self.policy = self.mean_policy.copy() 32 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 33 | 34 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 35 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 36 | self.qvalue.shape, 37 | self.mean_policy.shape)) 38 | 39 | print('----------------------------------------------------------------') 40 | 41 | def show_policy(self): 42 | for state in range(self.state_space_size): 43 | for action in range(self.action_space_size): 44 | policy = self.policy[state, action] 45 | self.env.render_.draw_action(pos=self.env.state2pos(state), 46 | toward=policy * 0.4 * self.env.action_to_direction[action], 47 | radius=policy * 0.1) 48 | 49 | def show_state_value(self, state_value, y_offset=0.2): 50 | for state in range(self.state_space_size): 51 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 52 | y_offset=y_offset, 53 | size_discount=0.7) 54 | 55 | ''' 56 | Learn an optimal policy that can lead the agent to the target state from an initial state s0. 
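    q_learning below is off-policy: actions are selected epsilon-greedily for exploration,
    while the TD target bootstraps on the greedy action of the next state:
        y_t = r_{t+1} + gamma * max_a q(s_{t+1}, a)
        q(s_t, a_t) <- q(s_t, a_t) - alpha * ( q(s_t, a_t) - y_t )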
57 | ''' 58 | 59 | def q_learning(self, initial_location, epsilon=0.1): 60 | total_rewards = [] 61 | episode_lengths = [] 62 | initial_state = self.env.pos2state(initial_location) 63 | print("initial_state:", initial_state) 64 | 65 | for episode_num in range(1000): # run 1000 training episodes 66 | self.env.reset() 67 | total_reward = 0 68 | episode_length = 0 69 | done = False 70 | print("episode_num:", episode_num) 71 | state = initial_state 72 | while not done: 73 | # Choose action using epsilon-greedy policy 74 | if np.random.rand() < epsilon: 75 | action = np.random.choice(self.action_space_size) # Explore: random action 76 | else: 77 | action = np.argmax(self.qvalue[state]) # Exploit: action with max Q-value 78 | 79 | # Take action and observe reward and next state 80 | _, reward, done, _, _ = self.env.step(action) 81 | next_state = self.env.pos2state(self.env.agent_location) 82 | 83 | # Update Q-value 84 | best_next_action = np.argmax(self.qvalue[next_state]) 85 | td_target = reward + self.gamma * self.qvalue[next_state][best_next_action] 86 | td_error = self.qvalue[state][action] - td_target 87 | self.qvalue[state][action] -= self.alpha * td_error 88 | 89 | # Update policy (optional, since Q-learning is off-policy) 90 | qvalue_star = self.qvalue[state].max() 91 | action_star = self.qvalue[state].tolist().index(qvalue_star) 92 | for a in range(self.action_space_size): 93 | if a == action_star: 94 | self.policy[state, a] = 1 - epsilon + (epsilon / self.action_space_size) 95 | else: 96 | self.policy[state, a] = epsilon / self.action_space_size 97 | 98 | # Update state 99 | state = next_state 100 | total_reward += reward 101 | episode_length += 1 102 | 103 | total_rewards.append(total_reward) 104 | episode_lengths.append(episode_length) 105 | 106 | return total_rewards, episode_lengths 107 | if __name__ == "__main__": 108 | grid_world = grid_env.GridEnv(size=5, target=[2, 3], 109 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 110 | render_mode='') 111 | solver = Q_learning(alpha=0.1, env=grid_world) 112 | # solver.sarsa() 113 | # print("env.policy[0, :]:",solver.policy[0, :]) 114 | # for _ in range(20): 115 | # a0 = np.random.choice(5, p=solver.policy[0, :] ) 116 | # 117 | # print("a0:",a0) 118 | 119 | start_time = time.time() 120 | 121 | initial_location = [4,0] 122 | total_rewards, episode_lengths = solver.q_learning(initial_location=initial_location) 123 | 124 | 125 | end_time = time.time() 126 | cost_time = end_time - start_time 127 | print("cost_time:",cost_time) 128 | print(len(grid_world.render_.trajectory)) 129 | 130 | initial_state = solver.env.pos2state(initial_location) 131 | print("Policy after training:\n", solver.policy[initial_state, :]) 132 | solver.show_policy() # solver.env.render() 133 | solver.show_state_value(solver.state_value, y_offset=0.25) 134 | # grid_world.plot_title("Episode_length = " + str(i)) 135 | grid_world.render() 136 | # grid_world.render_clear() 137 | print("--------------------") 138 | print("Plot") 139 | # Plot the first figure 140 | plt.figure(figsize=(10, 5)) 141 | plt.plot(range(1, len(total_rewards) + 1), total_rewards, # hollow markers, transparent fill 142 | markeredgecolor='blue', # marker edge color: blue 143 | markersize=10, 144 | linestyle='-', color='blue',label = "total_rewards") 145 | plt.xlabel('Episode index', fontsize=12) 146 | plt.ylabel('total_rewards', fontsize=12) 147 | 148 | # Plot the second figure 149 | plt.figure(figsize=(10, 5)) 150 | plt.plot(range(1, len(episode_lengths) + 1), episode_lengths, # hollow markers, transparent fill 151 | markeredgecolor='blue', # marker edge color: blue 152 | markersize=10, 153 |
linestyle='-', color='blue',label = "episode_length") 154 | plt.xlabel('Episode index', fontsize=12) 155 | plt.ylabel('episode_length', fontsize=12) 156 | 157 | # 添加图例 158 | plt.legend() 159 | # 显示图表 160 | plt.show() 161 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/4.Q-learning on policy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/RL_Learning-main/scripts/Chapter7_Temporal-Difference learning/4.Q-learning on policy.py -------------------------------------------------------------------------------- /RL_Learning-main/scripts/Chapter9_Policy Gradient/[Reinforce]Monte Carlo policy gradient.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.utils.tensorboard import SummaryWriter # 导入SummaryWriter 6 | from torch.utils import data 7 | import torch 8 | import torch.nn as nn 9 | 10 | 11 | # 引用上级目录 12 | import sys 13 | sys.path.append("..") 14 | import grid_env 15 | 16 | """ 17 | REINFORCE algorithm 18 | 19 | """ 20 | 21 | "Define policy NN" 22 | class PolicyNet(nn.Module): 23 | def __init__(self, input_dim=2, output_dim=5): 24 | super(PolicyNet, self).__init__() 25 | self.fc = nn.Sequential( 26 | nn.Linear(in_features=input_dim, out_features=100), 27 | nn.ReLU(), 28 | nn.Linear(in_features=100, out_features=output_dim), 29 | nn.Softmax(dim=1) 30 | ) 31 | 32 | def forward(self, x): 33 | x = x.type(torch.float32) 34 | return self.fc(x) 35 | 36 | 37 | class REINFORCE(): 38 | def __init__(self,alpha,env = grid_env.GridEnv): 39 | self.gama = 0.9 # discount rate 40 | self.alpha = alpha #learning rate 41 | self.env = env 42 | self.action_space_size = env.action_space_size 43 | self.state_space_size = env.size ** 2 44 | self.reward_space_size, self.reward_list = len( 45 | self.env.reward_list), self.env.reward_list # [-10,-10,0,1] reward list 46 | self.state_value = np.zeros(shape=self.state_space_size) # 一维列表 47 | print("self.state_value:", self.state_value) 48 | self.qvalue = np.zeros(shape=(self.state_space_size, self.action_space_size)) # 二维: state数 x action数 49 | self.mean_policy = np.ones( #self.mean_policy shape: (25, 5) 50 | shape=(self.state_space_size, self.action_space_size)) / self.action_space_size # 平均策略,即取每个动作的概率均等 51 | self.policy = self.mean_policy.copy() 52 | self.writer = SummaryWriter("logs") # 实例化SummaryWriter对象 53 | 54 | print("action_space_size: {} state_space_size:{}".format(self.action_space_size, self.state_space_size)) 55 | print("state_value.shape:{} , qvalue.shape:{} , mean_policy.shape:{}".format(self.state_value.shape, 56 | self.qvalue.shape, 57 | self.mean_policy.shape)) 58 | print('----------------------------------------------------------------') 59 | 60 | def show_policy(self): 61 | for state in range(self.state_space_size): 62 | for action in range(self.action_space_size): 63 | policy = self.policy[state, action] 64 | self.env.render_.draw_action(pos=self.env.state2pos(state), 65 | toward=policy * 0.4 * self.env.action_to_direction[action], 66 | radius=policy * 0.1) 67 | 68 | def show_state_value(self, state_value, y_offset=0.2): 69 | for state in range(self.state_space_size): 70 | self.env.render_.write_word(pos=self.env.state2pos(state), word=str(round(state_value[state], 1)), 71 | 
y_offset=y_offset, 72 | size_discount=0.7) 73 | 74 | # One episode is a trajectory from the start state to the target state. 75 | def obtain_episode_net(self, policy_net, start_state, start_action): 76 | """ 77 | :param policy_net: policy network used to generate the episode 78 | :param start_state: initial state 79 | :param start_action: initial action 80 | :return: a list of dicts with keys: state, action, reward, next_state, next_action 81 | """ 82 | self.env.agent_location = self.env.state2pos(start_state) 83 | episode = [] 84 | next_action = start_action 85 | next_state = start_state 86 | terminated = False 87 | while not terminated: 88 | state = next_state 89 | action = next_action 90 | _, reward, terminated, _, _ = self.env.step(action) # take one environment step 91 | next_state = self.env.pos2state(self.env.agent_location) 92 | x, y = self.env.state2pos(next_state) / self.env.size 93 | prb = policy_net(torch.tensor((x, y)).reshape(-1, 2))[0] 94 | next_action = np.random.choice(np.arange(self.action_space_size), p = prb.detach().numpy()) 95 | episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state, 96 | "next_action": next_action}) # append one transition dict to the episode 97 | return episode 98 | 99 | def reinforce(self, epochs=20000): 100 | policy_net = PolicyNet() 101 | optimizer = torch.optim.Adam(policy_net.parameters(), lr=self.alpha) 102 | for epoch in range(epochs): 103 | prb = policy_net(torch.tensor((0, 0)).reshape(-1, 2))[0] 104 | print("epoch:{} , prb:{}".format(epoch, prb)) 105 | start_action = np.random.choice(np.arange(self.action_space_size), p=prb.detach().numpy()) 106 | episode = self.obtain_episode_net(policy_net, start_state=0, start_action=start_action) 107 | # print("episode:", episode) 108 | 109 | if len(episode) < 10: 110 | g = -100 111 | else: 112 | g = 0 113 | optimizer.zero_grad() # zero the accumulated gradients 114 | for step in reversed(range(len(episode))): 115 | reward = episode[step]['reward'] 116 | state = episode[step]['state'] 117 | action = episode[step]['action'] 118 | if len(episode) > 1000: 119 | # print(g, reward) 120 | pass 121 | g = self.gama * g + reward 122 | self.qvalue[state, action] = g 123 | x ,y = self.env.state2pos(state)/self.env.size 124 | prb = policy_net(torch.tensor((x, y)).reshape(-1, 2))[0] 125 | log_prob = torch.log(prb[action]) 126 | loss = -log_prob * g 127 | loss.backward() # backpropagate to accumulate the policy gradient 128 | self.writer.add_scalar("loss", float(loss.detach()), epoch) 129 | self.writer.add_scalar('g', g, epoch) 130 | self.writer.add_scalar('episode_length', len(episode), epoch) 131 | # print(epoch, len(episode), g) 132 | optimizer.step() 133 | for s in range(self.state_space_size): 134 | x, y = self.env.state2pos(s) / self.env.size 135 | prb = policy_net(torch.tensor((x, y)).reshape(-1, 2))[0] 136 | self.policy[s, :] = prb.detach().numpy() # detach and convert to NumPy before storing into the policy table 137 | self.writer.close() 138 | 139 | if __name__ == '__main__': 140 | grid_world = grid_env.GridEnv(size=5, target=[2, 3], 141 | forbidden=[[1, 1], [2, 1], [2, 2], [1, 3], [3, 3], [1, 4]], 142 | render_mode='') 143 | solver = REINFORCE(alpha=0.001, env=grid_world) 144 | start_time = time.time() 145 | 146 | solver.reinforce() 147 | print("solver.state_value:", solver.state_value) 148 | 149 | 150 | end_time = time.time() 151 | cost_time = end_time - start_time 152 | print("cost_time:", cost_time) 153 | solver.show_policy() # solver.env.render() 154 | solver.show_state_value(solver.state_value, y_offset=0.25) 155 | solver.env.render() 156 | -------------------------------------------------------------------------------- /RL_Learning-main/scripts/model.py: -------------------------------------------------------------------------------- 1 |
import torch 2 | import torch.nn as nn 3 | 4 | 5 | class QNET(nn.Module): 6 | def __init__(self, input_dim=3, output_dim=1): 7 | super(QNET, self).__init__() 8 | self.fc = nn.Sequential( 9 | nn.Linear(in_features=input_dim, out_features=128), 10 | nn.ReLU(), 11 | nn.Linear(in_features=128, out_features=64), 12 | nn.ReLU(), 13 | nn.Linear(in_features=64, out_features=32), 14 | nn.ReLU(), 15 | nn.Linear(in_features=32, out_features=output_dim), 16 | ) 17 | 18 | def forward(self, x): 19 | x = x.type(torch.float32) 20 | return self.fc(x) 21 | 22 | 23 | class PolicyNet(nn.Module): 24 | def __init__(self, input_dim=2, output_dim=5): 25 | super(PolicyNet, self).__init__() 26 | self.fc = nn.Sequential( 27 | nn.Linear(in_features=input_dim, out_features=100), 28 | nn.ReLU(), 29 | nn.Linear(in_features=100, out_features=output_dim), 30 | nn.Softmax(dim=1) 31 | ) 32 | 33 | def forward(self, x): 34 | x = x.type(torch.float32) 35 | return self.fc(x) 36 | 37 | 38 | class DPolicyNet(nn.Module): 39 | def __init__(self, input_dim=2, output_dim=1): 40 | super(DPolicyNet, self).__init__() 41 | self.fc = nn.Sequential( 42 | nn.Linear(in_features=input_dim, out_features=100), 43 | nn.ReLU(), 44 | nn.Linear(in_features=100, out_features=output_dim), 45 | ) 46 | 47 | def forward(self, x): 48 | x = x.type(torch.float32) 49 | return self.fc(x) 50 | 51 | 52 | class ValueNet(torch.nn.Module): 53 | def __init__(self, input_dim=2, output_dim=1): 54 | super(ValueNet, self).__init__() 55 | self.fc = nn.Sequential( 56 | nn.Linear(in_features=input_dim, out_features=100), 57 | nn.ReLU(), 58 | nn.Linear(in_features=100, out_features=output_dim), 59 | ) 60 | 61 | def forward(self, x): 62 | x = x.type(torch.float32) 63 | return self.fc(x) 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | dqn = PolicyNet() 69 | input = torch.tensor([[2, 1], [3, 1]]) 70 | print(dqn) 71 | print(dqn(input)) 72 | -------------------------------------------------------------------------------- /img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/img.png -------------------------------------------------------------------------------- /动手学强化学习/DQN/DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | 6 | class Qnet(torch.nn.Module): 7 | ''' 只有一层隐藏层的Q网络 ''' 8 | def __init__(self, state_dim, hidden_dim, action_dim): 9 | super(Qnet, self).__init__() 10 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 11 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 12 | 13 | def forward(self, x): 14 | x = F.relu(self.fc1(x)) # 隐藏层使用ReLU激活函数 15 | return self.fc2(x) 16 | 17 | 18 | class VAnet(torch.nn.Module): 19 | def __init__(self, state_dim, hidden_dim, action_dim): 20 | super().__init__() 21 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 22 | self.fc_A = torch.nn.Linear(hidden_dim, action_dim) 23 | self.fc_V = torch.nn.Linear(hidden_dim, 1) 24 | 25 | def forward(self, x): 26 | x = F.relu(self.fc1(x)) 27 | A = self.fc_A(x) 28 | V = self.fc_V(x) 29 | Q = V + A - A.mean(1).view(-1, 1) 30 | return Q 31 | 32 | class DQN: 33 | ''' DQN算法 ''' 34 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 35 | epsilon, target_update, device): 36 | self.action_dim = action_dim 37 | self.q_net = Qnet(state_dim, hidden_dim, 38 | self.action_dim).to(device) # Q网络 39 | # 目标网络 40 | 
self.target_q_net = Qnet(state_dim, hidden_dim, 41 | self.action_dim).to(device) 42 | # 使用Adam优化器 43 | self.optimizer = torch.optim.Adam(self.q_net.parameters(), 44 | lr=learning_rate) 45 | self.gamma = gamma # 折扣因子 46 | self.epsilon = epsilon # epsilon-贪婪策略 47 | self.target_update = target_update # 目标网络更新频率 48 | self.count = 0 # 计数器,记录更新次数 49 | self.device = device 50 | 51 | def take_action(self, state): # epsilon-贪婪策略采取动作 52 | if np.random.random() < self.epsilon: 53 | action = np.random.randint(self.action_dim) 54 | else: 55 | state = torch.tensor([state], dtype=torch.float).to(self.device) 56 | action = self.q_net(state).argmax().item() 57 | return action 58 | 59 | def update(self, transition_dict): 60 | states = torch.tensor(transition_dict['states'], 61 | dtype=torch.float).to(self.device) 62 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 63 | self.device) 64 | rewards = torch.tensor(transition_dict['rewards'], 65 | dtype=torch.float).view(-1, 1).to(self.device) 66 | next_states = torch.tensor(transition_dict['next_states'], 67 | dtype=torch.float).to(self.device) 68 | dones = torch.tensor(transition_dict['dones'], 69 | dtype=torch.float).view(-1, 1).to(self.device) 70 | 71 | q_values = self.q_net(states).gather(1, actions) # Q值 72 | # 下个状态的最大Q值 73 | max_next_q_values = self.target_q_net(next_states).max(1)[0].view( 74 | -1, 1) 75 | q_targets = rewards + self.gamma * max_next_q_values * (1 - dones 76 | ) # TD误差目标 77 | dqn_loss = torch.mean(F.mse_loss(q_values, q_targets)) # 均方误差损失函数 78 | self.optimizer.zero_grad() # PyTorch中默认梯度会累积,这里需要显式将梯度置为0 79 | dqn_loss.backward() # 反向传播更新参数 80 | self.optimizer.step() 81 | 82 | if self.count % self.target_update == 0: #target_update 指C步之后更新目标网络Q_target的参数 \omega 83 | self.target_q_net.load_state_dict( 84 | self.q_net.state_dict()) # 更新目标网络 85 | self.count += 1 -------------------------------------------------------------------------------- /动手学强化学习/DQN/display.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | from DQN import DQN 5 | from time import sleep 6 | 7 | 8 | def dis_to_con(discrete_action, env, action_dim): 9 | action_lowbound = env.action_space.low[0] 10 | action_upbound = env.action_space.high[0] 11 | return np.array([discrete_action / (action_dim - 1) * (action_upbound - action_lowbound) + action_lowbound]) 12 | 13 | 14 | lr = 2e-3 15 | hidden_dim = 128 16 | gamma = 0.98 17 | epsilon = 0.0 18 | target_update = 10 19 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 20 | 21 | env = gym.make('Pendulum-v1') 22 | 23 | state_dim = env.observation_space.shape[0] 24 | action_dim = 11 25 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device) 26 | state_dict = torch.load('dqn_pendulumv1.pth') 27 | agent.q_net.load_state_dict(state_dict) 28 | agent.target_q_net.load_state_dict(state_dict) 29 | 30 | state = env.reset() 31 | done = False 32 | agent_return = 0 33 | while not done: 34 | action = agent.take_action(state) 35 | action = dis_to_con(action, env, action_dim) 36 | next_state, reward, done, _ = env.step(action) 37 | agent_return += reward 38 | env.render() 39 | state = next_state 40 | sleep(0.01) 41 | 42 | print('DQN return:', agent_return) 43 | 44 | 45 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, 'DoubleDQN') 46 | state_dict = torch.load('double_dqn_pendulumv1.pth') 47 | 
agent.q_net.load_state_dict(state_dict) 48 | agent.target_q_net.load_state_dict(state_dict) 49 | 50 | state = env.reset() 51 | done = False 52 | agent_return = 0 53 | while not done: 54 | action = agent.take_action(state) 55 | action = dis_to_con(action, env, action_dim) 56 | next_state, reward, done, _ = env.step(action) 57 | agent_return += reward 58 | env.render() 59 | state = next_state 60 | sleep(0.01) 61 | 62 | print('Double DQN return:', agent_return) 63 | 64 | 65 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, 'DuelingDQN') 66 | state_dict = torch.load('dueling_dqn_pendulumv1.pth') 67 | agent.q_net.load_state_dict(state_dict) 68 | agent.target_q_net.load_state_dict(state_dict) 69 | 70 | state = env.reset() 71 | done = False 72 | agent_return = 0 73 | while not done: 74 | action = agent.take_action(state) 75 | action = dis_to_con(action, env, action_dim) 76 | next_state, reward, done, _ = env.step(action) 77 | agent_return += reward 78 | env.render() 79 | state = next_state 80 | sleep(0.01) 81 | 82 | print('Dueling DQN return:', agent_return) -------------------------------------------------------------------------------- /动手学强化学习/DQN/main.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import torch 3 | import random 4 | import numpy as np 5 | from DQN import DQN 6 | from tqdm import tqdm 7 | import matplotlib.pyplot as plt 8 | 9 | import sys 10 | import os 11 | # 将上级目录添加到 sys.path 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | import rl_utils 14 | 15 | def dis_to_con(discrete_action, env, action_dim): 16 | action_lowbound = env.action_space.low[0] 17 | action_upbound = env.action_space.high[0] 18 | return np.array([discrete_action / (action_dim - 1) * (action_upbound - action_lowbound) + action_lowbound]) 19 | 20 | 21 | def train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size): 22 | return_list = [] 23 | max_q_value_list = [] 24 | max_q_value = 0 25 | 26 | for i in range(10): 27 | with tqdm(total=num_episodes // 10, desc='Iteration %d' % i) as pbar: 28 | for i_episode in range(num_episodes // 10): 29 | episode_return = 0 30 | state, *_ = env.reset() 31 | done = False 32 | while not done: 33 | # print("state", state) 34 | action = agent.take_action(state) 35 | # max_q_value = agent.max_q_value(state) * 0.005 + max_q_value * 0.995 36 | # max_q_value_list.append(max_q_value) 37 | 38 | # action_continuous = dis_to_con(action, env, agent.action_dim) 39 | next_state, reward, done, *_ = env.step(action) #实参或者为: action_continuous 40 | replay_buffer.add(state, action, reward, next_state, done) 41 | state = next_state 42 | episode_return += reward 43 | 44 | if replay_buffer.size() > minimal_size: 45 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 46 | transition_dict = dict( 47 | states=b_s, 48 | actions=b_a, 49 | rewards=b_r, 50 | next_states=b_ns, 51 | dones=b_d 52 | ) 53 | agent.update(transition_dict) 54 | return_list.append(episode_return) 55 | 56 | if (i_episode + 1) % 10 == 0: 57 | pbar.set_postfix({ 58 | 'episode': '%d' % (num_episodes / 10 * i + i_episode + 1), 59 | 'return': '%.3f' % np.mean(return_list[-10:]) 60 | }) 61 | pbar.update(1) 62 | return return_list, max_q_value_list 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | lr = 2e-3 68 | num_episodes = 500 69 | hidden_dim = 128 70 | gamma = 0.99 71 | epsilon = 0.01 72 | target_update = 10 73 | buffer_size = 10000 74 | minimal_size = 
500 75 | batch_size = 64 76 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 77 | print(f"Using device: {device}") 78 | # env_name = 'Pendulum-v1' 79 | env_name = "CartPole-v0" 80 | env = gym.make(env_name) 81 | print("CartPole-v0 env:",env) 82 | 83 | random.seed(0) 84 | np.random.seed(0) 85 | env.reset(seed = 0) # 新版gymnausim 86 | # env.seed(0) #旧版gym 87 | torch.manual_seed(0) 88 | 89 | 90 | replay_buffer = rl_utils.ReplayBuffer(buffer_size) 91 | print("replay_buffer建立成功!", replay_buffer) 92 | state_dim = env.observation_space.shape[0] 93 | action_dim = env.action_space.n 94 | print("state_dim:", state_dim) 95 | print("action_dim:", action_dim) 96 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device) 97 | # return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 98 | return_list, _ = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 99 | 100 | # torch.save(agent.q_net.state_dict(), 'dqn_pendulumv1.pth') 101 | episodes_list = list(range(len(return_list))) 102 | 103 | plt.plot(episodes_list, return_list) 104 | plt.xlabel('Episodes') 105 | plt.ylabel('Returns') 106 | plt.title('DQN Returns on {}'.format(env_name)) 107 | plt.show() 108 | ##将结果平滑处理 109 | mv_return = rl_utils.moving_average(return_list, 9) 110 | plt.plot(episodes_list, mv_return) 111 | plt.xlabel('Episodes') 112 | plt.ylabel('Returns') 113 | plt.title('DQN on {}'.format(env_name)) 114 | plt.show() 115 | 116 | 117 | 118 | # -------------------------------------------------------- 119 | # 120 | # print("Double DQN") 121 | # random.seed(0) 122 | # np.random.seed(0) 123 | # env.seed(0) 124 | # torch.manual_seed(0) 125 | # 126 | # replay_buffer = ReplayBuffer(buffer_size) 127 | # agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, "DoubleDQN") 128 | # return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 129 | # 130 | # torch.save(agent.q_net.state_dict(), 'double_dqn_pendulumv1.pth') 131 | # episodes_list = list(range(len(return_list))) 132 | # mv_returns = moving_average(return_list, 5) 133 | # plt.plot(episodes_list, mv_returns) 134 | # plt.xlabel('Episodes') 135 | # plt.ylabel('Returns') 136 | # plt.title('Double DQN Returns on {}'.format(env_name)) 137 | # plt.show() 138 | # -------------------------------------------------------- 139 | # frames_list = list(range(len(max_q_value_list))) 140 | # plt.plot(frames_list, max_q_value_list) 141 | # plt.axhline(0, c='orange', ls='--') 142 | # plt.axhline(10, c='red', ls='--') 143 | # plt.xlabel('Frames') 144 | # plt.ylabel('Q value') 145 | # plt.title('Double DQN Q value on {}'.format(env_name)) 146 | # plt.show() 147 | # 148 | # print("Dueling DQN") 149 | # random.seed(0) 150 | # np.random.seed(0) 151 | # env.seed(0) 152 | # torch.manual_seed(0) 153 | # 154 | # replay_buffer = ReplayBuffer(buffer_size) 155 | # agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, "DuelingDQN") 156 | # return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size) 157 | # 158 | # torch.save(agent.q_net.state_dict(), 'dueling_dqn_pendulumv1.pth') 159 | # episodes_list = list(range(len(return_list))) 160 | # mv_returns = moving_average(return_list, 5) 161 | # plt.plot(episodes_list, mv_returns) 162 | # plt.xlabel('Episodes') 163 | # plt.ylabel('Returns') 164 | # plt.title('Dueling 
DQN Returns on {}'.format(env_name)) 165 | # plt.show() 166 | # 167 | # frames_list = list(range(len(max_q_value_list))) 168 | # plt.plot(frames_list, max_q_value_list) 169 | # plt.axhline(0, c='orange', ls='--') 170 | # plt.axhline(10, c='red', ls='--') 171 | # plt.xlabel('Frames') 172 | # plt.ylabel('Q value') 173 | # plt.title('Dueling DQN Q value on {}'.format(env_name)) 174 | # plt.show() -------------------------------------------------------------------------------- /动手学强化学习/Hands-on-RL/README.md: -------------------------------------------------------------------------------- 1 | # 动手学强化学习 2 | 3 | Tips: 若运行gym环境的代码时遇到报错,请尝试pip install gym==0.18.3安装此版本的gym库,若仍有问题,欢迎提交issue! 4 | 5 | 欢迎来到《动手学强化学习》(Hands-on Reinforcement Learning)的地带。该系列从强化学习的定义等基础讲起,一步步由浅入深,介绍目前一些主流的强化学习算法。每一章内容都是一个Jupyter Notebook,内含详细的图文介绍和代码讲解。 6 | 7 | * 由于GitHub上渲染notebook效果有限,我们推荐读者前往[Hands-on RL主页](https://hrl.boyuai.com/)进行浏览,我们在此提供了纯代码版本的notebook,供大家下载运行。 8 | 9 | * 欢迎在[京东](https://item.jd.com/13129509.html)和[当当网](http://product.dangdang.com/29391150.html)购买《动手学强化学习》。 10 | 11 | * 如果你发现了本书的任何问题,或者有任何改善建议的,欢迎提交issue! 12 | 13 | * 本书配套的强化学习课程已上线到[伯禹学习平台](https://www.boyuai.com/elites/course/xVqhU42F5IDky94x),所有人都可以免费学习和讨论。 14 | 15 | ![](https://boyuai.oss-cn-shanghai.aliyuncs.com/disk/tmp/hrl-poster.jpeg) 16 | -------------------------------------------------------------------------------- /动手学强化学习/Hands-on-RL/rl_utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | import collections 5 | import random 6 | 7 | class ReplayBuffer: 8 | def __init__(self, capacity): 9 | self.buffer = collections.deque(maxlen=capacity) 10 | 11 | def add(self, state, action, reward, next_state, done): 12 | self.buffer.append((state, action, reward, next_state, done)) 13 | 14 | def sample(self, batch_size): 15 | transitions = random.sample(self.buffer, batch_size) 16 | state, action, reward, next_state, done = zip(*transitions) 17 | return np.array(state), action, reward, np.array(next_state), done 18 | 19 | def size(self): 20 | return len(self.buffer) 21 | 22 | def moving_average(a, window_size): 23 | cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 24 | middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size 25 | r = np.arange(1, window_size-1, 2) 26 | begin = np.cumsum(a[:window_size-1])[::2] / r 27 | end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1] 28 | return np.concatenate((begin, middle, end)) 29 | 30 | def train_on_policy_agent(env, agent, num_episodes): 31 | return_list = [] 32 | for i in range(10): 33 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 34 | for i_episode in range(int(num_episodes/10)): 35 | episode_return = 0 36 | transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []} 37 | state = env.reset() 38 | done = False 39 | while not done: 40 | action = agent.take_action(state) 41 | next_state, reward, done, _ = env.step(action) 42 | transition_dict['states'].append(state) 43 | transition_dict['actions'].append(action) 44 | transition_dict['next_states'].append(next_state) 45 | transition_dict['rewards'].append(reward) 46 | transition_dict['dones'].append(done) 47 | state = next_state 48 | episode_return += reward 49 | return_list.append(episode_return) 50 | agent.update(transition_dict) 51 | if (i_episode+1) % 10 == 0: 52 | pbar.set_postfix({'episode': '%d' % (num_episodes/10 * i + i_episode+1), 'return': '%.3f' % 
np.mean(return_list[-10:])}) 53 | pbar.update(1) 54 | return return_list 55 | 56 | def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size): 57 | return_list = [] 58 | for i in range(10): 59 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 60 | for i_episode in range(int(num_episodes/10)): 61 | episode_return = 0 62 | state = env.reset() 63 | done = False 64 | while not done: 65 | action = agent.take_action(state) 66 | next_state, reward, done, _ = env.step(action) 67 | replay_buffer.add(state, action, reward, next_state, done) 68 | state = next_state 69 | episode_return += reward 70 | if replay_buffer.size() > minimal_size: 71 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 72 | transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d} 73 | agent.update(transition_dict) 74 | return_list.append(episode_return) 75 | if (i_episode+1) % 10 == 0: 76 | pbar.set_postfix({'episode': '%d' % (num_episodes/10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])}) 77 | pbar.update(1) 78 | return return_list 79 | 80 | 81 | def compute_advantage(gamma, lmbda, td_delta): 82 | td_delta = td_delta.detach().numpy() 83 | advantage_list = [] 84 | advantage = 0.0 85 | for delta in td_delta[::-1]: 86 | advantage = gamma * lmbda * advantage + delta 87 | advantage_list.append(advantage) 88 | advantage_list.reverse() 89 | return torch.tensor(advantage_list, dtype=torch.float) 90 | -------------------------------------------------------------------------------- /动手学强化学习/README.md: -------------------------------------------------------------------------------- 1 | This file aims to reproduce https://hrl.boyuai.com/chapter/2/dqn%E7%AE%97%E6%B3%95 2 | 3 | 1. DQN 4 | 2. Policy gradient (Reinforce) 5 | 3. Actor Critic 6 | 4. 
DDPG 7 | 8 | 最后扩展到MADDPG。 9 | 10 | ##[2023]py script 形式的HandsOnRL 11 | https://github.com/peterwu4084/HandsOnRL/tree/main -------------------------------------------------------------------------------- /动手学强化学习/rl_utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | import collections 5 | import random 6 | 7 | class ReplayBuffer: 8 | ''' 经验回放池 ''' 9 | def __init__(self, capacity): 10 | self.buffer = collections.deque(maxlen=capacity) # 队列,先进先出 11 | 12 | def add(self, state, action, reward, next_state, done): # 将数据加入buffer 13 | self.buffer.append((state, action, reward, next_state, done)) 14 | 15 | def sample(self, batch_size): # 从buffer中采样数据,数量为batch_size 16 | transitions = random.sample(self.buffer, batch_size) 17 | state, action, reward, next_state, done = zip(*transitions) 18 | return np.array(state), action, reward, np.array(next_state), done 19 | 20 | def size(self): # 目前buffer中数据的数量 21 | return len(self.buffer) 22 | 23 | def moving_average(a, window_size): 24 | a = np.array(a) # 先转换为 NumPy 数组 25 | a = np.where(a > 200, 200, a) # 将大于200的值替换为200 26 | cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 27 | middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size 28 | r = np.arange(1, window_size-1, 2) 29 | begin = np.cumsum(a[:window_size-1])[::2] / r 30 | end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1] 31 | return np.concatenate((begin, middle, end)) 32 | 33 | def train_on_policy_agent(env, agent, num_episodes): 34 | return_list = [] 35 | for i in range(10): 36 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 37 | for i_episode in range(int(num_episodes/10)): 38 | episode_return = 0 39 | transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []} 40 | state = env.reset() 41 | done = False 42 | while not done: 43 | action = agent.take_action(state) 44 | next_state, reward, done, _ = env.step(action) 45 | transition_dict['states'].append(state) 46 | transition_dict['actions'].append(action) 47 | transition_dict['next_states'].append(next_state) 48 | transition_dict['rewards'].append(reward) 49 | transition_dict['dones'].append(done) 50 | state = next_state 51 | episode_return += reward 52 | return_list.append(episode_return) 53 | agent.update(transition_dict) 54 | if (i_episode+1) % 10 == 0: 55 | pbar.set_postfix({'episode': '%d' % (num_episodes/10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])}) 56 | pbar.update(1) 57 | return return_list 58 | 59 | def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size): 60 | return_list = [] 61 | for i in range(10): 62 | with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: 63 | for i_episode in range(int(num_episodes/10)): 64 | episode_return = 0 65 | state = env.reset() 66 | done = False 67 | while not done: 68 | action = agent.take_action(state) 69 | next_state, reward, done, _ = env.step(action) 70 | replay_buffer.add(state, action, reward, next_state, done) 71 | state = next_state 72 | episode_return += reward 73 | if replay_buffer.size() > minimal_size: 74 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 75 | transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d} 76 | agent.update(transition_dict) 77 | return_list.append(episode_return) 78 | if (i_episode+1) % 10 == 0: 79 | pbar.set_postfix({'episode': '%d' % 
(num_episodes/10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])}) 80 | pbar.update(1) 81 | return return_list 82 | 83 | 84 | def compute_advantage(gamma, lmbda, td_delta): 85 | td_delta = td_delta.detach().numpy() 86 | advantage_list = [] 87 | advantage = 0.0 88 | for delta in td_delta[::-1]: 89 | advantage = gamma * lmbda * advantage + delta 90 | advantage_list.append(advantage) 91 | advantage_list.reverse() 92 | return torch.tensor(advantage_list, dtype=torch.float) 93 | -------------------------------------------------------------------------------- /动手学强化学习/策略梯度/Reinforce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | class PolicyNet(torch.nn.Module): 5 | def __init__(self, state_dim, hidden_dim, action_dim): 6 | super(PolicyNet, self).__init__() 7 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 8 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 9 | 10 | def forward(self, x): 11 | x = F.relu(self.fc1(x)) 12 | return F.softmax(self.fc2(x), dim=1) 13 | 14 | class REINFORCE: 15 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 16 | device): 17 | self.policy_net = PolicyNet(state_dim, hidden_dim, 18 | action_dim).to(device) 19 | self.optimizer = torch.optim.Adam(self.policy_net.parameters(), 20 | lr=learning_rate) # 使用Adam优化器 21 | self.gamma = gamma # 折扣因子 22 | self.device = device 23 | 24 | def take_action(self, state): # 根据动作概率分布随机采样 25 | state = torch.tensor([state], dtype=torch.float).to(self.device) 26 | probs = self.policy_net(state) 27 | action_dist = torch.distributions.Categorical(probs) 28 | action = action_dist.sample() 29 | return action.item() 30 | 31 | def update(self, transition_dict): 32 | reward_list = transition_dict['rewards'] 33 | state_list = transition_dict['states'] 34 | action_list = transition_dict['actions'] 35 | 36 | G = 0 37 | self.optimizer.zero_grad() 38 | for i in reversed(range(len(reward_list))): # 从最后一步算起 39 | reward = reward_list[i] 40 | state = torch.tensor([state_list[i]], 41 | dtype=torch.float).to(self.device) 42 | action = torch.tensor([action_list[i]]).view(-1, 1).to(self.device) 43 | log_prob = torch.log(self.policy_net(state).gather(1, action)) 44 | G = self.gamma * G + reward 45 | loss = -log_prob * G # 每一步的损失函数 46 | loss.backward() # 反向传播计算梯度 47 | self.optimizer.step() # 梯度下降 -------------------------------------------------------------------------------- /动手学强化学习/策略梯度/display.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ronchy2000/Multi-agent-RL/5d8471456b91a4f9a8f0b24444e4156b57c24c37/动手学强化学习/策略梯度/display.py -------------------------------------------------------------------------------- /动手学强化学习/策略梯度/main.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from tqdm import tqdm 7 | from Reinforce import * 8 | 9 | # 将上级目录添加到 sys.path 10 | import sys 11 | import os 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | import rl_utils 14 | 15 | learning_rate = 1e-3 16 | num_episodes = 1000 17 | hidden_dim = 128 18 | gamma = 0.98 19 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device( 20 | "cpu") 21 | print(f"Using device: {device}") 22 | 23 | env_name = "CartPole-v0" 24 | env = 
gym.make(env_name) 25 | env.reset(seed = 0) 26 | torch.manual_seed(0) 27 | state_dim = env.observation_space.shape[0] 28 | action_dim = env.action_space.n 29 | agent = REINFORCE(state_dim, hidden_dim, action_dim, learning_rate, gamma, 30 | device) 31 | 32 | return_list = [] 33 | for i in range(10): 34 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 35 | for i_episode in range(int(num_episodes / 10)): 36 | episode_return = 0 37 | transition_dict = { 38 | 'states': [], 39 | 'actions': [], 40 | 'next_states': [], 41 | 'rewards': [], 42 | 'dones': [] 43 | } 44 | state, *_ = env.reset() 45 | done = False 46 | while not done: 47 | action = agent.take_action(state) 48 | next_state, reward, done, *_ = env.step(action) 49 | transition_dict['states'].append(state) 50 | transition_dict['actions'].append(action) 51 | transition_dict['next_states'].append(next_state) 52 | transition_dict['rewards'].append(reward) 53 | transition_dict['dones'].append(done) 54 | state = next_state 55 | episode_return += reward 56 | return_list.append(episode_return) 57 | agent.update(transition_dict) 58 | if (i_episode + 1) % 10 == 0: 59 | pbar.set_postfix({ 60 | 'episode': 61 | '%d' % (num_episodes / 10 * i + i_episode + 1), 62 | 'return': 63 | '%.3f' % np.mean(return_list[-10:]) 64 | }) 65 | pbar.update(1) 66 | 67 | episodes_list = list(range(len(return_list))) 68 | plt.plot(episodes_list, return_list) 69 | plt.xlabel('Episodes') 70 | plt.ylabel('Returns') 71 | plt.title('REINFORCE on {}'.format(env_name)) 72 | plt.show() 73 | 74 | mv_return = rl_utils.moving_average(return_list, 9) 75 | plt.plot(episodes_list, mv_return) 76 | plt.xlabel('Episodes') 77 | plt.ylabel('Returns') 78 | plt.title('REINFORCE on {}'.format(env_name)) 79 | plt.show() --------------------------------------------------------------------------------
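A minimal evaluation sketch for the trained REINFORCE agent, in the same spirit as the DQN display.py shown earlier, follows. It is a sketch under stated assumptions rather than the repository's own 策略梯度/display.py: it assumes the policy network is saved after training (for example with torch.save(agent.policy_net.state_dict(), 'reinforce_cartpole.pth'); that checkpoint name is a hypothetical placeholder, since main.py above does not save weights) and it follows the gymnasium API used in main.py.

import gymnasium as gym
import torch
from Reinforce import REINFORCE

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Recreate the CartPole environment with on-screen rendering (gymnasium API, as in main.py).
env = gym.make("CartPole-v0", render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Rebuild the agent with the same hyperparameters used for training in main.py.
agent = REINFORCE(state_dim, hidden_dim=128, action_dim=action_dim,
                  learning_rate=1e-3, gamma=0.98, device=device)
# Hypothetical checkpoint name; add a matching torch.save(...) call to main.py first.
agent.policy_net.load_state_dict(torch.load('reinforce_cartpole.pth', map_location=device))

state, *_ = env.reset(seed=0)
done = False
episode_return = 0
while not done:
    action = agent.take_action(state)  # sample an action from the learned policy
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated     # stop on termination or time-limit truncation
    episode_return += reward

print('REINFORCE return:', episode_return)
env.close()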