├── .DS_Store ├── README.md ├── ac ├── CartPole-v0 │ ├── 10core_actor_loss_curve.png │ ├── 10core_critic_loss_curve.png │ ├── 10core_reward_curve.png │ ├── 1core_reward_curve.png │ ├── 2core_reward_curve.png │ ├── 3core_reward_curve.png │ ├── A2C.gif │ ├── A2C.pth │ ├── A3C.gif │ ├── A3C.pth │ ├── AC.gif │ ├── AC.pth │ ├── actor_loss_curve.png │ ├── critic_loss_curve.png │ ├── reward_curve(A2C).png │ ├── reward_curve(A3C).png │ └── reward_curve(AC).png ├── README.md ├── a2c.py ├── a2c_target.py ├── a3c.py ├── ac.py ├── ac_target.py ├── sacwalker.py └── test_env.py ├── custom_env ├── README.md └── snake_env.py ├── ddpg ├── ddpg.py └── td3.py ├── deep_learning ├── 作业1-傅上峰.ipynb ├── 作业1-廖子兴.ipynb ├── 作业1-邓成杰.ipynb ├── 作业1-邹庆翔.ipynb ├── 作业1——手写数字识别.ipynb └── 喻风-minst.ipynb ├── dqn ├── ddqn.py ├── ddrqn.py ├── dqn.py └── drqn.py ├── pg ├── reinforce.py └── reinforce_baseline.py ├── ppo ├── .idea │ ├── .gitignore │ ├── RL_algorithm.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml ├── BipedalWalker-v3 │ ├── 2024_01_30_22_54_BipedalWalker-v3.pth │ ├── 2024_01_30_23_01_BipedalWalker-v3.pth │ ├── 2024_01_30_23_05_BipedalWalker-v3.pth │ ├── 2024_01_30_23_16_BipedalWalker-v3.pth │ ├── 2024_01_30_23_17_BipedalWalker-v3.pth │ ├── 2024_01_30_23_26_BipedalWalker-v3.pth │ ├── 2024_01_30_23_32_BipedalWalker-v3.pth │ ├── 2024_01_30_23_35_BipedalWalker-v3.pth │ ├── 2024_01_30_23_39_BipedalWalker-v3.pth │ ├── 2024_01_30_23_40_BipedalWalker-v3.pth │ ├── 2024_01_30_23_50_BipedalWalker-v3.pth │ ├── 2024_01_31_00_09_BipedalWalker-v3.pth │ ├── 2024_01_31_00_11_BipedalWalker-v3.pth │ ├── 2024_01_31_00_15_BipedalWalker-v3.pth │ ├── 2024_02_01_10_58_BipedalWalker-v3.pth │ ├── 2024_02_01_10_59_BipedalWalker-v3.pth │ ├── 2024_02_01_11_00_BipedalWalker-v3.pth │ ├── 2024_02_01_11_01_BipedalWalker-v3.pth │ ├── 2024_02_01_11_02_BipedalWalker-v3.pth │ ├── 2024_02_01_11_03_BipedalWalker-v3.pth │ ├── 2024_02_01_11_04_BipedalWalker-v3.pth │ ├── 2024_02_01_11_06_BipedalWalker-v3.pth │ ├── 2024_02_01_11_07_BipedalWalker-v3.pth │ ├── 2024_02_01_11_08_BipedalWalker-v3.pth │ ├── 2024_02_01_15_41_BipedalWalker-v3.pth │ ├── 2024_02_01_15_48_BipedalWalker-v3.pth │ ├── 2024_02_01_15_49_BipedalWalker-v3.pth │ ├── 2024_02_01_15_50_BipedalWalker-v3.pth │ ├── 2024_02_01_15_51_BipedalWalker-v3.pth │ ├── 2024_02_01_15_53_BipedalWalker-v3.pth │ ├── 2024_02_01_15_54_BipedalWalker-v3.pth │ ├── 2024_02_01_15_59_BipedalWalker-v3.pth │ ├── 2024_02_01_16_23_BipedalWalker-v3.pth │ ├── 2024_02_01_16_25_BipedalWalker-v3.pth │ ├── 2024_02_01_16_26_BipedalWalker-v3.pth │ ├── 2024_02_01_16_27_BipedalWalker-v3.pth │ ├── 2024_02_01_16_28_BipedalWalker-v3.pth │ ├── 2024_02_01_16_29_BipedalWalker-v3.pth │ ├── 2024_02_01_16_30_BipedalWalker-v3.pth │ ├── 2024_02_01_16_31_BipedalWalker-v3.pth │ ├── 2024_02_01_16_32_BipedalWalker-v3.pth │ ├── 2024_02_01_16_33_BipedalWalker-v3.pth │ ├── 2024_02_01_16_34_BipedalWalker-v3.pth │ ├── 2024_02_01_16_35_BipedalWalker-v3.pth │ ├── 2024_02_01_16_36_BipedalWalker-v3.pth │ ├── 2024_02_01_16_42_BipedalWalker-v3.pth │ ├── 2024_02_01_17_53_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_17_55_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_17_56_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_17_58_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_17_59_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_00_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_02_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_05_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_06_BipedalWalkerHardcore-v3.pth │ ├── 
2024_02_01_18_07_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_08_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_09_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_12_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_13_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_24_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_41_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_45_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_51_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_18_56_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_19_11_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_19_19_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_19_26_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_19_44_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_19_52_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_19_56_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_20_12_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_20_16_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_20_18_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_20_22_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_21_10_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_21_14_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_21_25_BipedalWalkerHardcore-v3.pth │ ├── 2024_02_01_21_30_BipedalWalkerHardcore-v3.pth │ └── 2024_02_01_21_54_BipedalWalkerHardcore-v3.pth ├── BipedwalkerHardcoreTest.py ├── MultiFollow │ ├── 2024_02_24_20_31_Follow.pth │ └── 2024_02_24_20_32.gif ├── custom_env │ ├── MultiAgentFollowEnvV1.py │ ├── MultiAgentFollowEnvV2.py │ ├── Walker_Discreate.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── MultiAgentFollowEnvV2.cpython-38.pyc │ │ ├── MultiAgentFollowEnvV3.cpython-38.pyc │ │ ├── Walker_Discreate.cpython-38.pyc │ │ ├── __init__.cpython-38.pyc │ │ └── snake_env.cpython-38.pyc │ └── snake_env.py ├── ppo.py ├── ppo_continuous.py ├── ppo_discrete.py ├── ppo_discretemask.py ├── ppo_multiagentfollow.py ├── ppo_multidiscrete.py ├── ppo_multienv.py ├── ppo_sharing.py ├── ppo_snake.py ├── ppoconPendulum.py ├── ppodisPendulum.py ├── runs │ ├── BipedalWalker-v3__ppo_continuous__1__1706597051 │ │ └── events.out.tfevents.1706597051.SKY-20230422NIZ.12216.0 │ ├── BipedalWalker-v3__ppo_continuous__1__1706771659 │ │ └── events.out.tfevents.1706771659.DESKTOP-VAT73EM.12564.0 │ ├── BipedalWalker-v3__ppo_multienv__1__1706755654 │ │ └── events.out.tfevents.1706755654.DESKTOP-VAT73EM.4120.0 │ ├── BipedalWalker-v3__ppo_multienv__1__1706774718 │ │ └── events.out.tfevents.1706774718.DESKTOP-VAT73EM.11980.0 │ ├── BipedalWalkerHardcore-v3__ppo_multienv__1__1706780430 │ │ └── events.out.tfevents.1706780430.DESKTOP-VAT73EM.12616.0 │ ├── Follow__ppo_sharing__1__1708769402 │ │ └── events.out.tfevents.1708769402.LAPTOP-K5GRC2HU.20764.0 │ ├── Snake-v0__ppo_discretemask__1__1706672792 │ │ └── events.out.tfevents.1706672792.SKY-20230422NIZ.11492.0 │ ├── Snake-v0__ppo_discretemask__1__1707018799 │ │ └── events.out.tfevents.1707018799.DESKTOP-VAT73EM.4348.0 │ ├── Snake-v0__ppo_snake__1__1706623864 │ │ └── events.out.tfevents.1706623864.SKY-20230422NIZ.676.0 │ ├── Snake-v0__ppo_snake__1__1708424259 │ │ └── events.out.tfevents.1708424259.LAPTOP-K5GRC2HU.22668.0 │ ├── Walker__ppomultidiscrete__1__1706597666 │ │ └── events.out.tfevents.1706597666.SKY-20230422NIZ.10160.0 │ └── runs.rar ├── snake-v0 │ ├── 2024_01_30_22_14_Snake-v0.pth │ ├── 2024_01_30_22_19_Snake-v0.pth │ ├── 2024_01_30_22_22_Snake-v0.pth │ ├── 2024_01_30_22_36_Snake-v0.pth │ ├── 2024_01_30_22_49_Snake-v0.pth │ ├── 2024_01_31_19_26_Snake-v0.pth │ ├── 2024_01_31_19_27_Snake-v0.pth │ └── 2024_01_31_19_28_Snake-v0.pth ├── snake-v0mask │ 
├── 2024_01_31_11_48_Snake-v0.pth │ ├── 2024_01_31_11_49_Snake-v0.pth │ ├── 2024_01_31_11_51_Snake-v0.pth │ ├── 2024_01_31_11_55_Snake-v0.pth │ ├── 2024_01_31_12_00_Snake-v0.pth │ ├── 2024_01_31_12_09_Snake-v0.pth │ ├── 2024_01_31_12_13_Snake-v0.pth │ ├── 2024_01_31_12_31_Snake-v0.pth │ ├── 2024_01_31_12_48_Snake-v0.pth │ ├── 2024_01_31_17_20_Snake-v0.pth │ └── 2024_01_31_17_21_Snake-v0.pth └── test_follow.py ├── requirements.txt ├── result └── gif │ ├── BipedalWalker-v3 │ ├── BipedalWalker-v3.gif │ ├── SAC.png │ ├── SAC.pth │ ├── SACtrick.png │ └── SACtrick.pth │ ├── BipedalWalkerHardcore-v3 │ ├── BipedalWalkerHardcore-v3.gif │ ├── SAC10000.png │ ├── SAC10000.pth │ ├── SAC3000.png │ └── SAC3000.pth │ ├── Snake-v0 │ ├── A2C.gif │ ├── A2C.png │ ├── A2C.pth │ ├── A2C.yaml │ ├── A2C_TARGET.png │ ├── A2C_TARGET.pth │ ├── AC.png │ ├── AC.pth │ ├── AC_TARGET.png │ ├── AC_TARGET.pth │ ├── DDQN.gif │ ├── DDQN.png │ ├── DDQN.pth │ ├── DDRQN.png │ ├── DDRQN.pth │ ├── DQN.png │ ├── DQN.pth │ ├── DRQN.png │ ├── DRQN.pth │ ├── PPO.gif │ ├── PPO.png │ ├── PPO.pth │ ├── REINFORCE.png │ ├── REINFORCE.pth │ ├── REINFORCE_baseline.gif │ ├── REINFORCE_baseline.png │ └── REINFORCE_baseline.pth │ └── Snake-v1 │ ├── A2C.gif │ ├── DDQN.gif │ ├── PPO.gif │ └── REINFORCE_baseline.gif ├── rl_utils.py ├── test_agent.py └── test_env.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Algorithm 3 | 4 | ## Implemented algorithm 5 | 6 | dqn, ddqn, drqn 7 | 8 | reinforce, reinforce with baseline 9 | 10 | ac, ac with target, a2c, a2c with target, a3c, sac 11 | 12 | ppo, ippo, multidiscrete action ppo 13 | 14 | ## To-be-implemented algorithm 15 | 16 | dueling dqn 17 | 18 | trpo 19 | 20 | ddpg 21 | 22 | 23 | # benchmark 24 | 25 | - custom env 26 | 27 | - Snake-0 28 | 29 | - Walker(BipedalWalker-v3 discrete version) 30 | 31 | 32 | - CartPole-v0 33 | 34 | - Pendulum-v1 35 | 36 | - BipedalWalker-v3 37 | 38 | - BipedalWalkerHardcore-v3 39 | 40 | 41 | -------------------------------------------------------------------------------- /ac/CartPole-v0/10core_actor_loss_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/10core_actor_loss_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/10core_critic_loss_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/10core_critic_loss_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/10core_reward_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/10core_reward_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/1core_reward_curve.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/1core_reward_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/2core_reward_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/2core_reward_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/3core_reward_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/3core_reward_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/A2C.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/A2C.gif -------------------------------------------------------------------------------- /ac/CartPole-v0/A2C.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/A2C.pth -------------------------------------------------------------------------------- /ac/CartPole-v0/A3C.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/A3C.gif -------------------------------------------------------------------------------- /ac/CartPole-v0/A3C.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/A3C.pth -------------------------------------------------------------------------------- /ac/CartPole-v0/AC.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/AC.gif -------------------------------------------------------------------------------- /ac/CartPole-v0/AC.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/AC.pth -------------------------------------------------------------------------------- /ac/CartPole-v0/actor_loss_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/actor_loss_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/critic_loss_curve.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/critic_loss_curve.png -------------------------------------------------------------------------------- /ac/CartPole-v0/reward_curve(A2C).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/reward_curve(A2C).png -------------------------------------------------------------------------------- /ac/CartPole-v0/reward_curve(A3C).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/reward_curve(A3C).png -------------------------------------------------------------------------------- /ac/CartPole-v0/reward_curve(AC).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ac/CartPole-v0/reward_curve(AC).png -------------------------------------------------------------------------------- /ac/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This directory implements the Actor-Critic (AC), the Advantage Actor-Critic (A2C) and the Asynchronous Advantage Actor-Critic (A3C) algorithms in the simple CartPole-v0 environment. The implementation is not generic and has so far only been tested in CartPole-v0. 4 | 5 | # How to train 6 | 7 | Run ac.py, a2c.py or a3c.py. 8 | 9 | # How to test 10 | 11 | Run test_env.py. 12 | 13 | # Results 14 | 15 | The figures below show only the reward curves; you can find more in the "ac/CartPole-v0" directory.
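Each plot overlays the raw per-episode return with a sliding-window average. As a minimal sketch of how the smoothed curve is produced (mirroring the `plot_smooth_reward` helper in `ac.py`/`a2c.py`; the stand-in returns array below is only illustrative, and `window_size=100` is that helper's default):

```python
import numpy as np

# Stand-in for the per-episode returns collected during training.
returns = np.random.default_rng(0).uniform(0, 200, size=1000)

# Sliding-window mean that produces the "Smoothed Data Curve" in the plots,
# matching plot_smooth_reward's default window_size=100.
window_size = 100
smoothed = np.convolve(returns, np.ones(window_size) / window_size, mode='valid')
```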
16 | 17 | ![reward_curve(AC)](https://github.com/sunwuzhou03/rl_learning/blob/master/Actor-Critic-Algorithm-Learning/CartPole-v0/reward_curve(AC).png) 18 | 19 | ![reward_curve(A2C)](https://github.com/sunwuzhou03/rl_learning/blob/master/Actor-Critic-Algorithm-Learning/CartPole-v0/reward_curve(A2C).png) 20 | 21 | ![reward_curve(A3C)](https://github.com/sunwuzhou03/rl_learning/blob/master/Actor-Critic-Algorithm-Learning/CartPole-v0/reward_curve(A3C).png) -------------------------------------------------------------------------------- /ac/a2c.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | import yaml 13 | import os 14 | 15 | 16 | def plot_smooth_reward(rewards, 17 | directory="./", 18 | filename="smooth_reward_plot", 19 | y_label='Reward', 20 | x_label='Episode', 21 | window_size=100): 22 | 23 | # 创建目标目录(如果不存在) 24 | os.makedirs(directory, exist_ok=True) 25 | 26 | img_title = filename 27 | # 拼接文件名和扩展名 28 | filename = f"{filename}.png" 29 | 30 | # 构建完整的文件路径 31 | filepath = os.path.join(directory, filename) 32 | 33 | # 计算滑动窗口平均值 34 | smoothed_rewards = np.convolve(rewards, 35 | np.ones(window_size) / window_size, 36 | mode='valid') 37 | 38 | # 绘制原始奖励和平滑奖励曲线 39 | plt.plot(rewards, label='Raw Data Curve') 40 | plt.plot(smoothed_rewards, label='Smoothed Data Curve') 41 | 42 | # 设置图例、标题和轴标签 43 | plt.legend() 44 | plt.title(img_title) 45 | plt.xlabel(x_label) 46 | plt.ylabel(y_label) 47 | 48 | # 保存图像 49 | plt.savefig(filepath) 50 | 51 | # 关闭图像 52 | plt.close() 53 | 54 | 55 | class CriticNet(nn.Module): 56 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 57 | super().__init__() 58 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 59 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 60 | 61 | def forward(self, state): 62 | hidden_state = F.relu(self.fc1(state)) 63 | acion_value = self.fc2(hidden_state) 64 | return acion_value 65 | 66 | 67 | class ActorNet(nn.Module): 68 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 69 | super().__init__() 70 | self.fc1 = nn.Linear(state_dim, hidden_dim) 71 | self.fc2 = nn.Linear(hidden_dim, action_dim) 72 | 73 | def forward(self, state): 74 | 75 | hidden_state = F.relu(self.fc1(state)) 76 | # print(self.fc2(hidden_state)) 77 | probs = F.softmax(self.fc2(hidden_state), dim=1) 78 | return probs 79 | 80 | 81 | class A2C: 82 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 83 | gamma, device) -> None: 84 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 85 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 86 | self.gamma = gamma 87 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 88 | lr=actor_lr) 89 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 90 | lr=critic_lr) 91 | self.device = device 92 | 93 | def save_model(self, save_path='./', filename='model'): 94 | model = {'actor': self.actor, 'critic': self.critic} 95 | torch.save(model, f"{save_path}\\{filename}.pth") 96 | 97 | def load_model(self, load_path): 98 | model = torch.load(load_path) 99 | self.actor = model['actor'] 100 | self.critic = model['critic'] 101 | 102 | def take_action(self, state): 103 | state = torch.tensor([state], dtype=torch.float).to(self.device) 104 | probs = 
self.actor(state) 105 | action_dist = torch.distributions.Categorical(probs) 106 | action = action_dist.sample() 107 | return action.item() 108 | 109 | def evaluate(self, state): 110 | state = torch.tensor([state], dtype=torch.float).to(self.device) 111 | probs = self.actor(state) 112 | action = torch.argmax(probs, dim=1) 113 | return action.item() 114 | 115 | def update(self, transition_dict): 116 | 117 | rewards = torch.tensor(transition_dict['rewards'], 118 | dtype=torch.float).view(-1, 1).to(self.device) 119 | states = torch.tensor(transition_dict['states'], 120 | dtype=torch.float).to(self.device) 121 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 122 | self.device) 123 | next_states = torch.tensor(transition_dict['next_states'], 124 | dtype=torch.float).to(self.device) 125 | next_actions = torch.tensor(transition_dict['next_actions']).view( 126 | -1, 1).to(self.device) 127 | dones = torch.tensor(transition_dict['dones'], 128 | dtype=torch.float).to(self.device).view(-1, 1) 129 | 130 | v_now = self.critic(states).view(-1, 1) 131 | 132 | v_next = self.critic(next_states).view(-1, 1) 133 | 134 | y_now = (self.gamma * v_next * (1 - dones) + rewards).view(-1, 1) 135 | td_delta = y_now - v_now 136 | log_prob = torch.log(self.actor(states).gather(1, actions)) 137 | 138 | actor_loss = torch.mean(-log_prob * td_delta.detach()) 139 | critic_loss = torch.mean(F.mse_loss(y_now.detach(), v_now)) 140 | 141 | self.actor_optimizer.zero_grad() 142 | self.critic_optimizer.zero_grad() 143 | 144 | actor_loss.backward() 145 | critic_loss.backward() 146 | 147 | self.actor_optimizer.step() 148 | self.critic_optimizer.step() 149 | 150 | 151 | if __name__ == "__main__": 152 | 153 | gamma = 0.99 154 | algorithm_name = "A2C" 155 | num_episodes = 1000 156 | actor_lr = 1e-3 157 | critic_lr = 1e-2 158 | env_name = 'CartPole-v0' 159 | hidden_dim = 128 160 | 161 | #选择设备 162 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 163 | 164 | # 注册环境 165 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 166 | 167 | env = gym.make(env_name) 168 | 169 | random.seed(0) 170 | np.random.seed(0) 171 | env.seed(0) 172 | torch.manual_seed(0) 173 | 174 | state_dim = env.observation_space.shape[0] 175 | 176 | action_dim = env.action_space.n 177 | agent = A2C(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, 178 | device) 179 | 180 | return_list = [] 181 | max_reward = 0 182 | start_time = time.time() 183 | flag = 0 184 | for i in range(10): 185 | with tqdm(total=int(num_episodes / 10), 186 | desc='Iteration %d' % i) as pbar: 187 | for i_episodes in range(int(num_episodes / 10)): 188 | episode_return = 0 189 | state = env.reset() 190 | done = False 191 | transition_dict = { 192 | 'states': [], 193 | 'actions': [], 194 | 'next_states': [], 195 | 'next_actions': [], 196 | 'rewards': [], 197 | 'dones': [] 198 | } 199 | 200 | while not done: 201 | action = agent.take_action(state) 202 | next_state, reward, done, _ = env.step(action) 203 | next_action = agent.take_action(next_state) 204 | 205 | transition_dict['states'].append(state) 206 | transition_dict['actions'].append(action) 207 | transition_dict['next_states'].append(next_state) 208 | transition_dict['next_actions'].append(next_action) 209 | transition_dict['rewards'].append(reward) 210 | transition_dict['dones'].append(done) 211 | 212 | state = next_state 213 | episode_return += reward 214 | agent.update(transition_dict) 215 | 216 | return_list.append(episode_return) 217 | plot_smooth_reward(return_list, 
env_name, "reward_curve(A2C)") 218 | if episode_return >= max_reward: 219 | max_reward = episode_return 220 | agent.save_model(env_name, algorithm_name) 221 | 222 | if episode_return >= 200 and flag == 0: 223 | flag = 1 224 | end_time = time.time() 225 | run_time = end_time - start_time 226 | 227 | # 打印程序运行时间 228 | print(f"A2C到达200需要:{run_time}秒") 229 | 230 | if (i_episodes + 1 % 10 == 0): 231 | pbar.set_postfix({ 232 | 'episode': 233 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 234 | 'return': 235 | '%.3f' % np.mean(return_list[-10:]) 236 | }) 237 | pbar.update(1) 238 | -------------------------------------------------------------------------------- /ac/a2c_target.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | class CriticNet(nn.Module): 16 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 17 | super().__init__() 18 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 19 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 20 | 21 | def forward(self, state): 22 | hidden_state = F.relu(self.fc1(state)) 23 | acion_value = self.fc2(hidden_state) 24 | return acion_value 25 | 26 | 27 | class ActorNet(nn.Module): 28 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 29 | super().__init__() 30 | self.fc1 = nn.Linear(state_dim, hidden_dim) 31 | self.fc2 = nn.Linear(hidden_dim, action_dim) 32 | 33 | def forward(self, state): 34 | 35 | hidden_state = F.relu(self.fc1(state)) 36 | # print(self.fc2(hidden_state)) 37 | probs = F.softmax(self.fc2(hidden_state), dim=1) 38 | return probs 39 | 40 | 41 | class A2C_TARGET: 42 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 43 | gamma, tau, device) -> None: 44 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 45 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 46 | self.critic_target = CriticNet(state_dim, hidden_dim, 47 | action_dim).to(device) 48 | self.critic_target.load_state_dict(self.critic.state_dict()) 49 | 50 | self.gamma = gamma 51 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 52 | lr=actor_lr) 53 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 54 | lr=critic_lr) 55 | self.device = device 56 | self.tau = tau 57 | 58 | def save_model(self, save_path='./', filename='model'): 59 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 60 | 61 | def load_model(self, load_path): 62 | self.actor.load_state_dict(torch.load(load_path)) 63 | 64 | def take_action(self, state): 65 | state = torch.tensor([state], dtype=torch.float).to(self.device) 66 | probs = self.actor(state) 67 | action_dist = torch.distributions.Categorical(probs) 68 | action = action_dist.sample() 69 | return action.item() 70 | 71 | def soft_update(self, net, target_net): 72 | for param_target, param in zip(target_net.parameters(), 73 | net.parameters()): 74 | param_target.data.copy_(param_target.data * (1 - self.tau) + 75 | param.data * self.tau) 76 | 77 | def update(self, transition_dict): 78 | rewards = torch.tensor(transition_dict['rewards'], 79 | dtype=torch.float).view(-1, 1).to(self.device) 80 | states = torch.tensor(transition_dict['states'], 81 | dtype=torch.float).to(self.device) 82 
| actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 83 | self.device) 84 | next_states = torch.tensor(transition_dict['next_states'], 85 | dtype=torch.float).to(self.device) 86 | next_actions = torch.tensor(transition_dict['next_actions']).view( 87 | -1, 1).to(self.device) 88 | dones = torch.tensor(transition_dict['dones'], 89 | dtype=torch.float).to(self.device).view(-1, 1) 90 | 91 | v_now = self.critic(states).view(-1, 1) 92 | v_next = self.critic_target(next_states).view(-1, 1) 93 | y_now = (self.gamma * v_next * (1 - dones) + rewards).view(-1, 1) 94 | td_delta = y_now - v_now 95 | log_prob = torch.log(self.actor(states).gather(1, actions)) 96 | 97 | actor_loss = torch.mean(-log_prob * td_delta.detach()) 98 | critic_loss = torch.mean(F.mse_loss(y_now.detach(), v_now)) 99 | 100 | self.actor_optimizer.zero_grad() 101 | self.critic_optimizer.zero_grad() 102 | 103 | actor_loss.backward() 104 | critic_loss.backward() 105 | 106 | self.actor_optimizer.step() 107 | self.critic_optimizer.step() 108 | self.soft_update(self.critic, self.critic_target) 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | env_name = 'Snake-v0' #'CartPole-v0' 114 | 115 | # 注册环境 116 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 117 | 118 | env = gym.make(env_name) 119 | 120 | random.seed(0) 121 | np.random.seed(0) 122 | env.seed(0) 123 | torch.manual_seed(0) 124 | gamma = 0.98 125 | algorithm_name = "A2C_TARGET" 126 | num_episodes = 10000 127 | actor_lr = 1e-3 128 | critic_lr = 3e-3 129 | tau = 5e-3 130 | device = torch.device('cuda') 131 | state_dim = env.observation_space.shape[0] 132 | hidden_dim = 128 133 | action_dim = env.action_space.n 134 | 135 | agent = A2C_TARGET(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 136 | gamma, tau, device) 137 | 138 | return_list = [] 139 | max_reward = 0 140 | for i in range(10): 141 | with tqdm(total=int(num_episodes / 10), 142 | desc='Iteration %d' % i) as pbar: 143 | for i_episodes in range(int(num_episodes / 10)): 144 | episode_return = 0 145 | state = env.reset() 146 | done = False 147 | 148 | transition_dict = { 149 | 'states': [], 150 | 'actions': [], 151 | 'next_states': [], 152 | 'next_actions': [], 153 | 'rewards': [], 154 | 'dones': [] 155 | } 156 | 157 | while not done: 158 | action = agent.take_action(state) 159 | next_state, reward, done, _ = env.step(action) 160 | next_action = agent.take_action(next_state) 161 | 162 | transition_dict['states'].append(state) 163 | transition_dict['actions'].append(action) 164 | transition_dict['next_states'].append(next_state) 165 | transition_dict['next_actions'].append(next_action) 166 | transition_dict['rewards'].append(reward) 167 | transition_dict['dones'].append(done) 168 | 169 | state = next_state 170 | episode_return += reward 171 | if i_episodes == int(num_episodes / 10) - 1: 172 | # time.sleep(0.1) 173 | env.render() 174 | 175 | agent.update(transition_dict) 176 | 177 | return_list.append(episode_return) 178 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 179 | 180 | if episode_return > max_reward: 181 | max_reward = episode_return 182 | agent.save_model(env_name, algorithm_name) 183 | if (i_episodes + 1 % 10 == 0): 184 | pbar.set_postfix({ 185 | 'episode': 186 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 187 | 'return': 188 | '%.3f' % np.mean(return_list[-10:]) 189 | }) 190 | pbar.update(1) 191 | -------------------------------------------------------------------------------- /ac/ac.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | import os 13 | 14 | 15 | def plot_smooth_reward(rewards, 16 | directory="./", 17 | filename="smooth_reward_plot", 18 | y_label='Reward', 19 | x_label='Episode', 20 | window_size=100): 21 | 22 | # 创建目标目录(如果不存在) 23 | os.makedirs(directory, exist_ok=True) 24 | 25 | img_title = filename 26 | # 拼接文件名和扩展名 27 | filename = f"{filename}.png" 28 | 29 | # 构建完整的文件路径 30 | filepath = os.path.join(directory, filename) 31 | 32 | # 计算滑动窗口平均值 33 | smoothed_rewards = np.convolve(rewards, 34 | np.ones(window_size) / window_size, 35 | mode='valid') 36 | 37 | # 绘制原始奖励和平滑奖励曲线 38 | plt.plot(rewards, label='Raw Data Curve') 39 | plt.plot(smoothed_rewards, label='Smoothed Data Curve') 40 | 41 | # 设置图例、标题和轴标签 42 | plt.legend() 43 | plt.title(img_title) 44 | plt.xlabel(x_label) 45 | plt.ylabel(y_label) 46 | 47 | # 保存图像 48 | plt.savefig(filepath) 49 | 50 | # 关闭图像 51 | plt.close() 52 | 53 | 54 | class CriticNet(nn.Module): 55 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 56 | super().__init__() 57 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 58 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 59 | 60 | def forward(self, state): 61 | hidden_state = F.relu(self.fc1(state)) 62 | acion_value = self.fc2(hidden_state) 63 | return acion_value 64 | 65 | 66 | class ActorNet(nn.Module): 67 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 68 | super().__init__() 69 | self.fc1 = nn.Linear(state_dim, hidden_dim) 70 | self.fc2 = nn.Linear(hidden_dim, action_dim) 71 | 72 | def forward(self, state): 73 | 74 | hidden_state = F.relu(self.fc1(state)) 75 | # print(self.fc2(hidden_state)) 76 | probs = F.softmax(self.fc2(hidden_state), dim=1) 77 | return probs 78 | 79 | 80 | class AC: 81 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 82 | gamma, device) -> None: 83 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 84 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 85 | self.gamma = gamma 86 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 87 | lr=actor_lr) 88 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 89 | lr=critic_lr) 90 | self.device = device 91 | 92 | def save_model(self, save_path='./', filename='model'): 93 | model = {'actor': self.actor, 'critic': self.critic} 94 | torch.save(model, f"{save_path}\\{filename}.pth") 95 | 96 | def load_model(self, load_path): 97 | model = torch.load(load_path) 98 | self.actor = model['actor'] 99 | self.critic = model['critic'] 100 | 101 | def take_action(self, state): 102 | state = torch.tensor([state], dtype=torch.float).to(self.device) 103 | probs = self.actor(state) 104 | action_dist = torch.distributions.Categorical(probs) 105 | action = action_dist.sample() 106 | return action.item() 107 | 108 | def evaluate(self, state): 109 | state = torch.tensor([state], dtype=torch.float).to(self.device) 110 | probs = self.actor(state) 111 | action = torch.argmax(probs, dim=1) 112 | return action.item() 113 | 114 | def update(self, transition_dict): 115 | rewards = torch.tensor(transition_dict['rewards'], 116 | dtype=torch.float).view(-1, 1).to(self.device) 117 | states = torch.tensor(transition_dict['states'], 118 
| dtype=torch.float).to(self.device) 119 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 120 | self.device) 121 | next_states = torch.tensor(transition_dict['next_states'], 122 | dtype=torch.float).to(self.device) 123 | next_actions = torch.tensor(transition_dict['next_actions']).view( 124 | -1, 1).to(self.device) 125 | dones = torch.tensor(transition_dict['dones'], 126 | dtype=torch.float).to(self.device).view(-1, 1) 127 | 128 | q_now = self.critic(states).gather(1, actions).view(-1, 1) 129 | q_next = self.critic(next_states).gather(1, next_actions).view(-1, 1) 130 | 131 | y_now = (self.gamma * q_next * (1 - dones) + rewards).view(-1, 1) 132 | td_delta = y_now - q_now 133 | log_prob = torch.log(self.actor(states).gather(1, actions)) 134 | actor_loss = torch.mean(-log_prob * td_delta.detach()) 135 | critic_loss = torch.mean(F.mse_loss(y_now.detach(), q_now)) 136 | 137 | self.actor_optimizer.zero_grad() 138 | self.critic_optimizer.zero_grad() 139 | 140 | actor_loss.backward() 141 | critic_loss.backward() 142 | 143 | self.actor_optimizer.step() 144 | self.critic_optimizer.step() 145 | 146 | 147 | if __name__ == "__main__": 148 | 149 | gamma = 0.98 150 | algorithm_name = "AC" 151 | num_episodes = 1000 152 | 153 | actor_lr = 5e-3 154 | critic_lr = 3e-4 155 | print(algorithm_name, actor_lr, critic_lr) 156 | device = torch.device('cuda') 157 | 158 | env_name = 'CartPole-v0' 159 | 160 | env = gym.make(env_name) 161 | 162 | random.seed(0) 163 | np.random.seed(0) 164 | env.seed(0) 165 | torch.manual_seed(0) 166 | 167 | state_dim = env.observation_space.shape[0] 168 | hidden_dim = 128 169 | action_dim = env.action_space.n 170 | 171 | agent = AC(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, 172 | device) 173 | 174 | return_list = [] 175 | max_reward = 0 176 | for i in range(10): 177 | with tqdm(total=int(num_episodes / 10), 178 | desc='Iteration %d' % i) as pbar: 179 | for i_episodes in range(int(num_episodes / 10)): 180 | episode_return = 0 181 | state = env.reset() 182 | done = False 183 | 184 | transition_dict = { 185 | 'states': [], 186 | 'actions': [], 187 | 'next_states': [], 188 | 'next_actions': [], 189 | 'rewards': [], 190 | 'dones': [] 191 | } 192 | while not done: 193 | action = agent.take_action(state) 194 | next_state, reward, done, _ = env.step(action) 195 | next_action = agent.take_action(next_state) 196 | 197 | transition_dict['states'].append(state) 198 | transition_dict['actions'].append(action) 199 | transition_dict['next_states'].append(next_state) 200 | transition_dict['next_actions'].append(next_action) 201 | transition_dict['rewards'].append(reward) 202 | transition_dict['dones'].append(done) 203 | 204 | state = next_state 205 | episode_return += reward 206 | agent.update(transition_dict) 207 | 208 | return_list.append(episode_return) 209 | plot_smooth_reward(return_list, env_name, "reward_curve(AC)") 210 | 211 | if episode_return >= max_reward: 212 | max_reward = episode_return 213 | agent.save_model(env_name, algorithm_name) 214 | if (i_episodes + 1 % 10 == 0): 215 | pbar.set_postfix({ 216 | 'episode': 217 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 218 | 'return': 219 | '%.3f' % np.mean(return_list[-10:]) 220 | }) 221 | pbar.update(1) 222 | -------------------------------------------------------------------------------- /ac/ac_target.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 
| import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | class CriticNet(nn.Module): 16 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 17 | super().__init__() 18 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 19 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 20 | 21 | def forward(self, state): 22 | hidden_state = F.relu(self.fc1(state)) 23 | acion_value = self.fc2(hidden_state) 24 | return acion_value 25 | 26 | 27 | class ActorNet(nn.Module): 28 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 29 | super().__init__() 30 | self.fc1 = nn.Linear(state_dim, hidden_dim) 31 | self.fc2 = nn.Linear(hidden_dim, action_dim) 32 | 33 | def forward(self, state): 34 | 35 | hidden_state = F.relu(self.fc1(state)) 36 | # print(self.fc2(hidden_state)) 37 | probs = F.softmax(self.fc2(hidden_state), dim=1) 38 | return probs 39 | 40 | 41 | class AC_TARGET: 42 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 43 | gamma, tau, device) -> None: 44 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 45 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 46 | self.critic_target = CriticNet(state_dim, hidden_dim, 47 | action_dim).to(device) 48 | self.critic_target.load_state_dict(self.critic.state_dict()) 49 | self.gamma = gamma 50 | self.tau = tau 51 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 52 | lr=actor_lr) 53 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 54 | lr=critic_lr) 55 | self.device = device 56 | 57 | def save_model(self, save_path='./', filename='model'): 58 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 59 | 60 | def load_model(self, load_path): 61 | self.actor.load_state_dict(torch.load(load_path)) 62 | 63 | def take_action(self, state): 64 | state = torch.tensor([state], dtype=torch.float).to(self.device) 65 | probs = self.actor(state) 66 | action_dist = torch.distributions.Categorical(probs) 67 | action = action_dist.sample() 68 | return action.item() 69 | 70 | def soft_update(self, net, target_net): 71 | for param_target, param in zip(target_net.parameters(), 72 | net.parameters()): 73 | param_target.data.copy_(param_target.data * (1 - self.tau) + 74 | param.data * self.tau) 75 | 76 | def update(self, transition_dict): 77 | rewards = torch.tensor(transition_dict['rewards'], 78 | dtype=torch.float).view(-1, 1).to(self.device) 79 | states = torch.tensor(transition_dict['states'], 80 | dtype=torch.float).to(self.device) 81 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 82 | self.device) 83 | next_states = torch.tensor(transition_dict['next_states'], 84 | dtype=torch.float).to(self.device) 85 | next_actions = torch.tensor(transition_dict['next_actions']).view( 86 | -1, 1).to(self.device) 87 | dones = torch.tensor(transition_dict['dones'], 88 | dtype=torch.float).to(self.device).view(-1, 1) 89 | 90 | q_now = self.critic(states).gather(1, actions).view(-1, 1) 91 | q_next = self.critic_target(next_states).gather(1, next_actions).view( 92 | -1, 1) 93 | 94 | y_now = (self.gamma * q_next + rewards).view(-1, 1) 95 | # print(y_now) 96 | # print(q_next) 97 | td_delta = y_now - q_now 98 | log_prob = torch.log(self.actor(states).gather(1, actions)) 99 | # print(log_prob) 100 | # print(action) 101 | actor_loss = torch.mean(-log_prob * td_delta.detach()) 102 | critic_loss = 
torch.mean(F.mse_loss(y_now.detach(), q_now)) 103 | self.actor_optimizer.zero_grad() 104 | self.critic_optimizer.zero_grad() 105 | actor_loss.backward() 106 | critic_loss.backward() 107 | self.actor_optimizer.step() 108 | self.critic_optimizer.step() 109 | 110 | self.soft_update(self.critic, self.critic_target) 111 | 112 | 113 | if __name__ == "__main__": 114 | 115 | gamma = 0.99 116 | algorithm_name = "AC_TARGET" 117 | num_episodes = 5000 118 | 119 | actor_lr = 1e-3 120 | critic_lr = 3e-3 121 | print(algorithm_name, actor_lr, critic_lr) 122 | device = torch.device('cuda') 123 | 124 | env_name = 'Snake-v0' #'CartPole-v0' 125 | 126 | # 注册环境 127 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 128 | 129 | env = gym.make(env_name) 130 | 131 | random.seed(0) 132 | np.random.seed(0) 133 | env.seed(0) 134 | torch.manual_seed(0) 135 | 136 | state_dim = env.observation_space.shape[0] 137 | hidden_dim = 128 138 | action_dim = env.action_space.n 139 | tau = 5e-3 140 | agent = AC(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, 141 | tau, device) 142 | 143 | return_list = [] 144 | max_reward = 0 145 | for i in range(20): 146 | with tqdm(total=int(num_episodes / 10), 147 | desc='Iteration %d' % i) as pbar: 148 | for i_episodes in range(int(num_episodes / 10)): 149 | episode_return = 0 150 | state = env.reset() 151 | done = False 152 | 153 | transition_dict = { 154 | 'states': [], 155 | 'actions': [], 156 | 'next_states': [], 157 | 'next_actions': [], 158 | 'rewards': [], 159 | 'dones': [] 160 | } 161 | 162 | while not done: 163 | action = agent.take_action(state) 164 | next_state, reward, done, _ = env.step(action) 165 | next_action = agent.take_action(next_state) 166 | env.render() 167 | 168 | transition_dict['states'].append(state) 169 | transition_dict['actions'].append(action) 170 | transition_dict['next_states'].append(next_state) 171 | transition_dict['next_actions'].append(next_action) 172 | transition_dict['rewards'].append(reward) 173 | transition_dict['dones'].append(done) 174 | 175 | state = next_state 176 | episode_return += reward 177 | if i_episodes == int(num_episodes / 10) - 1: 178 | time.sleep(0.1) 179 | agent.update(transition_dict) 180 | return_list.append(episode_return) 181 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 182 | if episode_return > max_reward: 183 | max_reward = episode_return 184 | agent.save_model(env_name, algorithm_name) 185 | if (i_episodes + 1 % 10 == 0): 186 | pbar.set_postfix({ 187 | 'episode': 188 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 189 | 'return': 190 | '%.3f' % np.mean(return_list[-10:]) 191 | }) 192 | pbar.update(1) 193 | -------------------------------------------------------------------------------- /ac/sacwalker.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import gym 4 | import numpy as np 5 | from tqdm import tqdm 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.distributions import Normal 9 | import matplotlib.pyplot as plt 10 | import rl_utils 11 | 12 | 13 | class PolicyNetContinuous(torch.nn.Module): 14 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound): 15 | super(PolicyNetContinuous, self).__init__() 16 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 17 | self.fc_mu = torch.nn.Linear(hidden_dim, action_dim) 18 | self.fc_std = torch.nn.Linear(hidden_dim, action_dim) 19 | self.action_bound = action_bound 20 | 21 | def forward(self, x): 22 | x = 
F.relu(self.fc1(x)) 23 | mu = self.fc_mu(x) 24 | std = F.softplus(self.fc_std(x)) 25 | dist = Normal(mu, std) 26 | normal_sample = dist.rsample() # rsample()是重参数化采样 27 | 28 | action = torch.tanh(normal_sample) 29 | # 计算tanh_normal分布的对数概率密度 30 | log_pi = dist.log_prob(normal_sample).sum(dim=1, keepdim=True) 31 | log_pi -= ( 32 | 2 * 33 | (np.log(2) - normal_sample - F.softplus(-2 * normal_sample))).sum( 34 | dim=1, keepdim=True) 35 | 36 | action = action * self.action_bound 37 | 38 | return action, log_pi 39 | 40 | 41 | class QValueNetContinuous(torch.nn.Module): 42 | def __init__(self, state_dim, hidden_dim, action_dim): 43 | super(QValueNetContinuous, self).__init__() 44 | self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim) 45 | self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim) 46 | self.fc_out = torch.nn.Linear(hidden_dim, 1) 47 | 48 | def forward(self, x, a): 49 | cat = torch.cat([x, a], dim=1) 50 | x = F.relu(self.fc1(cat)) 51 | x = F.relu(self.fc2(x)) 52 | return self.fc_out(x) 53 | 54 | 55 | class SACContinuous: 56 | ''' 处理连续动作的SAC算法 ''' 57 | def __init__(self, state_dim, hidden_dim, action_dim, action_bound, 58 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma, 59 | device): 60 | self.actor = PolicyNetContinuous(state_dim, hidden_dim, action_dim, 61 | action_bound).to(device) # 策略网络 62 | self.critic_1 = QValueNetContinuous(state_dim, hidden_dim, 63 | action_dim).to(device) # 第一个Q网络 64 | self.critic_2 = QValueNetContinuous(state_dim, hidden_dim, 65 | action_dim).to(device) # 第二个Q网络 66 | self.target_critic_1 = QValueNetContinuous(state_dim, 67 | hidden_dim, action_dim).to( 68 | device) # 第一个目标Q网络 69 | self.target_critic_2 = QValueNetContinuous(state_dim, 70 | hidden_dim, action_dim).to( 71 | device) # 第二个目标Q网络 72 | # 令目标Q网络的初始参数和Q网络一样 73 | self.target_critic_1.load_state_dict(self.critic_1.state_dict()) 74 | self.target_critic_2.load_state_dict(self.critic_2.state_dict()) 75 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 76 | lr=actor_lr) 77 | self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(), 78 | lr=critic_lr) 79 | self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(), 80 | lr=critic_lr) 81 | # 使用alpha的log值,可以使训练结果比较稳定 82 | self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float) 83 | self.log_alpha.requires_grad = True # 可以对alpha求梯度 84 | self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], 85 | lr=alpha_lr) 86 | self.target_entropy = target_entropy # 目标熵的大小 87 | self.gamma = gamma 88 | self.tau = tau 89 | self.device = device 90 | 91 | def save_model(self, save_path='./', filename='model'): 92 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 93 | 94 | def load_model(self, load_path): 95 | self.actor.load_state_dict(torch.load(load_path)) 96 | 97 | def take_action(self, state): 98 | state = torch.tensor([state], dtype=torch.float).to(self.device) 99 | action = self.actor(state)[0] 100 | # print(action.detach().cpu().numpy()) 101 | return action.detach().cpu().numpy().flatten() 102 | 103 | def calc_target(self, rewards, next_states, dones): # 计算目标Q值 104 | next_actions, log_prob = self.actor(next_states) 105 | entropy = -log_prob 106 | q1_value = self.target_critic_1(next_states, next_actions) 107 | q2_value = self.target_critic_2(next_states, next_actions) 108 | 109 | next_value = torch.min(q1_value, 110 | q2_value) + self.log_alpha.exp() * entropy 111 | td_target = rewards + self.gamma * next_value * (1 - dones) 112 | return td_target 113 | 114 | def soft_update(self, 
net, target_net): 115 | for param_target, param in zip(target_net.parameters(), 116 | net.parameters()): 117 | param_target.data.copy_(param_target.data * (1.0 - self.tau) + 118 | param.data * self.tau) 119 | 120 | def update(self, transition_dict): 121 | states = torch.tensor(transition_dict['states'], 122 | dtype=torch.float).to(self.device) 123 | actions = torch.tensor(transition_dict['actions'], 124 | dtype=torch.float).view(-1, 4).to(self.device) 125 | rewards = torch.tensor(transition_dict['rewards'], 126 | dtype=torch.float).view(-1, 1).to(self.device) 127 | next_states = torch.tensor(transition_dict['next_states'], 128 | dtype=torch.float).to(self.device) 129 | dones = torch.tensor(transition_dict['dones'], 130 | dtype=torch.float).view(-1, 1).to(self.device) 131 | 132 | # 更新两个Q网络 133 | td_target = self.calc_target(rewards, next_states, dones) 134 | # print(self.critic_1(states, actions)) 135 | # print(td_target.detach()) 136 | 137 | critic_1_loss = torch.mean( 138 | F.mse_loss(self.critic_1(states, actions), td_target.detach())) 139 | critic_2_loss = torch.mean( 140 | F.mse_loss(self.critic_2(states, actions), td_target.detach())) 141 | self.critic_1_optimizer.zero_grad() 142 | critic_1_loss.backward() 143 | self.critic_1_optimizer.step() 144 | self.critic_2_optimizer.zero_grad() 145 | critic_2_loss.backward() 146 | self.critic_2_optimizer.step() 147 | 148 | # 更新策略网络 149 | new_actions, log_prob = self.actor(states) 150 | entropy = -log_prob 151 | q1_value = self.critic_1(states, new_actions) 152 | q2_value = self.critic_2(states, new_actions) 153 | actor_loss = torch.mean(-self.log_alpha.exp() * entropy - 154 | torch.min(q1_value, q2_value)) 155 | self.actor_optimizer.zero_grad() 156 | actor_loss.backward() 157 | self.actor_optimizer.step() 158 | 159 | # 更新alpha值 160 | alpha_loss = torch.mean( 161 | (entropy - self.target_entropy).detach() * self.log_alpha.exp()) 162 | self.log_alpha_optimizer.zero_grad() 163 | alpha_loss.backward() 164 | self.log_alpha_optimizer.step() 165 | 166 | self.soft_update(self.critic_1, self.target_critic_1) 167 | self.soft_update(self.critic_2, self.target_critic_2) 168 | 169 | 170 | class ReplayBuffer: 171 | def __init__(self, capacity) -> None: 172 | self.buffer = collections.deque(maxlen=capacity) 173 | 174 | def add(self, state, action, reward, next_state, done): 175 | self.buffer.append([state, action, reward, next_state, done]) 176 | 177 | def sample(self, batch_size): 178 | transitions = random.sample(self.buffer, batch_size) 179 | states, actions, rewards, next_states, dones = zip(*transitions) 180 | return states, actions, rewards, next_states, dones 181 | 182 | def size(self): 183 | return len(self.buffer) 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | algorithm_name = 'SAC_correct' 189 | env_name = 'BipedalWalker-v3' 190 | env = gym.make(env_name) 191 | state_dim = env.observation_space.shape[0] 192 | action_dim = env.action_space.shape[0] 193 | action_bound = env.action_space.high[0] # 动作最大值 194 | 195 | random.seed(0) 196 | np.random.seed(0) 197 | env.seed(0) 198 | torch.manual_seed(0) 199 | 200 | actor_lr = 4e-4 201 | critic_lr = 4e-4 202 | alpha_lr = 4e-4 203 | num_episodes = 1000 204 | hidden_dim = 256 205 | gamma = 0.98 206 | tau = 0.01 # 软更新参数 207 | buffer_size = 100000 208 | minimal_size = 1000 209 | batch_size = 64 210 | max_step = 1000 211 | target_entropy = -env.action_space.shape[0] 212 | device = torch.device( 213 | "cuda") if torch.cuda.is_available() else torch.device("cpu") 214 | 215 | replay_buffer = 
ReplayBuffer(buffer_size) 216 | agent = SACContinuous(state_dim, hidden_dim, action_dim, action_bound, 217 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, 218 | gamma, device) 219 | # agent.load_model('BipedalWalker-v3\SACx.pth') 220 | return_list = [] 221 | max_reward = -float('inf') 222 | for i in range(10): 223 | with tqdm(total=int(num_episodes / 10), 224 | desc='Iteration %d' % i) as pbar: 225 | for i_episodes in range(int(num_episodes / 10)): 226 | episode_return = 0 227 | state = env.reset() 228 | done = False 229 | 230 | for _ in range(max_step): 231 | 232 | action = agent.take_action(state) 233 | 234 | next_state, reward, done, _ = env.step(action) 235 | episode_return += reward 236 | done_ = done 237 | if reward <= -100: 238 | done = True 239 | else: 240 | done = False 241 | 242 | if reward == -100: 243 | reward = -1 244 | 245 | if i_episodes == int(num_episodes / 10) - 1: 246 | env.render() 247 | 248 | replay_buffer.add(state, action, reward, next_state, done) 249 | state = next_state 250 | 251 | if replay_buffer.size() > minimal_size: 252 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample( 253 | batch_size) 254 | transition_dict = { 255 | 'states': b_s, 256 | 'actions': b_a, 257 | 'next_states': b_ns, 258 | 'rewards': b_r, 259 | 'dones': b_d 260 | } 261 | 262 | agent.update(transition_dict) 263 | 264 | if done_: 265 | break 266 | 267 | return_list.append(episode_return) 268 | 269 | rl_utils.plot_smooth_reward(return_list, 100, env_name, 270 | algorithm_name) 271 | 272 | if episode_return >= max_reward: 273 | max_reward = episode_return 274 | agent.save_model(env_name, algorithm_name) 275 | rl_utils.plot_smooth_reward(return_list, 100, env_name, 276 | algorithm_name) 277 | 278 | if (i_episodes + 1 % 10 == 0): 279 | pbar.set_postfix({ 280 | 'episode': 281 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 282 | 'max_reward': 283 | '%d' % (max_reward), 284 | 'return': 285 | '%.3f' % np.mean(return_list[-10:]) 286 | }) 287 | pbar.update(1) 288 | -------------------------------------------------------------------------------- /ac/test_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | import yaml 13 | import multiprocessing 14 | import os 15 | import argparse 16 | from a3c import get_args, A3C, ActorNet, CriticNet 17 | from a2c import A2C 18 | from ac import AC 19 | from rl_utils import save_video 20 | 21 | opt = get_args() 22 | env = gym.make(opt.env_name) 23 | 24 | random.seed(0) 25 | np.random.seed(0) 26 | torch.manual_seed(0) 27 | env.seed(0) 28 | 29 | state_dim = env.observation_space.shape[0] 30 | hidden_dim = 128 31 | action_dim = env.action_space.n 32 | 33 | AC_agent = AC(state_dim, hidden_dim, action_dim, opt.actor_lr, opt.critic_lr, 34 | opt.gamma, opt.device) 35 | AC_agent.load_model("CartPole-v0\AC.pth") 36 | AC_agent.actor.cpu() 37 | AC_agent.actor.cpu() 38 | 39 | # use gpu to train A2C algorithm so that we need to cpu() 40 | A2C_agent = A2C(state_dim, hidden_dim, action_dim, opt.actor_lr, opt.critic_lr, 41 | opt.gamma, opt.device) 42 | A2C_agent.load_model("CartPole-v0\A2C.pth") 43 | A2C_agent.actor.cpu() 44 | A2C_agent.actor.cpu() 45 | 46 | A3C_agent = A3C(state_dim, hidden_dim, action_dim, opt.actor_lr, opt.critic_lr, 47 | opt.gamma, opt.device) 48 | 
A3C_agent.load_model("CartPole-v0\A3C.pth") 49 | 50 | print("test AC") 51 | print("-----------------------------------------") 52 | state = env.reset() 53 | done = False 54 | frame_list = [] 55 | total_reward = 0 56 | step = 0 57 | max_step = 200 58 | while step < max_step: 59 | action = AC_agent.evaluate(state) # greedy action from the trained AC policy 60 | state, reward, done, info = env.step(action) 61 | rendered_image = env.render('rgb_array') 62 | frame_list.append(rendered_image) 63 | total_reward += reward 64 | step += 1 65 | 66 | save_video(frame_list, "CartPole-v0", "AC", "gif") 67 | print(total_reward) 68 | 69 | print("test A2C") 70 | print("-----------------------------------------") 71 | state = env.reset() 72 | done = False 73 | frame_list = [] 74 | total_reward = 0 75 | step = 0 76 | max_step = 200 77 | while step < max_step: 78 | action = A2C_agent.evaluate(state) # greedy action from the trained A2C policy 79 | state, reward, done, info = env.step(action) 80 | rendered_image = env.render('rgb_array') 81 | frame_list.append(rendered_image) 82 | total_reward += reward 83 | step += 1 84 | 85 | save_video(frame_list, "CartPole-v0", "A2C", "gif") 86 | print(total_reward) 87 | 88 | print("test A3C") 89 | print("-----------------------------------------") 90 | state = env.reset() 91 | done = False 92 | frame_list = [] 93 | total_reward = 0 94 | step = 0 95 | max_step = 200 96 | while step < max_step: 97 | action = A3C_agent.evaluate(state) # greedy action from the trained A3C policy 98 | state, reward, done, info = env.step(action) 99 | rendered_image = env.render('rgb_array') 100 | frame_list.append(rendered_image) 101 | total_reward += reward 102 | step += 1 103 | 104 | save_video(frame_list, "CartPole-v0", "A3C", "gif") 105 | print(total_reward) 106 | -------------------------------------------------------------------------------- /custom_env/README.md: -------------------------------------------------------------------------------- 1 | # rl_learning 2 | 3 | ## Snake-v0 4 | ### state space: 6 5 | (fx-x, fy-y, left, right, up, down) 6 | 7 | (x, y) is the position of the snake head. 8 | 9 | (fx, fy) is the position of the food. 10 | 11 | left indicates whether the left side of the snake’s head is a boundary or part of the snake’s body. left can only be 0 or 1. 12 | 13 | ... 14 | 15 | ### action space: 4 16 | The snake can move in four directions: up, down, left, and right. 17 | 18 | ### Algorithm 19 | #### Implemented algorithm 20 | dqn, ddqn, drqn 21 | 22 | reinforce, reinforce with baseline 23 | 24 | ac, ac with target, a2c, a2c with target 25 | 26 | ppo 27 | 28 | #### To-be-implemented algorithm 29 | dueling dqn 30 | 31 | a3c 32 | 33 | trpo 34 | 35 | ddpg 36 | 37 | sac 38 | 39 | #### Successful results 40 | 41 | !['ppo 20*20'](https://github.com/sunwuzhou03/rl_learning/blob/master/gif/Snake-v1/PPO.gif) 42 | 43 | You can find more successful results in the gif folder. 44 | 45 | ## CartPole-v0 46 | 47 | You can just modify the env_name from 'Snake-v0' to 'CartPole-v0' and adjust the hyper-parameters to train in this environment. 48 | 49 | The programs include: ac, a2c, ac_target, a2c_target, ddqn, ddrqn, dqn, drqn, ppo, reinforce, reinforce_baseline. 50 | 51 | ## Pendulum-v1 52 | 53 | You can run the ppoconPendulum.py or ppodisPendulum.py program to train an agent in this environment. 54 | 55 | ## BipedalWalker-v3 and BipedalWalkerHardcore-v3 56 | 57 | You can just modify the env_name from 'BipedalWalker-v3' to 'BipedalWalkerHardcore-v3' in the program sacwalker.py so that you can train the agent in either environment.
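As a minimal sketch of that switch (mirroring the environment setup lines at the top of sacwalker.py, with no other changes assumed), only the environment name changes while the SAC training loop stays the same:

```python
import gym

# Pick the benchmark: the regular walker or the hardcore variant.
env_name = 'BipedalWalker-v3'
# env_name = 'BipedalWalkerHardcore-v3'

env = gym.make(env_name)
state_dim = env.observation_space.shape[0]   # 24-dimensional observation
action_dim = env.action_space.shape[0]       # 4 continuous joint torques
action_bound = env.action_space.high[0]      # actions are bounded to [-1, 1]
```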
58 | 59 | ### The results 60 | 61 | #### BipedalWalker-v3 62 | 63 | ![BipedalWalker-v3](https://github.com/sunwuzhou03/rl_learning/blob/master/BipedalWalker-v3/BipedalWalker-v3.gif) 64 | 65 | #### BipedalWalkerHardcore-v3 66 | 67 | ![BipedalWalkerHardcore-v3](https://github.com/sunwuzhou03/rl_learning/blob/master/BipedalWalkerHardcore-v3/BipedalWalkerHardcore-v3.gif) 68 | 69 | -------------------------------------------------------------------------------- /custom_env/snake_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | import pygame 5 | import sys 6 | 7 | 8 | class SnakeEnv(gym.Env): 9 | def __init__(self, width=10, height=10): 10 | super(SnakeEnv, self).__init__() 11 | 12 | self.width = width 13 | self.height = height 14 | self.grid = np.zeros((height, width)) 15 | self.snake = [(0, 0)] 16 | self.food = self.generate_food() 17 | self.direction = "right" 18 | self.time = 1 19 | self.cell_size = 40 20 | self.window_size = (self.width * self.cell_size, 21 | self.height * self.cell_size) 22 | # 定义颜色 23 | self.color_bg = (255, 255, 255) 24 | self.color_head = (0, 120, 120) 25 | self.color_snake = (0, 255, 0) 26 | self.color_food = (255, 0, 0) 27 | 28 | # 定义贪吃蛇环境的观测空间和行动空间 29 | inf = self.width + self.height 30 | low = np.array([-inf, -1.0, 0, 0, 0, 0]) # 连续状态空间的最小值 31 | high = np.array([inf, 1.0, 1.0, 1.0, 1.0, 1.0]) # 连续状态空间的最大值 32 | continuous_space = gym.spaces.Box(low, high, shape=(6, ), dtype=float) 33 | self.observation_space = continuous_space 34 | 35 | self.action_space = gym.spaces.Discrete(4) 36 | 37 | def generate_food(self): 38 | while True: 39 | x = random.randint(0, self.width - 1) 40 | y = random.randint(0, self.height - 1) 41 | if (x, y) not in self.snake: 42 | return (x, y) 43 | 44 | def reset(self): 45 | self.grid = np.zeros((self.height, self.width)) 46 | self.snake = [(0, 0)] 47 | self.food = self.generate_food() 48 | self.time = 1 49 | self.direction = "right" 50 | self.update_grid() 51 | 52 | return self.get_state() 53 | 54 | def get_state(self): 55 | state = [] 56 | x, y = self.snake[0] 57 | fx, fy = self.food 58 | xbase = self.width 59 | ybase = self.height 60 | x_norm, y_norm = (fx - x) / xbase, (fy - y) / ybase 61 | state.append(fx - x) 62 | state.append(fy - y) 63 | for gx, gy in [(0, 1), (1, 0), (0, -1), (-1, 0)]: 64 | dx, dy = x + gx, y + gy 65 | if dx < 0 or dy < 0 or dx >= self.width or dy >= self.height or ( 66 | dx, dy) in self.snake: 67 | state.append(0) 68 | continue 69 | else: 70 | state.append(1) #四个方向可以走 71 | return np.array(state) 72 | 73 | def update_grid(self): 74 | self.grid = np.zeros((self.height, self.width)) 75 | x, y = self.snake[0] 76 | self.grid[y, x] = 1 77 | for x, y in self.snake[1:]: 78 | self.grid[y, x] = 2 79 | fx, fy = self.food 80 | self.grid[fy, fx] = 3 81 | 82 | def step(self, action): 83 | x, y = self.snake[0] 84 | if action == 0: # 左 85 | if self.direction != "right": 86 | self.direction = "left" 87 | elif action == 1: # 上 88 | if self.direction != "down": 89 | self.direction = "up" 90 | elif action == 2: # 右 91 | if self.direction != "left": 92 | self.direction = "right" 93 | elif action == 3: # 下 94 | if self.direction != "up": 95 | self.direction = "down" 96 | 97 | if self.direction == "left": 98 | x -= 1 99 | elif self.direction == "up": 100 | y -= 1 101 | elif self.direction == "right": 102 | x += 1 103 | elif self.direction == "down": 104 | y += 1 105 | 106 | if x < 0 or x >= self.width or y < 0 or y >= self.height or ( 107 | 
x, y) in self.snake: 108 | reward = -0.5 - 1 / (len(self.snake)) 109 | self.time = 1 110 | done = True 111 | elif (x, y) == self.food: 112 | reward = 4 + len(self.snake) * 0.1 113 | self.time = 1 114 | self.snake.insert(0, (x, y)) 115 | self.food = self.generate_food() 116 | self.update_grid() 117 | done = False 118 | else: 119 | fx, fy = self.food 120 | d = (abs(x - fx) + abs(y - fy)) 121 | reward = 0.1 * (2 - d) / (self.time) 122 | self.snake.insert(0, (x, y)) 123 | self.snake.pop() 124 | self.update_grid() 125 | done = False 126 | self.time += 1 127 | return self.get_state(), reward, done, {} 128 | 129 | def render(self, mode='human'): 130 | # 初始化pygame窗口 131 | pygame.init() 132 | pygame.font.init() 133 | self.font = pygame.font.Font(None, 30) 134 | self.window = pygame.display.set_mode(self.window_size) 135 | 136 | pygame.display.set_caption("Snake Game") 137 | 138 | if mode == 'rgb_array': 139 | surface = pygame.Surface( 140 | (self.width * self.cell_size, self.height * self.cell_size)) 141 | self.window = surface 142 | 143 | self.window.fill(self.color_bg) 144 | snake_length_text = self.font.render("Length: " + str(len(self.snake)), 145 | True, (0, 25, 25)) 146 | self.window.blit(snake_length_text, (0, 0)) 147 | 148 | for event in pygame.event.get(): 149 | if event.type == pygame.QUIT: 150 | pygame.quit() 151 | sys.exit() 152 | 153 | for y in range(self.height): 154 | for x in range(self.width): 155 | cell_value = self.grid[y, x] 156 | cell_rect = pygame.Rect(x * self.cell_size, y * self.cell_size, 157 | self.cell_size, self.cell_size) 158 | if cell_value == 1: # 贪吃蛇身体 159 | pygame.draw.rect(self.window, self.color_head, cell_rect) 160 | elif cell_value == 2: # 贪吃蛇身体 161 | pygame.draw.rect(self.window, self.color_snake, cell_rect) 162 | elif cell_value == 3: # 食物 163 | pygame.draw.rect(self.window, self.color_food, cell_rect) 164 | 165 | pygame.display.flip() 166 | 167 | if mode == 'rgb_array': 168 | image_array = pygame.surfarray.array3d(self.window) 169 | return image_array 170 | -------------------------------------------------------------------------------- /ddpg/ddpg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ddpg/ddpg.py -------------------------------------------------------------------------------- /ddpg/td3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ddpg/td3.py -------------------------------------------------------------------------------- /dqn/ddqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | from rl_utils import plot_smooth_reward 12 | 13 | 14 | class Qnet(torch.nn.Module): 15 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 16 | super().__init__() 17 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 18 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 19 | 20 | def forward(self, state): 21 | hidden_state = F.relu(self.fc1(state)) 22 | acion_value = self.fc2(hidden_state) 23 | return acion_value 24 | 25 | 26 | class DDQN: 27 | def __init__(self, state_dim, hidden_dim, 
action_dim, learning_rate, gamma, 28 | epsilon, update_target, device) -> None: 29 | self.Q = Qnet(state_dim, hidden_dim, action_dim).to(device) 30 | self.Q_target = Qnet(state_dim, hidden_dim, action_dim).to(device) 31 | self.Q_target.load_state_dict(self.Q.state_dict()) 32 | self.optimizer = torch.optim.Adam(self.Q.parameters(), 33 | lr=learning_rate) 34 | self.action_dim = action_dim 35 | self.gamma = gamma 36 | self.epsilon = epsilon 37 | self.device = device 38 | self.count = 1 39 | self.update_target = update_target 40 | 41 | def save_model(self, save_path='./', filename='model'): 42 | torch.save(self.Q.state_dict(), f"{save_path}\\{filename}.pth") 43 | 44 | def load_model(self, load_path): 45 | self.Q.load_state_dict(torch.load(load_path)) 46 | self.Q_target.load_state_dict(torch.load(load_path)) 47 | 48 | def take_action(self, state): 49 | state = torch.tensor([state], dtype=torch.float).to(self.device) 50 | Q_value = self.Q(state) 51 | if np.random.random() < self.epsilon: 52 | action = np.random.randint(self.action_dim) 53 | else: 54 | action = torch.argmax(Q_value).item() 55 | return action 56 | 57 | def update(self, states, actions, rewards, next_states, dones): 58 | states_cuda = torch.tensor(states, dtype=torch.float).to(self.device) 59 | actions_cuda = torch.tensor(actions, 60 | dtype=torch.float).view(-1, 61 | 1).to(self.device) 62 | rewards_cuda = torch.tensor(rewards, 63 | dtype=torch.float).to(self.device).view( 64 | -1, 1) 65 | next_states_cuda = torch.tensor(next_states, 66 | dtype=torch.float).to(self.device) 67 | dones_cuda = torch.tensor(dones, 68 | dtype=torch.float).to(self.device).view( 69 | -1, 1) 70 | q_now = self.Q(states_cuda).gather(1, actions_cuda.long()).view(-1, 1) 71 | 72 | actions_star = self.Q(next_states_cuda).max(1)[1].view(-1, 1) 73 | 74 | q_next = self.Q_target(next_states_cuda).gather(1, actions_star).view( 75 | -1, 1) 76 | 77 | y_now = (rewards_cuda + self.gamma * q_next * (1 - dones_cuda)).view( 78 | -1, 1) 79 | td_error = q_now - y_now 80 | 81 | loss = torch.mean(F.mse_loss(q_now, y_now)) 82 | self.optimizer.zero_grad() 83 | loss.backward() 84 | self.optimizer.step() 85 | 86 | if self.count % self.update_target == 0: 87 | self.Q_target.load_state_dict(self.Q.state_dict()) 88 | 89 | self.count += 1 90 | 91 | 92 | class ReplayBuffer: 93 | def __init__(self, capacity) -> None: 94 | self.buffer = collections.deque(maxlen=capacity) 95 | 96 | def add(self, state, action, reward, next_state, done): 97 | self.buffer.append([state, action, reward, next_state, done]) 98 | 99 | def sample(self, batch_size): 100 | transitions = random.sample(self.buffer, batch_size) 101 | states, actions, rewards, next_states, dones = zip(*transitions) 102 | return states, actions, rewards, next_states, dones 103 | 104 | def size(self): 105 | return len(self.buffer) 106 | 107 | 108 | if __name__ == "__main__": 109 | algorithm_name = "demo" 110 | gamma = 0.99 111 | 112 | num_episodes = 5000 113 | buffersize = 10000 114 | minmal_size = 500 115 | batch_size = 64 116 | epsilon = 0.01 117 | learning_rate = 2e-3 118 | device = torch.device('cuda') 119 | 120 | env_name = 'CartPole-v0' 121 | # 注册环境 122 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 123 | 124 | env = gym.make(env_name) 125 | random.seed(0) 126 | np.random.seed(0) 127 | env.seed(0) 128 | torch.manual_seed(0) 129 | replay_buffer = ReplayBuffer(buffersize) 130 | 131 | state_dim = env.observation_space.shape[0] 132 | hidden_dim = 128 133 | action_dim = env.action_space.n 134 | update_target = 100 135 | 
agent = DDQN(state_dim, hidden_dim, action_dim, learning_rate, gamma, 136 | epsilon, update_target, device) 137 | 138 | return_list = [] 139 | max_reward = 0 140 | for i in range(20): 141 | with tqdm(total=int(num_episodes / 10), 142 | desc='Iteration %d' % i) as pbar: 143 | for i_episodes in range(int(num_episodes / 10)): 144 | episode_return = 0 145 | state = env.reset() 146 | done = False 147 | while not done: 148 | action = agent.take_action(state) 149 | next_state, reward, done, _ = env.step(action) 150 | 151 | if i_episodes == int(num_episodes / 10) - 1: 152 | env.render() 153 | time.sleep(0.1) 154 | replay_buffer.add(state, action, reward, next_state, done) 155 | state = next_state 156 | episode_return += reward 157 | if replay_buffer.size() > minmal_size: 158 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample( 159 | batch_size) 160 | agent.update(b_s, b_a, b_r, b_ns, b_d) 161 | return_list.append(episode_return) 162 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 163 | if episode_return > max_reward: 164 | max_reward = episode_return 165 | agent.save_model(env_name, algorithm_name) 166 | if (i_episodes + 1 % 10 == 0): 167 | pbar.set_postfix({ 168 | 'episode': 169 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 170 | 'return': 171 | '%.3f' % np.mean(return_list[-10:]) 172 | }) 173 | pbar.update(1) 174 | -------------------------------------------------------------------------------- /dqn/ddrqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | from rl_utils import plot_smooth_reward 12 | 13 | 14 | class Qnet(torch.nn.Module): 15 | def __init__(self, 16 | obs_dim, 17 | hidden_dim, 18 | action_dim, 19 | device, 20 | num_layers=1, 21 | batch_first=True) -> None: 22 | super().__init__() 23 | self.action_dim = action_dim 24 | self.hidden_dim = hidden_dim 25 | self.device = device 26 | self.num_layers = num_layers 27 | self.batch_first = batch_first 28 | self.fc1 = torch.nn.Linear(obs_dim, hidden_dim) 29 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 30 | self.lstm = torch.nn.LSTM(hidden_dim, 31 | hidden_dim, 32 | self.num_layers, 33 | batch_first=True) 34 | 35 | def forward(self, state, max_seq=1, batch_size=1): 36 | h0 = torch.zeros(self.num_layers, state.size(0), 37 | self.hidden_dim).to(device) 38 | c0 = torch.zeros(self.num_layers, state.size(0), 39 | self.hidden_dim).to(device) 40 | 41 | state = F.relu(self.fc1(state)).reshape(-1, max_seq, self.hidden_dim) 42 | state, _ = self.lstm(state, (h0, c0)) 43 | action_value = self.fc2(state).view(-1, self.action_dim) 44 | return action_value 45 | 46 | 47 | class DDRQN: 48 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 49 | epsilon, update_target, device) -> None: 50 | self.Q = Qnet(state_dim, hidden_dim, action_dim, device).to(device) 51 | self.Q_target = Qnet(state_dim, hidden_dim, action_dim, 52 | device).to(device) 53 | self.Q_target.load_state_dict(self.Q.state_dict()) 54 | self.optimizer = torch.optim.Adam(self.Q.parameters(), 55 | lr=learning_rate) 56 | self.action_dim = action_dim 57 | print(action_dim) 58 | self.gamma = gamma 59 | self.epsilon = epsilon 60 | self.device = device 61 | self.count = 1 62 | self.update_target = update_target 63 | 64 | def save_model(self, save_path='./', filename='model'): 65 | 
torch.save(self.Q.state_dict(), f"{save_path}\\{filename}.pth") 66 | 67 | def load_model(self, load_path): 68 | self.Q.load_state_dict(torch.load(load_path)) 69 | self.Q_target.load_state_dict(torch.load(load_path)) 70 | 71 | def take_action(self, state): 72 | state = torch.tensor([state], dtype=torch.float).to(self.device) 73 | Q_value = self.Q(state) 74 | if np.random.random() < self.epsilon: 75 | action = np.random.randint(self.action_dim) 76 | else: 77 | action = torch.argmax(Q_value).item() 78 | return action 79 | 80 | def update(self, 81 | states, 82 | actions, 83 | rewards, 84 | next_states, 85 | dones, 86 | max_seq=1, 87 | batch_size=1): 88 | states = torch.tensor(states, dtype=torch.float).to(self.device) 89 | actions = torch.tensor(actions, 90 | dtype=torch.float).view(-1, 1).to(self.device) 91 | rewards = torch.tensor(rewards, 92 | dtype=torch.float).to(self.device).view(-1, 1) 93 | next_states = torch.tensor(next_states, 94 | dtype=torch.float).to(self.device) 95 | dones = torch.tensor(dones, 96 | dtype=torch.float).to(self.device).view(-1, 1) 97 | q_now = self.Q.forward(states, max_seq, 98 | batch_size).gather(1, 99 | actions.long()).view(-1, 1) 100 | 101 | actions_star = self.Q.forward(next_states, max_seq, 102 | batch_size).max(1)[1].view(-1, 1) 103 | 104 | q_next = self.Q_target.forward(next_states, 105 | max_seq, batch_size).gather( 106 | 1, actions_star).view(-1, 1) 107 | 108 | y_now = (rewards + self.gamma * q_next * (1 - dones)).view(-1, 1) 109 | td_error = q_now - y_now 110 | 111 | loss = torch.mean(F.mse_loss(q_now, y_now)) 112 | self.optimizer.zero_grad() 113 | loss.backward() 114 | self.optimizer.step() 115 | 116 | if self.count % self.update_target == 0: 117 | self.Q_target.load_state_dict(self.Q.state_dict()) 118 | 119 | self.count += 1 120 | 121 | 122 | class Recurrent_Memory_ReplayBuffer: 123 | def __init__(self, capacity, max_seq) -> None: 124 | self.max_seq = max_seq 125 | self.buffer = collections.deque(maxlen=capacity) 126 | 127 | def add(self, state, action, reward, next_state, done): 128 | self.buffer.append([state, action, reward, next_state, done]) 129 | 130 | # def sample(self, batch_size): 131 | # transitions = random.sample(self.buffer, batch_size) 132 | # states, actions, rewards, next_states, dones = zip(*transitions) 133 | # return states, actions, rewards, next_states, dones 134 | 135 | def sample(self, batch_size): 136 | # sample episodic memory 137 | states, actions, rewards, next_states, dones = [], [], [], [], [] 138 | for _ in range(batch_size): 139 | finish = random.randint(self.max_seq, self.size() - 1) 140 | begin = finish - self.max_seq 141 | data = [] 142 | for idx in range(begin, finish): 143 | data.append(self.buffer[idx]) 144 | state, action, reward, next_state, done = zip(*data) 145 | states.append(np.vstack(state)) 146 | actions.append(action) 147 | rewards.append(reward) 148 | next_states.append(np.vstack(next_state)) 149 | dones.append(done) 150 | 151 | states = np.array(states) 152 | actions = np.array(actions) 153 | rewards = np.array(rewards) 154 | next_states = np.array(next_states) 155 | dones = np.array(dones) 156 | 157 | return states, actions, rewards, next_states, dones 158 | 159 | def size(self): 160 | return len(self.buffer) 161 | 162 | 163 | if __name__ == "__main__": 164 | algorithm_name = "DDRQN1" 165 | gamma = 0.99 166 | 167 | num_episodes = 5000 168 | buffersize = 10000 169 | minmal_size = 500 170 | batch_size = 64 171 | epsilon = 0.01 172 | learning_rate = 2e-3 173 | max_seq = 1 174 | device = torch.device('cuda') 
175 | 176 | env_name = 'Snake-v0' 177 | # 注册环境 178 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 179 | 180 | env = gym.make(env_name) 181 | random.seed(0) 182 | np.random.seed(0) 183 | env.seed(0) 184 | torch.manual_seed(0) 185 | replay_buffer = Recurrent_Memory_ReplayBuffer(buffersize, max_seq) 186 | 187 | state_dim = env.observation_space.shape[0] 188 | hidden_dim = 128 189 | action_dim = env.action_space.n 190 | update_target = 50 191 | agent = DDRQN(state_dim, hidden_dim, action_dim, learning_rate, gamma, 192 | epsilon, update_target, device) 193 | 194 | return_list = [] 195 | max_reward = 0 196 | for i in range(20): 197 | with tqdm(total=int(num_episodes / 10), 198 | desc='Iteration %d' % i) as pbar: 199 | for i_episodes in range(int(num_episodes / 10)): 200 | episode_return = 0 201 | state = env.reset() 202 | done = False 203 | while not done: 204 | action = agent.take_action(state) 205 | next_state, reward, done, _ = env.step(action) 206 | 207 | if i_episodes == int(num_episodes / 10) - 1: 208 | env.render() 209 | time.sleep(0.1) 210 | replay_buffer.add(state, action, reward, next_state, done) 211 | state = next_state 212 | episode_return += reward 213 | if replay_buffer.size() > minmal_size: 214 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample( 215 | batch_size) 216 | agent.update(b_s, b_a, b_r, b_ns, b_d, max_seq, 217 | batch_size) 218 | return_list.append(episode_return) 219 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 220 | if episode_return > max_reward: 221 | max_reward = episode_return 222 | agent.save_model(env_name, algorithm_name) 223 | if (i_episodes + 1 % 10 == 0): 224 | pbar.set_postfix({ 225 | 'episode': 226 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 227 | 'return': 228 | '%.3f' % np.mean(return_list[-10:]) 229 | }) 230 | pbar.update(1) 231 | -------------------------------------------------------------------------------- /dqn/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | from ddqn import plot_smooth_reward 12 | 13 | 14 | class Qnet(torch.nn.Module): 15 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 16 | super().__init__() 17 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 18 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 19 | 20 | def forward(self, state): 21 | hidden_state = F.relu(self.fc1(state)) 22 | acion_value = self.fc2(hidden_state) 23 | return acion_value 24 | 25 | 26 | class DQN: 27 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 28 | epsilon, device) -> None: 29 | self.Q = Qnet(state_dim, hidden_dim, action_dim).to(device) 30 | self.optimizer = torch.optim.Adam(self.Q.parameters(), 31 | lr=learning_rate) 32 | self.action_dim = action_dim 33 | self.gamma = gamma 34 | self.epsilon = epsilon 35 | self.device = device 36 | self.count = 1 37 | pass 38 | 39 | def save_model(self, save_path='./', filename='model'): 40 | torch.save(self.Q.state_dict(), f"{save_path}\\{filename}.pth") 41 | 42 | def load_model(self, load_path): 43 | self.Q.load_state_dict(torch.load(load_path)) 44 | 45 | def take_action(self, state): 46 | 47 | Q_value = self.Q(state) 48 | if np.random.random() < self.epsilon: 49 | action = np.random.randint(self.action_dim) 50 | else: 51 | action = 
torch.argmax(Q_value).item() 52 | return action 53 | 54 | def update(self, states, actions, rewards, next_states, dones): 55 | states_cuda = torch.tensor(states, dtype=torch.float).to(self.device) 56 | actions_cuda = torch.tensor(actions, 57 | dtype=torch.float).view(-1, 58 | 1).to(self.device) 59 | rewards_cuda = torch.tensor(rewards, 60 | dtype=torch.float).to(self.device).view( 61 | -1, 1) 62 | next_states_cuda = torch.tensor(next_states, 63 | dtype=torch.float).to(self.device) 64 | dones_cuda = torch.tensor(dones, 65 | dtype=torch.float).to(self.device).view( 66 | -1, 1) 67 | q_now = self.Q(states_cuda).gather(1, actions_cuda.long()).view(-1, 1) 68 | 69 | q_next = self.Q(next_states_cuda).max(1)[0].view(-1, 1) 70 | 71 | y_now = (rewards_cuda + self.gamma * q_next * (1 - dones_cuda)).view( 72 | -1, 1) 73 | td_error = q_now - y_now 74 | 75 | loss = torch.mean(F.mse_loss(q_now, y_now)) 76 | self.optimizer.zero_grad() 77 | loss.backward() 78 | self.optimizer.step() 79 | self.count += 1 80 | if self.count == 100: 81 | self.epsilon *= 0.9 82 | 83 | 84 | class ReplayBuffer: 85 | def __init__(self, capacity) -> None: 86 | self.buffer = collections.deque(maxlen=capacity) 87 | 88 | def add(self, state, action, reward, next_state, done): 89 | self.buffer.append([state, action, reward, next_state, done]) 90 | 91 | def sample(self, batch_size): 92 | transitions = random.sample(self.buffer, batch_size) 93 | states, actions, rewards, next_states, dones = zip(*transitions) 94 | return states, actions, rewards, next_states, dones 95 | 96 | def size(self): 97 | return len(self.buffer) 98 | 99 | 100 | gamma = 0.99 101 | 102 | num_episodes = 10000 103 | buffersize = 10000 104 | minmal_size = 500 105 | batch_size = 64 106 | epsilon = 0.01 107 | learning_rate = 2e-3 108 | device = torch.device('cuda') 109 | algorithm_name = 'DQN' 110 | env_name = 'Snake-v0' 111 | # 注册环境 112 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 113 | 114 | env = gym.make(env_name) 115 | random.seed(0) 116 | np.random.seed(0) 117 | env.seed(0) 118 | torch.manual_seed(0) 119 | replay_buffer = ReplayBuffer(buffersize) 120 | 121 | state_dim = env.observation_space.shape[0] 122 | hidden_dim = 128 123 | action_dim = env.action_space.n 124 | 125 | agent = DQN(state_dim, hidden_dim, action_dim, learning_rate, gamma, epsilon, 126 | device) 127 | 128 | return_list = [] 129 | max_reward = 0 130 | for i in range(10): 131 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 132 | for i_episodes in range(int(num_episodes / 10)): 133 | episode_return = 0 134 | state = env.reset() 135 | done = False 136 | while not done: 137 | action = agent.take_action( 138 | torch.tensor(state, dtype=torch.float).to(device)) 139 | next_state, reward, done, _ = env.step(action) 140 | 141 | if i_episodes == int(num_episodes / 10) - 1: 142 | env.render() 143 | time.sleep(0.1) 144 | replay_buffer.add(state, action, reward, next_state, done) 145 | state = next_state 146 | episode_return += reward 147 | if replay_buffer.size() > minmal_size: 148 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size) 149 | 150 | agent.update(b_s, b_a, b_r, b_ns, b_d) 151 | return_list.append(episode_return) 152 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 153 | if episode_return > max_reward: 154 | max_reward = episode_return 155 | agent.save_model(env_name, algorithm_name) 156 | if (i_episodes + 1 % 10 == 0): 157 | pbar.set_postfix({ 158 | 'episode': 159 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 160 | 
'return': 161 | '%.3f' % np.mean(return_list[-10:]) 162 | }) 163 | pbar.update(1) 164 | -------------------------------------------------------------------------------- /dqn/drqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | from ddqn import plot_smooth_reward 12 | 13 | 14 | class Qnet(torch.nn.Module): 15 | def __init__(self, 16 | obs_dim, 17 | hidden_dim, 18 | action_dim, 19 | device, 20 | num_layers=1, 21 | batch_first=True) -> None: 22 | super().__init__() 23 | self.action_dim = action_dim 24 | self.hidden_dim = hidden_dim 25 | self.device = device 26 | self.num_layers = num_layers 27 | self.batch_first = batch_first 28 | self.fc1 = torch.nn.Linear(obs_dim, hidden_dim) 29 | self.fc2 = torch.nn.Linear(hidden_dim, action_dim) 30 | self.lstm = torch.nn.LSTM(hidden_dim, 31 | hidden_dim, 32 | self.num_layers, 33 | batch_first=True) 34 | 35 | def forward(self, state, max_seq=1, batch_size=1): 36 | h0 = torch.zeros(self.num_layers, state.size(0), 37 | self.hidden_dim).to(device) 38 | c0 = torch.zeros(self.num_layers, state.size(0), 39 | self.hidden_dim).to(device) 40 | 41 | state = F.relu(self.fc1(state)).reshape(-1, max_seq, self.hidden_dim) 42 | state, _ = self.lstm(state, (h0, c0)) 43 | action_value = self.fc2(state).view(-1, self.action_dim) 44 | return action_value 45 | 46 | 47 | class DRQN: 48 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 49 | epsilon, device) -> None: 50 | self.Q = Qnet(state_dim, hidden_dim, action_dim, device).to(device) 51 | self.optimizer = torch.optim.Adam(self.Q.parameters(), 52 | lr=learning_rate) 53 | self.action_dim = action_dim 54 | self.gamma = gamma 55 | self.epsilon = epsilon 56 | self.device = device 57 | self.count = 1 58 | pass 59 | 60 | def save_model(self, save_path='./', filename='model'): 61 | torch.save(self.Q.state_dict(), f"{save_path}\\{filename}.pth") 62 | 63 | def load_model(self, load_path): 64 | self.Q.load_state_dict(torch.load(load_path)) 65 | 66 | def take_action(self, state): 67 | state = torch.tensor([state], dtype=torch.float).to(self.device) 68 | Q_value = self.Q(state) 69 | if np.random.random() < self.epsilon: 70 | action = np.random.randint(self.action_dim) 71 | else: 72 | action = torch.argmax(Q_value).item() 73 | return action 74 | 75 | def update(self, 76 | states, 77 | actions, 78 | rewards, 79 | next_states, 80 | dones, 81 | max_seq=1, 82 | batch_size=1): 83 | states = torch.tensor(states, dtype=torch.float).to(self.device) 84 | actions = torch.tensor(actions, 85 | dtype=torch.float).view(-1, 1).to(self.device) 86 | rewards = torch.tensor(rewards, 87 | dtype=torch.float).to(self.device).view(-1, 1) 88 | next_states = torch.tensor(next_states, 89 | dtype=torch.float).to(self.device) 90 | dones = torch.tensor(dones, 91 | dtype=torch.float).to(self.device).view(-1, 1) 92 | q_now = self.Q.forward(states, max_seq, 93 | batch_size).gather(1, 94 | actions.long()).view(-1, 1) 95 | 96 | q_next = self.Q.forward(next_states, max_seq, 97 | batch_size).max(1)[0].view(-1, 1) 98 | 99 | y_now = (rewards + self.gamma * q_next * (1 - dones)).view(-1, 1) 100 | td_error = q_now - y_now 101 | 102 | loss = torch.mean(F.mse_loss(q_now, y_now)) 103 | self.optimizer.zero_grad() 104 | loss.backward() 105 | self.optimizer.step() 106 | self.count += 1 107 
| if self.count == 100: 108 | self.epsilon *= 0.9 109 | 110 | 111 | class Recurrent_Memory_ReplayBuffer: 112 | def __init__(self, capacity) -> None: 113 | self.buffer = collections.deque(maxlen=capacity) 114 | 115 | def add(self, state, action, reward, next_state, done): 116 | self.buffer.append([state, action, reward, next_state, done]) 117 | 118 | # def sample(self, batch_size): 119 | # transitions = random.sample(self.buffer, batch_size) 120 | # states, actions, rewards, next_states, dones = zip(*transitions) 121 | # return states, actions, rewards, next_states, dones 122 | 123 | def sample(self, batch_size, max_seq): 124 | # sample episodic memory 125 | states, actions, rewards, next_states, dones = [], [], [], [], [] 126 | for _ in range(batch_size): 127 | finish = random.randint(max_seq, self.size() - 1) 128 | begin = finish - max_seq 129 | data = [] 130 | for idx in range(begin, finish): 131 | data.append(self.buffer[idx]) 132 | state, action, reward, next_state, done = zip(*data) 133 | states.append(np.vstack(state)) 134 | actions.append(action) 135 | rewards.append(reward) 136 | next_states.append(np.vstack(next_state)) 137 | dones.append(done) 138 | 139 | states = np.array(states) 140 | actions = np.array(actions) 141 | rewards = np.array(rewards) 142 | next_states = np.array(next_states) 143 | dones = np.array(dones) 144 | 145 | return states, actions, rewards, next_states, dones 146 | 147 | def size(self): 148 | return len(self.buffer) 149 | 150 | 151 | gamma = 0.99 152 | num_episodes = 10000 153 | buffersize = 10000 154 | minmal_size = 500 155 | batch_size = 64 156 | epsilon = 0.01 157 | learning_rate = 2e-3 158 | max_seq = 1 159 | device = torch.device('cuda') 160 | algorithm_name = 'DRQN' 161 | env_name = 'Snake-v0' 162 | # 注册环境 163 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 164 | 165 | env = gym.make(env_name) 166 | random.seed(0) 167 | np.random.seed(0) 168 | env.seed(0) 169 | torch.manual_seed(0) 170 | replay_buffer = Recurrent_Memory_ReplayBuffer(buffersize) 171 | 172 | state_dim = env.observation_space.shape[0] 173 | hidden_dim = 128 174 | action_dim = env.action_space.n 175 | 176 | agent = DRQN(state_dim, hidden_dim, action_dim, learning_rate, gamma, epsilon, 177 | device) 178 | 179 | return_list = [] 180 | max_reward = 0 181 | for i in range(10): 182 | with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar: 183 | for i_episodes in range(int(num_episodes / 10)): 184 | episode_return = 0 185 | state = env.reset() 186 | done = False 187 | while not done: 188 | action = agent.take_action(state) 189 | next_state, reward, done, _ = env.step(action) 190 | 191 | if i_episodes == int(num_episodes / 10) - 1: 192 | env.render() 193 | time.sleep(0.1) 194 | replay_buffer.add(state, action, reward, next_state, done) 195 | state = next_state 196 | episode_return += reward 197 | if replay_buffer.size() > minmal_size: 198 | b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample( 199 | batch_size, max_seq) 200 | 201 | agent.update(b_s, b_a, b_r, b_ns, b_d, max_seq, batch_size) 202 | return_list.append(episode_return) 203 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 204 | if episode_return > max_reward: 205 | max_reward = episode_return 206 | agent.save_model(env_name, algorithm_name) 207 | if (i_episodes + 1 % 10 == 0): 208 | pbar.set_postfix({ 209 | 'episode': 210 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 211 | 'return': 212 | '%.3f' % np.mean(return_list[-10:]) 213 | }) 214 | pbar.update(1) 215 | 
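The recurrent variants above (drqn.py and ddrqn.py) add an LSTM to Qnet and draw contiguous windows of max_seq consecutive transitions from Recurrent_Memory_ReplayBuffer, stacked so the LSTM receives sequences of shape (batch_size, max_seq, obs_dim). Both scripts set max_seq = 1, so the LSTM only ever sees length-1 sequences and carries no real history; raising max_seq is what actually gives the recurrent layer context. The following self-contained sketch reimplements only that sampling step (the repo's class is not imported here, since drqn.py runs its training loop at import time):

```python
import collections
import random

import numpy as np

# Stand-alone sketch of the windowed sampling in Recurrent_Memory_ReplayBuffer.sample().
buffer = collections.deque(maxlen=1000)
obs_dim, max_seq, batch_size = 6, 4, 8

# Fill the buffer with dummy (state, action, reward, next_state, done) transitions.
for t in range(200):
    buffer.append((np.random.rand(obs_dim), t % 4, 0.0,
                   np.random.rand(obs_dim), False))

batch_states = []
for _ in range(batch_size):
    finish = random.randint(max_seq, len(buffer) - 1)     # same indexing as drqn.py
    window = [buffer[i] for i in range(finish - max_seq, finish)]
    states, actions, rewards, next_states, dones = zip(*window)
    batch_states.append(np.vstack(states))                # (max_seq, obs_dim)

batch_states = np.array(batch_states)
print(batch_states.shape)  # (8, 4, 6); with max_seq = 1 this collapses to (8, 1, 6)
```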
-------------------------------------------------------------------------------- /pg/reinforce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | class ActorNet(nn.Module): 16 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 17 | super().__init__() 18 | self.fc1 = nn.Linear(state_dim, hidden_dim) 19 | self.fc2 = nn.Linear(hidden_dim, action_dim) 20 | 21 | def forward(self, state): 22 | 23 | hidden_state = F.relu(self.fc1(state)) 24 | # print(self.fc2(hidden_state)) 25 | probs = F.softmax(self.fc2(hidden_state), dim=1) 26 | return probs 27 | 28 | 29 | class Reinforce: 30 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 31 | device) -> None: 32 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 33 | self.gamma = gamma 34 | self.optimizer = torch.optim.Adam(self.actor.parameters(), 35 | lr=learning_rate) 36 | self.device = device 37 | 38 | def save_model(self, save_path='./', filename='model'): 39 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 40 | 41 | def load_model(self, load_path): 42 | self.actor.load_state_dict(torch.load(load_path)) 43 | 44 | def take_action(self, state): 45 | state = torch.tensor([state], dtype=torch.float).to(self.device) 46 | probs = self.actor(state) 47 | action_dist = torch.distributions.Categorical(probs) 48 | action = action_dist.sample() 49 | return action.item() 50 | 51 | def update(self, transition_dict): 52 | reward_list = transition_dict['rewards'] 53 | state_list = transition_dict['states'] 54 | action_list = transition_dict['actions'] 55 | G = 0 56 | self.optimizer.zero_grad() 57 | for i in reversed(range(len(reward_list))): 58 | reward = reward_list[i] 59 | state = torch.tensor([state_list[i]], 60 | dtype=torch.float).to(self.device) 61 | action = torch.tensor([action_list[i]]).view(-1, 1).to(self.device) 62 | log_prob = torch.log(self.actor(state).gather(1, action)) 63 | G = self.gamma * G + reward 64 | loss = torch.mean(-log_prob * G) 65 | loss.backward() 66 | self.optimizer.step() 67 | 68 | 69 | if __name__ == "__main__": 70 | algorithm_name = "REINFORCE" 71 | gamma = 0.98 72 | 73 | num_episodes = 5000 74 | learning_rate = 2e-3 75 | device = torch.device('cuda') 76 | 77 | env_name = 'Snake-v0' #'CartPole-v0' 78 | # 注册环境 79 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 80 | 81 | env = gym.make(env_name) 82 | 83 | env.seed(0) 84 | torch.manual_seed(0) 85 | 86 | state_dim = env.observation_space.shape[0] 87 | hidden_dim = 128 88 | action_dim = env.action_space.n 89 | update_target = 100 90 | agent = Reinforce(state_dim, hidden_dim, action_dim, learning_rate, gamma, 91 | device) 92 | 93 | return_list = [] 94 | max_reward = 0 95 | for i in range(20): 96 | with tqdm(total=int(num_episodes / 10), 97 | desc='Iteration %d' % i) as pbar: 98 | for i_episodes in range(int(num_episodes / 10)): 99 | episode_return = 0 100 | state = env.reset() 101 | done = False 102 | transition_dict = { 103 | 'states': [], 104 | 'actions': [], 105 | 'next_states': [], 106 | 'rewards': [], 107 | 'dones': [] 108 | } 109 | while not done: 110 | action = agent.take_action(state) 111 | next_state, reward, done, _ = env.step(action) 112 | 
env.render() 113 | transition_dict['states'].append(state) 114 | transition_dict['actions'].append(action) 115 | transition_dict['next_states'].append(next_state) 116 | transition_dict['rewards'].append(reward) 117 | transition_dict['dones'].append(done) 118 | state = next_state 119 | episode_return += reward 120 | if i_episodes == int(num_episodes / 10) - 1: 121 | time.sleep(0.1) 122 | agent.update(transition_dict) 123 | return_list.append(episode_return) 124 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 125 | if episode_return > max_reward: 126 | max_reward = episode_return 127 | agent.save_model(env_name, algorithm_name) 128 | if (i_episodes + 1 % 10 == 0): 129 | pbar.set_postfix({ 130 | 'episode': 131 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 132 | 'return': 133 | '%.3f' % np.mean(return_list[-10:]) 134 | }) 135 | pbar.update(1) 136 | -------------------------------------------------------------------------------- /pg/reinforce_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | class ValueNet(torch.nn.Module): 16 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 17 | super().__init__() 18 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 19 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 20 | 21 | def forward(self, state): 22 | hidden_state = F.relu(self.fc1(state)) 23 | value = self.fc2(hidden_state) 24 | return value 25 | 26 | 27 | class ActorNet(nn.Module): 28 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 29 | super().__init__() 30 | self.fc1 = nn.Linear(state_dim, hidden_dim) 31 | self.fc2 = nn.Linear(hidden_dim, action_dim) 32 | 33 | def forward(self, state): 34 | 35 | hidden_state = F.relu(self.fc1(state)) 36 | # print(self.fc2(hidden_state)) 37 | probs = F.softmax(self.fc2(hidden_state), dim=1) 38 | return probs 39 | 40 | 41 | class REINFORCE_BASELINE: 42 | def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, 43 | device) -> None: 44 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 45 | self.vnet = ValueNet(state_dim, hidden_dim, action_dim).to(device) 46 | self.gamma = gamma 47 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 48 | lr=learning_rate) 49 | self.vnet_optimizer = torch.optim.Adam(self.vnet.parameters(), 50 | lr=learning_rate) 51 | self.device = device 52 | 53 | def save_model(self, save_path='./', filename='model'): 54 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 55 | 56 | def load_model(self, load_path): 57 | self.actor.load_state_dict(torch.load(load_path)) 58 | 59 | def take_action(self, state): 60 | state = torch.tensor([state], dtype=torch.float).to(self.device) 61 | probs = self.actor(state) 62 | action_dist = torch.distributions.Categorical(probs) 63 | action = action_dist.sample() 64 | return action.item() 65 | 66 | def update(self, transition_dict): 67 | reward_list = transition_dict['rewards'] 68 | state_list = transition_dict['states'] 69 | action_list = transition_dict['actions'] 70 | G = 0 71 | SG = 0 72 | self.actor_optimizer.zero_grad() 73 | self.vnet_optimizer.zero_grad() 74 | for i in reversed(range(len(reward_list))): 75 | reward = 
torch.tensor([reward_list[i]], 76 | dtype=torch.float).view(-1, 77 | 1).to(self.device) 78 | state = torch.tensor([state_list[i]], 79 | dtype=torch.float).to(self.device) 80 | action = torch.tensor([action_list[i]]).view(-1, 1).to(self.device) 81 | value = self.vnet(state).view(-1, 1).to(self.device) 82 | log_prob = torch.log(self.actor(state).gather(1, action)) 83 | G = self.gamma * G + reward 84 | 85 | td_delta = G - value 86 | 87 | actor_loss = -log_prob * td_delta.detach() 88 | actor_loss.backward() 89 | 90 | vnet_loss = F.mse_loss(value, G) 91 | vnet_loss.backward() 92 | 93 | self.actor_optimizer.step() 94 | self.vnet_optimizer.step() 95 | 96 | 97 | if __name__ == "__main__": 98 | 99 | gamma = 0.99 100 | algorithm_name = "REINFORCE_baseline" 101 | num_episodes = 10000 102 | learning_rate = 2e-3 103 | device = torch.device('cuda') 104 | 105 | env_name = 'CartPole-v0' #'CartPole-v0' 106 | # 注册环境 107 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 108 | 109 | env = gym.make(env_name) 110 | 111 | env.seed(0) 112 | torch.manual_seed(0) 113 | 114 | state_dim = env.observation_space.shape[0] 115 | hidden_dim = 128 116 | action_dim = env.action_space.n 117 | update_target = 100 118 | agent = REINFORCE_BASELINE(state_dim, hidden_dim, action_dim, 119 | learning_rate, gamma, device) 120 | 121 | return_list = [] 122 | max_reward = 0 123 | for i in range(10): 124 | with tqdm(total=int(num_episodes / 10), 125 | desc='Iteration %d' % i) as pbar: 126 | for i_episodes in range(int(num_episodes / 10)): 127 | episode_return = 0 128 | state = env.reset() 129 | done = False 130 | transition_dict = { 131 | 'states': [], 132 | 'actions': [], 133 | 'next_states': [], 134 | 'rewards': [], 135 | 'dones': [] 136 | } 137 | while not done: 138 | action = agent.take_action(state) 139 | next_state, reward, done, _ = env.step(action) 140 | 141 | transition_dict['states'].append(state) 142 | transition_dict['actions'].append(action) 143 | transition_dict['next_states'].append(next_state) 144 | transition_dict['rewards'].append(reward) 145 | transition_dict['dones'].append(done) 146 | state = next_state 147 | episode_return += reward 148 | if i_episodes == int(num_episodes / 10) - 1: 149 | env.render() 150 | time.sleep(0.1) 151 | agent.update(transition_dict) 152 | return_list.append(episode_return) 153 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 154 | if episode_return > max_reward: 155 | max_reward = episode_return 156 | agent.save_model(env_name, algorithm_name) 157 | if (i_episodes + 1 % 10 == 0): 158 | pbar.set_postfix({ 159 | 'episode': 160 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 161 | 'return': 162 | '%.3f' % np.mean(return_list[-10:]) 163 | }) 164 | pbar.update(1) 165 | -------------------------------------------------------------------------------- /ppo/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /ppo/.idea/RL_algorithm.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ppo/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /ppo/.idea/misc.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /ppo/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_22_54_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_22_54_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_01_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_01_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_05_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_05_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_16_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_16_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_17_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_17_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_26_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_26_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_32_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_32_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_35_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_35_BipedalWalker-v3.pth 
-------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_39_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_39_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_40_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_40_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_30_23_50_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_30_23_50_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_31_00_09_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_31_00_09_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_31_00_11_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_31_00_11_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_01_31_00_15_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_01_31_00_15_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_10_58_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_10_58_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_10_59_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_10_59_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_00_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_00_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_01_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_01_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_02_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_02_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_03_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_03_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_04_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_04_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_06_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_06_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_07_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_07_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_11_08_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_11_08_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_41_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_41_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_48_BipedalWalker-v3.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_48_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_49_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_49_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_50_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_50_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_51_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_51_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_53_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_53_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_54_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_54_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_15_59_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_15_59_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_23_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_23_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_25_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_25_BipedalWalker-v3.pth -------------------------------------------------------------------------------- 
/ppo/BipedalWalker-v3/2024_02_01_16_26_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_26_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_27_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_27_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_28_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_28_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_29_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_29_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_30_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_30_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_31_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_31_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_32_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_32_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_33_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_33_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_34_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_34_BipedalWalker-v3.pth 
-------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_35_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_35_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_36_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_36_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_16_42_BipedalWalker-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_16_42_BipedalWalker-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_17_53_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_17_53_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_17_55_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_17_55_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_17_56_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_17_56_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_17_58_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_17_58_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_17_59_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_17_59_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_00_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_00_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_02_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_02_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_05_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_05_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_06_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_06_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_07_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_07_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_08_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_08_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_09_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_09_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_12_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_12_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_13_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_13_BipedalWalkerHardcore-v3.pth 
-------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_24_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_24_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_41_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_41_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_45_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_45_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_51_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_51_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_18_56_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_18_56_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_19_11_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_19_11_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_19_19_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_19_19_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_19_26_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_19_26_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_19_44_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_19_44_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_19_52_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_19_52_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_19_56_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_19_56_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_20_12_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_20_12_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_20_16_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_20_16_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_20_18_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_20_18_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_20_22_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_20_22_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_21_10_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_21_10_BipedalWalkerHardcore-v3.pth -------------------------------------------------------------------------------- /ppo/BipedalWalker-v3/2024_02_01_21_14_BipedalWalkerHardcore-v3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_21_14_BipedalWalkerHardcore-v3.pth 
--------------------------------------------------------------------------------
/ppo/BipedalWalker-v3/2024_02_01_21_25_BipedalWalkerHardcore-v3.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_21_25_BipedalWalkerHardcore-v3.pth
--------------------------------------------------------------------------------
/ppo/BipedalWalker-v3/2024_02_01_21_30_BipedalWalkerHardcore-v3.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_21_30_BipedalWalkerHardcore-v3.pth
--------------------------------------------------------------------------------
/ppo/BipedalWalker-v3/2024_02_01_21_54_BipedalWalkerHardcore-v3.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/BipedalWalker-v3/2024_02_01_21_54_BipedalWalkerHardcore-v3.pth
--------------------------------------------------------------------------------
/ppo/BipedwalkerHardcoreTest.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | 
4 | 
5 | if __name__ == "__main__":
6 |     env = gym.make("BipedalWalkerHardcore-v3")
7 |     state = env.reset()  # initialize the environment state
8 |     total_reward = 0
9 |     step_count = 0
10 |     done = False
11 |     from ppo_multienv import PPOAgent
12 |     from ppo_multienv import parse_args
13 |     import torch
14 |     import time
15 | 
16 |     args = parse_args()
17 |     agent = PPOAgent(env.observation_space, env.action_space, args).to(args.device)
18 |     agent = torch.load("BipedalWalker-v3/2024_02_01_21_54_BipedalWalkerHardcore-v3.pth")  # load the saved agent (overwrites the freshly constructed one)
19 |     next_obs = env.reset()  # initialize the state
20 |     while not done:
21 |         next_obs = torch.tensor(next_obs).unsqueeze(0).to(args.device)
22 |         with torch.no_grad():
23 |             action, logprob, entropy, value = agent.get_action_and_value(next_obs)
24 |         next_obs, reward, done, info = env.step(action.flatten().cpu().numpy())
25 |         if done:
26 |             next_obs = env.reset()
27 |         env.render()
28 |         # action = 1#env.action_space.sample() # sample a random action (for demonstration only)
29 |         # state, reward, done, info = env.step(action) # take the action
30 |         total_reward += reward
31 |         step_count += 1
32 |         print('At step {}, reward = {}, done = {}'.format(step_count, reward, done))
33 |     print('Total reward: {}'.format(total_reward))
--------------------------------------------------------------------------------
/ppo/MultiFollow/2024_02_24_20_31_Follow.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/MultiFollow/2024_02_24_20_31_Follow.pth
--------------------------------------------------------------------------------
/ppo/MultiFollow/2024_02_24_20_32.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/MultiFollow/2024_02_24_20_32.gif
--------------------------------------------------------------------------------
/ppo/custom_env/MultiAgentFollowEnvV2.py:
-------------------------------------------------------------------------------- 1 | import datetime 2 | import io 3 | import gym 4 | from gym import spaces 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from matplotlib.animation import FuncAnimation, ArtistAnimation 8 | import matplotlib.animation as animation 9 | 10 | import imageio 11 | 12 | class MultiAgentFollowEnv(gym.Env): 13 | def __init__(self, target_info): 14 | np.random.seed(0) 15 | 16 | self.num_agents = target_info["num_agents"] 17 | 18 | self.episode_length = 0 19 | self.episode_rewards = [0]*self.num_agents 20 | self.episode = 0 21 | 22 | 23 | # 团队奖励和个人奖励的加权 24 | self.tau=0.9 25 | 26 | # 目标状态 27 | self.target_distances=[] 28 | self.target_angles=[] 29 | 30 | for i in range(self.num_agents): 31 | self.target_distances.append(target_info[str(i)][0]) 32 | self.target_angles.append(target_info[str(i)][1]) 33 | 34 | self.target_distances=np.array(self.target_distances) 35 | self.target_angles=np.array(self.target_angles) 36 | 37 | # 定义状态空间:每个智能体包括目标位置、智能体位置、速度和方向 38 | pi = np.pi 39 | inf=np.inf 40 | self.id=[0]*self.num_agents 41 | self.id_=[1]*self.num_agents 42 | self.low = np.array(self.id+[-100.0, -100.0, -100.0,-100.0, 0.0, 0]) 43 | self.high = np.array(self.id_+[100.0, 100.0,100.0,100.0,1.0, 2*pi]) 44 | self.observation_space = spaces.Box(self.low, self.high,(self.num_agents+2+2+2,), dtype=float) 45 | 46 | # 定义动作空间:每个智能体包括速度和方向两个维度,多维离散 47 | self.action_space=gym.spaces.MultiDiscrete([3,3]) 48 | # self.action_space = spaces.Tuple([ 49 | # spaces.Discrete(3), # 速度:加速、减速、保持 50 | # spaces.Discrete(3) # 方向:向左、向右、保持 51 | # ]) 52 | 53 | 54 | # 定义其他环境参数 55 | self.max_steps = 150 56 | self.current_step = 0 57 | 58 | # 到达次数记录以及时间记录 59 | self.arrive_count=0 60 | self.arrive_time=0 61 | 62 | self.fig, self.ax = plt.subplots() 63 | 64 | # 帧列表 65 | self.frames=[] 66 | 67 | # 设置矩形边界限制 68 | self.x_min, self.x_max = 0,3 69 | self.y_min, self.y_max = 0,3 70 | self.main_speed = 0.05 71 | self.move_directions = [np.array([1, 0]), np.array([0, 1]), np.array([-1, 0]), np.array([0, -1])] 72 | self.current_direction_index = 0 73 | 74 | # 重置环境状态,随机设置目标位置和每个智能体初始位置、速度、方向 75 | self.main_position = np.random.uniform(self.x_min, self.x_max, size=(2,)) 76 | 77 | # 随机设置每个智能体初始位置,确保距离目标一定距离 78 | self.agent_positions = np.random.uniform(self.x_min, self.x_max, size=(self.num_agents,2)) 79 | 80 | # 定义每个实体的速度和动作 81 | self.speeds = np.zeros(self.num_agents) 82 | self.directions = np.zeros(self.num_agents) 83 | 84 | 85 | def update_main_position(self): 86 | # 沿着当前方向移动 87 | displacement = self.main_speed * self.move_directions[self.current_direction_index] 88 | new_main_position = self.main_position + displacement 89 | 90 | # 运行轨迹矩形边框 91 | x_min, x_max = 0.5,2.5 92 | y_min, y_max = 0.5,2.5 93 | 94 | # 判断是否需要改变方向 95 | if not (x_min <= new_main_position[0] <= x_max) or \ 96 | not (y_min <= new_main_position[1] <= y_max): 97 | # 如果超出边界,改变方向 98 | self.current_direction_index = (self.current_direction_index + 1) % 4 99 | 100 | # 重新计算移动 101 | displacement = self.main_speed * self.move_directions[self.current_direction_index] 102 | 103 | new_main_position = self.main_position + displacement 104 | 105 | # 更新主目标位置 106 | self.main_position = new_main_position 107 | 108 | # 重置环境状态,随机设置目标位置和每个智能体初始位置、速度、方向 109 | self.main_position = np.clip(self.main_position,x_min, x_max) 110 | 111 | 112 | def calculate_target_coordinates(self): 113 | 114 | # Calculate x and y coordinates of the targets 115 | target_x = self.main_position[0] + self.target_distances * 
np.cos(self.target_angles) 116 | target_y = self.main_position[1] + self.target_distances * np.sin(self.target_angles) 117 | 118 | # Now, target_x and target_y contain the coordinates of each target 119 | target_coordinates = np.column_stack((target_x, target_y)) 120 | 121 | return target_coordinates 122 | 123 | def reset(self,seed=0): 124 | # np.random.seed(seed) 125 | 126 | pi = np.pi 127 | inf=np.inf 128 | 129 | self.frames=[] 130 | 131 | # 重置环境记录 132 | self.episode_length=0 133 | self.episode_rewards=[0]*self.num_agents 134 | 135 | # 重置环境状态,随机设置参考点位置和每个智能体初始位置、速度、方向 136 | self.main_position = np.random.uniform(self.x_min, self.x_max, size=(2,)) 137 | 138 | # 随机设置每个智能体初始位置,确保距离目标一定距离 139 | self.agent_positions = np.random.uniform(self.x_min, self.x_max, size=(self.num_agents,2)) 140 | 141 | self.speeds = np.zeros(self.num_agents) 142 | self.directions = np.zeros(self.num_agents) 143 | 144 | self.current_step = 0 145 | 146 | # 使用广播操作计算每个代理与其他所有代理位置的差异 147 | differences = self.agent_positions[:, np.newaxis, :] - self.agent_positions[np.newaxis, :, :] 148 | # 使用 np.linalg.norm 计算每一行的2范数,即每个代理与其他所有代理之间的欧式距离 149 | distances = np.linalg.norm(differences, axis=2) 150 | # 找到距离当前智能体最小的其他智能体位置 151 | try: 152 | second_min_indices = np.argsort(distances, axis=1)[:, 1] 153 | second_min_values = distances[np.arange(len(distances)), second_min_indices] 154 | done=done or np.any(second_min_values<1) 155 | print(second_min_values) 156 | if np.any(second_min_values<1): 157 | print(second_min_values) 158 | print("碰撞") 159 | except: 160 | second_min_indices=[0]*self.num_agents 161 | 162 | 163 | # 返回新的状态,包括目标位置、智能体位置、速度和方向 164 | states=[] 165 | for i in range(self.num_agents): 166 | self.id[i]=1 167 | states.append(np.concatenate([self.id,self.agent_positions[i]-self.main_position,self.agent_positions[i]-self.agent_positions[second_min_indices[i]],self.speeds[i:i+1],self.directions[i:i+1]])) 168 | self.id[i]=0 169 | 170 | return states 171 | 172 | def step(self, action): 173 | # 解析动作为速度和方向 174 | for i in range(self.num_agents): 175 | speed_action, direction_action = action[i] 176 | 177 | # 根据速度动作更新速度 178 | if speed_action == 0: # 减速 179 | self.speeds[i] -= 0.1 180 | self.speeds[i]=max(self.speeds[i],self.low[-2]) 181 | elif speed_action == 1: # 加速 182 | self.speeds[i] += 0.1 183 | self.speeds[i]=min(self.speeds[i],self.high[-2]) 184 | 185 | # 根据方向动作更新方向 186 | if direction_action == 0: # 向左 187 | self.directions[i] -= 0.1 188 | elif direction_action == 1: # 向右 189 | self.directions[i] += 0.1 190 | self.directions[i]=np.mod(self.directions,2*np.pi) 191 | 192 | 193 | # 计算每个智能体的位移 194 | displacements = self.speeds[:, np.newaxis] * np.column_stack([np.cos(self.directions), np.sin(self.directions)]) 195 | 196 | # 更新每个智能体位置 197 | self.agent_positions += displacements 198 | 199 | # 超出范围则进行取余数操作 200 | self.agent_positions = np.clip(self.agent_positions, self.x_min,self.x_max) 201 | 202 | self.update_main_position() 203 | 204 | # 计算每个代理相对于参考点的位置差异 205 | target_positions=self.calculate_target_coordinates() 206 | differences = self.agent_positions - target_positions 207 | 208 | # 计算奖励 209 | 210 | distances_to_target = np.linalg.norm(differences, axis=1) 211 | 212 | # main位置取余 213 | self.main_position = np.clip(self.main_position,0, 5) 214 | 215 | # 计算个人奖励 216 | distance_reward=-0.1*distances_to_target 217 | 218 | rewards = distance_reward # 距离倒数作为奖励,目标是最小化距离 219 | 220 | # 计算团队奖励 221 | team_reward=np.mean(rewards) 222 | 223 | # 判断是否达到终止条件 224 | done = self.current_step >= self.max_steps 225 | if all(distances < 0.1 for 
distances in distances_to_target): 226 | team_reward+=1 227 | 228 | # 最终奖励 229 | rewards=rewards*self.tau+team_reward*(1-self.tau) 230 | 231 | self.current_step += 1 232 | 233 | # 使用广播操作计算每个代理与其他所有代理位置的差异 234 | differences = self.agent_positions[:, np.newaxis, :] - self.agent_positions[np.newaxis, :, :] 235 | # 使用 np.linalg.norm 计算每一行的2范数,即每个代理与其他所有代理之间的欧式距离 236 | distances = np.linalg.norm(differences, axis=2) 237 | # 找到距离当前智能体最小的其他智能体位置 238 | try: 239 | second_min_indices = np.argsort(distances, axis=1)[:, 1] 240 | second_min_values = distances[np.arange(len(distances)), second_min_indices] 241 | done=done or np.any(second_min_values<1) 242 | if np.any(second_min_values<1): 243 | print(second_min_values) 244 | print("碰撞") 245 | except: 246 | second_min_indices=[0]*self.num_agents 247 | 248 | # 返回新的状态,包括目标位置、智能体位置、速度和方向 249 | states=[] 250 | 251 | for i in range(self.num_agents): 252 | self.id[i]=1 253 | states.append(np.concatenate([self.id,self.agent_positions[i]-target_positions[i],self.agent_positions[i]-self.agent_positions[second_min_indices[i]],self.speeds[i:i+1],self.directions[i:i+1]])) 254 | self.id[i]=0 255 | self.episode_rewards[i]=rewards[i] 256 | 257 | # 组装info 258 | self.episode_length+=1 259 | 260 | info={} 261 | if done: 262 | self.episode += 1 263 | details = {} 264 | details['r'] = self.episode_rewards 265 | details['l'] = self.episode_length-1 266 | details['e'] = self.episode 267 | info['episode'] = details 268 | 269 | if len(self.frames)!=0: 270 | timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M') 271 | save_path = f"Follow/{timestamp}.gif" 272 | imageio.mimsave(save_path, self.frames, duration=0.1) # Adjust duration as needed 273 | print("GIF saved successfully.") 274 | 275 | # print("*"*20) 276 | # print("距离reward",distance_reward) 277 | # print("方位reward",angle_reward,0.1*angle_reward) 278 | 279 | # print("*"*20) 280 | 281 | return states,rewards,done,info 282 | 283 | def add_arrow(self, position, direction, color='black'): 284 | # 添加箭头 285 | arrow_length = 0.1 286 | arrow_head_width = 0.05 287 | 288 | arrow_dx = arrow_length * np.cos(direction) 289 | arrow_dy = arrow_length * np.sin(direction) 290 | 291 | self.ax.arrow(position[0], position[1], arrow_dx, arrow_dy, color=color, width=arrow_head_width) 292 | 293 | def render(self, mode="human"): 294 | # 清空子图内容 295 | self.ax.clear() 296 | 297 | # 绘制目标 298 | self.ax.plot(self.main_position[0], self.main_position[1], 'rs', label='main') 299 | 300 | # 绘制每个智能体 301 | for i in range(self.num_agents): 302 | self.ax.plot(self.agent_positions[i, 0], self.agent_positions[i, 1], 'bo', label=f'agent {i + 1}') 303 | 304 | # 添加运动方向箭头 305 | self.add_arrow(self.agent_positions[i], self.directions[i]) 306 | 307 | # 设置图形范围 308 | self.ax.set_xlim(self.x_min,self.x_max) # 根据需要调整范围 309 | self.ax.set_ylim(self.y_min,self.y_max) # 根据需要调整范围 310 | 311 | # 添加图例 312 | self.ax.legend() 313 | 314 | if mode == "human": 315 | # 显示图形 316 | plt.pause(0.01) # 添加短暂的时间间隔,单位为秒 317 | else: 318 | # 将当前帧的图形添加到列表中 319 | self.frames.append(self.fig_to_array()) 320 | 321 | def fig_to_array(self): 322 | # Convert the current figure to an array of pixels 323 | buf = io.BytesIO() 324 | self.ax.figure.savefig(buf, format='png') 325 | buf.seek(0) 326 | img = imageio.imread(buf) 327 | return img 328 | 329 | 330 | if __name__ == "__main__": 331 | 332 | 333 | pi=np.pi 334 | target_info={"num_agents":1,"0":(0,0),"1":(10,pi/3),"2":(20,pi/6),"3":(20,pi/3)} 335 | 336 | # 循环测试代码 337 | env = MultiAgentFollowEnv(target_info) 338 | 339 | state = env.reset() # 
reset the environment
340 |     done = False
341 | 
342 |     while not done:
343 |         # randomly generate a discrete action for each agent: the first dimension controls speed (accelerate, decelerate, hold),
344 |         # the second controls heading (turn left, turn right, hold)
345 |         actions = np.random.randint(3, size=(env.num_agents, 2))
346 | 
347 |         actions=np.array([[1,1]])
348 | 
349 |         states, rewards, done, _ = env.step(actions)  # take the actions
350 | 
351 |         print(rewards)
352 | 
353 |         env.render(mode="human")  # render the environment state
354 | 
355 |     print("Testing complete.")
356 | 
--------------------------------------------------------------------------------
/ppo/custom_env/Walker_Discreate.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | 
4 | 
5 | class Walker(gym.Env):
6 |     def __init__(self, bins=20):
7 |         self.env = gym.make("BipedalWalker-v3")
8 |         self.env = gym.wrappers.RecordEpisodeStatistics(self.env)
9 |         self.observation_space = self.env.observation_space
10 |         self.action_space = gym.spaces.MultiDiscrete([bins]*self.env.action_space.shape[0])
11 |         self.discrete_action = np.linspace(-1., 1., bins)
12 | 
13 |     def step(self, action):
14 |         continuous_action = self.discrete_action[action]
15 |         next_state, reward, done, info = self.env.step(continuous_action)
16 |         return next_state, reward, done, info
17 | 
18 |     def reset(self):
19 |         next_state = self.env.reset()
20 |         return next_state
21 | 
22 |     def render(self, mode="human"):
23 |         self.env.render(mode=mode)
24 | 
25 |     def seed(self, seed=None):
26 |         self.env.seed(seed)
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     env = Walker()
31 |     state = env.reset()  # initialize the state
32 |     total_reward = 0
33 |     step_count = 0
34 |     done = False
35 |     while not done:
36 |         env.render()
37 |         action = env.action_space.sample()  # sample a random action (for demonstration)
38 |         state, reward, done, info = env.step(action)  # take the action
39 |         total_reward += reward
40 |         step_count += 1
41 |         print('At step {}, reward = {}, done = {}'.format(step_count, reward, done))
42 |     print('Total reward: {}'.format(total_reward))
43 | 
--------------------------------------------------------------------------------
/ppo/custom_env/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/custom_env/__init__.py
--------------------------------------------------------------------------------
/ppo/custom_env/__pycache__/MultiAgentFollowEnvV2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/custom_env/__pycache__/MultiAgentFollowEnvV2.cpython-38.pyc
--------------------------------------------------------------------------------
/ppo/custom_env/__pycache__/MultiAgentFollowEnvV3.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/custom_env/__pycache__/MultiAgentFollowEnvV3.cpython-38.pyc
--------------------------------------------------------------------------------
/ppo/custom_env/__pycache__/Walker_Discreate.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/custom_env/__pycache__/Walker_Discreate.cpython-38.pyc
--------------------------------------------------------------------------------
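
Note on the Walker wrapper above: /ppo/custom_env/Walker_Discreate.py turns BipedalWalker-v3's four continuous torques into a MultiDiscrete([bins] * 4) action space by indexing a shared np.linspace(-1., 1., bins) grid. The snippet below is a minimal, self-contained sketch of just that index-to-torque mapping; it is not a file in this repository, it assumes the wrapper's default bins=20, and the variable names (grid, discrete_action, continuous_action) are illustrative only.

    import numpy as np

    bins = 20                               # same default as Walker(bins=20)
    grid = np.linspace(-1.0, 1.0, bins)     # bin centers shared by all four joints

    # one bin index per joint, as a MultiDiscrete policy head would produce
    discrete_action = np.array([0, 9, 10, 19])

    # fancy indexing maps each index to its continuous torque in [-1, 1]
    continuous_action = grid[discrete_action]
    print(continuous_action)                # approximately [-1., -0.0526, 0.0526, 1.]

Because every joint shares the same grid, a single lookup (self.discrete_action[action]) converts the whole MultiDiscrete action vector at once, which is what Walker.step does before delegating to the underlying environment.

--------------------------------------------------------------------------------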
/ppo/custom_env/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/custom_env/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /ppo/custom_env/__pycache__/snake_env.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/custom_env/__pycache__/snake_env.cpython-38.pyc -------------------------------------------------------------------------------- /ppo/custom_env/snake_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | import pygame 5 | import sys 6 | import time 7 | 8 | 9 | class SnakeEnv(gym.Env): 10 | def __init__(self, width=20, height=10): 11 | super(SnakeEnv, self).__init__() 12 | 13 | self.width = width 14 | self.height = height 15 | self.grid = np.zeros((height, width)) 16 | self.snake = None 17 | self.food = None 18 | self.action_mask = [True] * 4 19 | self.episode_length = 0 20 | self.episode_reward = 0 21 | self.episode = 0 22 | self.max_episode_length = 200 23 | self.cell_size = 20 24 | self.window_size = (self.width * self.cell_size, 25 | self.height * self.cell_size) 26 | # 定义颜色 27 | self.color_bg = (255, 255, 255) 28 | self.color_head = (0, 120, 120) 29 | self.color_body = (0, 255, 0) 30 | self.color_food = (255, 0, 0) 31 | 32 | # 定义贪吃蛇环境的观测空间和行动空间 33 | inf = self.width + self.height 34 | low = np.array([-10, -1.0, 0, 0, 0, 0]) # 连续状态空间的最小值 35 | high = np.array([10, 1.0, 1.0, 1.0, 1.0, 1.0]) # 连续状态空间的最大值 36 | self.observation_space = gym.spaces.Box(low, high, shape=(6,), dtype=float) 37 | 38 | # 0 左 1上 2右 3下 39 | self.action_space = gym.spaces.Discrete(4) 40 | 41 | def generate_food(self): 42 | while True: 43 | x = random.randint(0, self.width - 1) 44 | y = random.randint(0, self.height - 1) 45 | if (x, y) not in self.snake: 46 | return (x, y) 47 | 48 | def reset(self): 49 | self.grid = np.zeros((self.height, self.width)) 50 | self.snake = [(0, 0)] 51 | self.food = self.generate_food() 52 | self.time = 1 53 | self.action_mask = self.get_mask() 54 | self.episode_length = 0 55 | self.episode_reward = 0 56 | self.update_grid() 57 | return self.get_state() 58 | 59 | # 状态获取函数 60 | def get_state(self): 61 | state = [] 62 | x, y = self.snake[0] 63 | fx, fy = self.food 64 | state.append(fx - x) 65 | state.append(fy - y) 66 | for gx, gy in [(0, 1), (1, 0), (0, -1), (-1, 0)]: 67 | dx, dy = x + gx, y + gy 68 | if dx < 0 or dy < 0 or dx >= self.width or dy >= self.height or ( 69 | dx, dy) in self.snake: 70 | state.append(0) 71 | continue 72 | else: 73 | state.append(1) # 四个方向可以走 74 | return np.array(state, dtype=np.float32) 75 | 76 | # 更新当前动画状态 77 | def update_grid(self): 78 | self.grid = np.zeros((self.height, self.width)) 79 | x, y = self.snake[0] 80 | self.grid[y, x] = 1 81 | for x, y in self.snake[1:]: 82 | self.grid[y, x] = 2 83 | fx, fy = self.food 84 | self.grid[fy, fx] = 3 85 | 86 | # 获取动作mask 87 | def get_mask(self): 88 | action_mask = [True] * 4 89 | x, y = self.snake[0] 90 | for i, (gx, gy) in enumerate([(0, 1), (1, 0), (0, -1), (-1, 0)]): 91 | dx, dy = x + gx, y + gy 92 | if dx < 0 or dy < 0 or dx >= self.width or dy >= self.height or ( 93 | dx, dy) in self.snake: 94 | 
action_mask[i] = False 95 | else: 96 | action_mask[i] = True # True则表示动作可以执行 97 | return action_mask 98 | 99 | def step(self, action): 100 | x, y = self.snake[0] 101 | direction = [(0, 1), (1, 0), (0, -1), (-1, 0)] 102 | x = x + direction[action][0] 103 | y = y + direction[action][1] 104 | 105 | self.episode_length += 1 106 | if x < 0 or x >= self.width or y < 0 or y >= self.height or ( 107 | x, y) in self.snake: 108 | reward = -1 109 | done = True 110 | elif (x, y) == self.food: 111 | reward = 1 112 | self.snake.insert(0, (x, y)) 113 | self.food = self.generate_food() 114 | self.update_grid() 115 | done = False 116 | else: 117 | fx, fy = self.food 118 | d = (abs(x - fx) + abs(y - fy)) 119 | reward = 0 120 | self.snake.insert(0, (x, y)) 121 | self.snake.pop() 122 | self.update_grid() 123 | done = False 124 | 125 | info = {} 126 | self.episode_reward += reward 127 | # 更新action_mask 128 | self.action_mask = self.get_mask() 129 | if done: 130 | self.episode += 1 131 | details = {} 132 | details['r'] = self.episode_reward 133 | details['l'] = self.episode_length 134 | details['e'] = self.episode 135 | info['episode'] = details 136 | return self.get_state(), reward, done, info 137 | 138 | def render(self, mode='human'): 139 | # 初始化pygame窗口 140 | pygame.init() 141 | pygame.font.init() 142 | self.font = pygame.font.Font(None, 30) 143 | self.window = pygame.display.set_mode(self.window_size) 144 | 145 | pygame.display.set_caption("Snake Game") 146 | 147 | if mode == 'rgb_array': 148 | surface = pygame.Surface( 149 | (self.width * self.cell_size, self.height * self.cell_size)) 150 | self.window = surface 151 | 152 | self.window.fill(self.color_bg) 153 | 154 | for event in pygame.event.get(): 155 | if event.type == pygame.QUIT: 156 | pygame.quit() 157 | sys.exit() 158 | 159 | for y in range(self.height): 160 | for x in range(self.width): 161 | cell_value = self.grid[y, x] 162 | cell_rect = pygame.Rect(x * self.cell_size, y * self.cell_size, 163 | self.cell_size, self.cell_size) 164 | 165 | for y in range(self.height): 166 | for x in range(self.width): 167 | cell_value = self.grid[y, x] 168 | cell_rect = pygame.Rect(x * self.cell_size, y * self.cell_size, 169 | self.cell_size, self.cell_size) 170 | if cell_value == 0: # 白色的空白格子 171 | pygame.draw.rect(self.window, (255, 255, 0), cell_rect, 1) 172 | elif cell_value == 1: # 贪吃蛇身体 173 | pygame.draw.rect(self.window, self.color_head, cell_rect) 174 | elif cell_value == 2: # 贪吃蛇身体 175 | pygame.draw.rect(self.window, self.color_body, cell_rect) 176 | elif cell_value == 3: # 食物 177 | # pygame.draw.rect(self.window, self.color_food, cell_rect) 178 | pygame.draw.circle(self.window, self.color_food, 179 | (cell_rect.x + self.cell_size // 2, cell_rect.y + self.cell_size // 2), 180 | self.cell_size // 2) 181 | 182 | snake_length_text = self.font.render("Length: " + str(len(self.snake)), 183 | True, (0, 25, 25)) 184 | self.window.blit(snake_length_text, (0, 0)) 185 | 186 | pygame.display.flip() 187 | 188 | if mode == 'rgb_array': 189 | image_array = pygame.surfarray.array3d(self.window) 190 | return image_array 191 | 192 | def close(self): 193 | pygame.quit() # 释放 Pygame 占用的系统资源 194 | sys.exit() # 关闭程序 195 | 196 | 197 | if __name__ == "__main__": 198 | 199 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 200 | env = gym.make('Snake-v0') 201 | 202 | next_obs = env.reset() # 初始化状态 203 | total_reward = 0 204 | step_count = 0 205 | done = False 206 | from ppo_snake import PPOAgent 207 | from ppo_snake import parse_args 208 | import torch 209 | 210 | 
args = parse_args() 211 | agent = PPOAgent(env.observation_space, env.action_space, args).to(args.device) 212 | agent = torch.load("snake-v0/2024_01_30_22_49_Snake-v0.pth") 213 | while not done: 214 | next_obs = torch.tensor(next_obs).unsqueeze(0).to(args.device) 215 | with torch.no_grad(): 216 | action, logprob, entropy, value = agent.get_action_and_value(next_obs) 217 | next_obs, reward, done, info = env.step(action.flatten().cpu().numpy().item()) 218 | if done: 219 | next_obs = env.reset() 220 | env.render() 221 | time.sleep(0.1) 222 | # action = 1#env.action_space.sample() # 随机采样一个动作,此处用于演示 223 | # state, reward, done, info = env.step(action) # 执行动作 224 | total_reward += reward 225 | step_count += 1 226 | print('At step {}, reward = {}, done = {}'.format(step_count, reward, done)) 227 | print('Total reward: {}'.format(total_reward)) 228 | -------------------------------------------------------------------------------- /ppo/ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | def compute_advantage(gamma, lmbda, td_delta): 16 | td_delta = td_delta.detach().numpy() 17 | advantage_list = [] 18 | advantage = 0.0 19 | for delta in td_delta[::-1]: 20 | advantage = gamma * lmbda * advantage + delta 21 | advantage_list.append(advantage) 22 | advantage_list.reverse() 23 | return torch.tensor(advantage_list, dtype=torch.float) 24 | 25 | 26 | class CriticNet(nn.Module): 27 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 28 | super().__init__() 29 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 30 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 31 | 32 | def forward(self, state): 33 | hidden_state = F.relu(self.fc1(state)) 34 | acion_value = self.fc2(hidden_state) 35 | return acion_value 36 | 37 | 38 | class ActorNet(nn.Module): 39 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 40 | super().__init__() 41 | self.fc1 = nn.Linear(state_dim, hidden_dim) 42 | self.fc2 = nn.Linear(hidden_dim, action_dim) 43 | 44 | def forward(self, state): 45 | 46 | hidden_state = F.relu(self.fc1(state)) 47 | # print(self.fc2(hidden_state)) 48 | probs = F.softmax(self.fc2(hidden_state), dim=1) 49 | return probs 50 | 51 | 52 | class PPO: 53 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 54 | gamma, lmbda, eps, epochs, device) -> None: 55 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 56 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 57 | self.gamma = gamma 58 | self.lmbda = lmbda 59 | self.eps = eps 60 | self.epochs = epochs 61 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 62 | lr=actor_lr) 63 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 64 | lr=critic_lr) 65 | self.device = device 66 | 67 | def save_model(self, save_path='./', filename='model'): 68 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 69 | 70 | def load_model(self, load_path): 71 | self.actor.load_state_dict(torch.load(load_path)) 72 | 73 | def take_action(self, state): 74 | state = torch.tensor([state], dtype=torch.float).to(self.device) 75 | probs = self.actor(state) 76 | action_dist = torch.distributions.Categorical(probs) 77 | action 
= action_dist.sample() 78 | return action.item() 79 | 80 | def update(self, transition_dict): 81 | rewards = torch.tensor(transition_dict['rewards'], 82 | dtype=torch.float).view(-1, 1).to(self.device) 83 | states = torch.tensor(transition_dict['states'], 84 | dtype=torch.float).to(self.device) 85 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 86 | self.device) 87 | next_states = torch.tensor(transition_dict['next_states'], 88 | dtype=torch.float).to(self.device) 89 | next_actions = torch.tensor(transition_dict['next_actions']).view( 90 | -1, 1).to(self.device) 91 | dones = torch.tensor(transition_dict['dones'], 92 | dtype=torch.float).to(self.device).view(-1, 1) 93 | 94 | v_now = self.critic(states).view(-1, 1) 95 | v_next = self.critic(next_states).view(-1, 1) 96 | y_now = (self.gamma * v_next * (1 - dones) + rewards).view(-1, 1) 97 | td_delta = y_now - v_now 98 | advantage = compute_advantage(self.gamma, self.lmbda, 99 | td_delta.cpu()).to(self.device) 100 | old_log_probs = torch.log(self.actor(states).gather(1, 101 | actions)).detach() 102 | for _ in range(self.epochs): 103 | log_probs = torch.log(self.actor(states).gather(1, actions)) 104 | ratio = torch.exp(log_probs - old_log_probs) 105 | surr1 = ratio * advantage 106 | surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage 107 | actor_loss = torch.mean(-torch.min(surr1, surr2)) 108 | critic_loss = torch.mean( 109 | F.mse_loss(self.critic(states), y_now.detach())) 110 | 111 | self.actor_optimizer.zero_grad() 112 | self.critic_optimizer.zero_grad() 113 | 114 | actor_loss.backward() 115 | critic_loss.backward() 116 | 117 | self.actor_optimizer.step() 118 | self.critic_optimizer.step() 119 | 120 | 121 | if __name__ == "__main__": 122 | 123 | gamma = 0.99 124 | algorithm_name = "PPOdemo" 125 | num_episodes = 10000 126 | actor_lr = 1e-3 127 | critic_lr = 1e-3 128 | lmbda = 0.95 129 | eps = 0.2 130 | epochs = 8 131 | device = torch.device('cuda') 132 | env_name = 'Snake-v0' #'CartPole-v0' 133 | 134 | # 注册环境 135 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 136 | 137 | env = gym.make(env_name) 138 | 139 | random.seed(0) 140 | np.random.seed(0) 141 | env.seed(0) 142 | torch.manual_seed(0) 143 | 144 | state_dim = env.observation_space.shape[0] 145 | hidden_dim = 128 146 | action_dim = env.action_space.n 147 | agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, 148 | lmbda, eps, epochs, device) 149 | 150 | return_list = [] 151 | max_reward = 0 152 | for i in range(10): 153 | with tqdm(total=int(num_episodes / 10), 154 | desc='Iteration %d' % i) as pbar: 155 | for i_episodes in range(int(num_episodes / 10)): 156 | episode_return = 0 157 | state = env.reset() 158 | done = False 159 | 160 | transition_dict = { 161 | 'states': [], 162 | 'actions': [], 163 | 'next_states': [], 164 | 'next_actions': [], 165 | 'rewards': [], 166 | 'dones': [] 167 | } 168 | 169 | while not done: 170 | action = agent.take_action(state) 171 | next_state, reward, done, _ = env.step(action) 172 | next_action = agent.take_action(next_state) 173 | env.render() 174 | 175 | transition_dict['states'].append(state) 176 | transition_dict['actions'].append(action) 177 | transition_dict['next_states'].append(next_state) 178 | transition_dict['next_actions'].append(next_action) 179 | transition_dict['rewards'].append(reward) 180 | transition_dict['dones'].append(done) 181 | 182 | state = next_state 183 | episode_return += reward 184 | if i_episodes == int(num_episodes / 10) - 1: 185 | env.render() 186 | 
time.sleep(0.1) 187 | agent.update(transition_dict) 188 | 189 | return_list.append(episode_return) 190 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 191 | if episode_return > max_reward: 192 | max_reward = episode_return 193 | agent.save_model(env_name, algorithm_name) 194 | if (i_episodes + 1 % 10 == 0): 195 | pbar.set_postfix({ 196 | 'episode': 197 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 198 | 'return': 199 | '%.3f' % np.mean(return_list[-10:]) 200 | }) 201 | pbar.update(1) 202 | -------------------------------------------------------------------------------- /ppo/ppoconPendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | def compute_advantage(gamma, lmbda, td_delta): 16 | td_delta = td_delta.detach().numpy() 17 | advantage_list = [] 18 | advantage = 0.0 19 | for delta in td_delta[::-1]: 20 | advantage = gamma * lmbda * advantage + delta 21 | advantage_list.append(advantage) 22 | advantage_list.reverse() 23 | return torch.tensor(advantage_list, dtype=torch.float) 24 | 25 | 26 | class CriticNet(nn.Module): 27 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 28 | super().__init__() 29 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 30 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 31 | 32 | def forward(self, state): 33 | hidden_state = F.relu(self.fc1(state)) 34 | acion_value = self.fc2(hidden_state) 35 | return acion_value 36 | 37 | 38 | class ActorNet(nn.Module): 39 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 40 | super().__init__() 41 | self.fc1 = nn.Linear(state_dim, hidden_dim) 42 | self.fc_mu = nn.Linear(hidden_dim, action_dim) 43 | self.fc_std = nn.Linear(hidden_dim, action_dim) 44 | 45 | def forward(self, state): 46 | 47 | hidden_state = F.relu(self.fc1(state)) 48 | mu = 2.0 * F.tanh(self.fc_mu(hidden_state)) 49 | sigma = F.softplus(self.fc_std(hidden_state)) 50 | return mu, sigma 51 | 52 | 53 | class PPO: 54 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 55 | gamma, lmbda, eps, epochs, device) -> None: 56 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 57 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 58 | self.gamma = gamma 59 | self.lmbda = lmbda 60 | self.eps = eps 61 | self.epochs = epochs 62 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 63 | lr=actor_lr) 64 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 65 | lr=critic_lr) 66 | self.device = device 67 | 68 | def save_model(self, save_path='./', filename='model'): 69 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 70 | 71 | def load_model(self, load_path): 72 | self.actor.load_state_dict(torch.load(load_path)) 73 | 74 | def take_action(self, state): 75 | print(state) 76 | state = torch.tensor([state], dtype=torch.float).to(self.device) 77 | mu, sigma = self.actor(state) 78 | action_dist = torch.distributions.Normal(mu, sigma) 79 | action = action_dist.sample() 80 | # print(action) 81 | return action.cpu().numpy().flatten() 82 | 83 | def update(self, transition_dict): 84 | rewards = torch.tensor(transition_dict['rewards'], 85 | dtype=torch.float).view(-1, 
1).to(self.device) 86 | states = torch.tensor(transition_dict['states'], 87 | dtype=torch.float).to(self.device) 88 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 89 | self.device) 90 | next_states = torch.tensor(transition_dict['next_states'], 91 | dtype=torch.float).to(self.device) 92 | next_actions = torch.tensor(transition_dict['next_actions']).view( 93 | -1, 1).to(self.device) 94 | dones = torch.tensor(transition_dict['dones'], 95 | dtype=torch.float).to(self.device).view(-1, 1) 96 | 97 | v_now = self.critic(states).view(-1, 1) 98 | v_next = self.critic(next_states).view(-1, 1) 99 | y_now = (self.gamma * v_next * (1 - dones) + rewards).view(-1, 1) 100 | td_delta = y_now - v_now 101 | advantage = compute_advantage(self.gamma, self.lmbda, 102 | td_delta.cpu()).to(self.device) 103 | mu, sigma = self.actor(states) 104 | action_dists = torch.distributions.Normal(mu.detach(), sigma.detach()) 105 | old_log_probs = action_dists.log_prob(actions) 106 | for _ in range(self.epochs): 107 | 108 | mu, sigma = self.actor(states) 109 | action_dists = torch.distributions.Normal(mu, sigma) 110 | log_probs = action_dists.log_prob(actions) 111 | 112 | ratio = torch.exp(log_probs - old_log_probs) 113 | surr1 = ratio * advantage 114 | surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage 115 | actor_loss = torch.mean(-torch.min(surr1, surr2)) 116 | critic_loss = torch.mean( 117 | F.mse_loss(self.critic(states), y_now.detach())) 118 | 119 | self.actor_optimizer.zero_grad() 120 | self.critic_optimizer.zero_grad() 121 | 122 | actor_loss.backward() 123 | critic_loss.backward() 124 | 125 | self.actor_optimizer.step() 126 | self.critic_optimizer.step() 127 | 128 | 129 | if __name__ == "__main__": 130 | 131 | gamma = 0.9 132 | algorithm_name = "demo" 133 | num_episodes = 2000 134 | actor_lr = 1e-4 135 | critic_lr = 5e-3 136 | lmbda = 0.9 137 | eps = 0.2 138 | epochs = 10 139 | device = torch.device('cuda') 140 | env_name = 'Pendulum-v1' #'CartPole-v0' 141 | max_step = 500 142 | # 注册环境 143 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv') 144 | 145 | env = gym.make(env_name) 146 | 147 | random.seed(0) 148 | np.random.seed(0) 149 | env.seed(0) 150 | torch.manual_seed(0) 151 | 152 | state_dim = env.observation_space.shape[0] 153 | hidden_dim = 128 154 | action_dim = env.action_space.shape[0] 155 | agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, 156 | lmbda, eps, epochs, device) 157 | 158 | return_list = [] 159 | inf = float('inf') 160 | max_reward = -inf 161 | for i in range(10): 162 | with tqdm(total=int(num_episodes / 10), 163 | desc='Iteration %d' % i) as pbar: 164 | for i_episodes in range(int(num_episodes / 10)): 165 | episode_return = 0 166 | state = env.reset() 167 | done = False 168 | 169 | transition_dict = { 170 | 'states': [], 171 | 'actions': [], 172 | 'next_states': [], 173 | 'next_actions': [], 174 | 'rewards': [], 175 | 'dones': [] 176 | } 177 | 178 | for _ in range(max_step): 179 | action = agent.take_action(state) 180 | next_state, reward, done, _ = env.step(action) 181 | 182 | next_action = agent.take_action(next_state) 183 | # print(next_action, next_state, action, state, reward) 184 | 185 | transition_dict['states'].append(state) 186 | transition_dict['actions'].append(action) 187 | transition_dict['next_states'].append(next_state) 188 | transition_dict['next_actions'].append(next_action) 189 | transition_dict['rewards'].append((reward + 8.0) / 8.0) 190 | transition_dict['dones'].append(done) 191 | 192 | state = 
next_state 193 | episode_return += reward 194 | if i_episodes == int(num_episodes / 10) - 1: 195 | env.render() 196 | time.sleep(0.1) 197 | if done: 198 | break 199 | agent.update(transition_dict) 200 | 201 | return_list.append(episode_return) 202 | plot_smooth_reward(return_list, 100, env_name, algorithm_name) 203 | if episode_return > max_reward: 204 | max_reward = episode_return 205 | agent.save_model(env_name, algorithm_name) 206 | if (i_episodes + 1 % 10 == 0): 207 | pbar.set_postfix({ 208 | 'episode': 209 | '%d' % (num_episodes / 10 * i + i_episodes + 1), 210 | 'return': 211 | '%.3f' % np.mean(return_list[-10:]) 212 | }) 213 | pbar.update(1) 214 | -------------------------------------------------------------------------------- /ppo/ppodisPendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | import pygame 5 | import collections 6 | import random 7 | import torch.nn.functional as F 8 | from tqdm import tqdm 9 | import matplotlib.pyplot as plt 10 | import time 11 | import torch.nn as nn 12 | from rl_utils import plot_smooth_reward 13 | 14 | 15 | def compute_advantage(gamma, lmbda, td_delta): 16 | td_delta = td_delta.detach().numpy() 17 | advantage_list = [] 18 | advantage = 0.0 19 | for delta in td_delta[::-1]: 20 | advantage = gamma * lmbda * advantage + delta 21 | advantage_list.append(advantage) 22 | advantage_list.reverse() 23 | return torch.tensor(advantage_list, dtype=torch.float) 24 | 25 | 26 | class CriticNet(nn.Module): 27 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 28 | super().__init__() 29 | self.fc1 = torch.nn.Linear(state_dim, hidden_dim) 30 | self.fc2 = torch.nn.Linear(hidden_dim, 1) 31 | 32 | def forward(self, state): 33 | hidden_state = F.relu(self.fc1(state)) 34 | acion_value = self.fc2(hidden_state) 35 | return acion_value 36 | 37 | 38 | class ActorNet(nn.Module): 39 | def __init__(self, state_dim, hidden_dim, action_dim) -> None: 40 | super().__init__() 41 | self.fc1 = nn.Linear(state_dim, hidden_dim) 42 | self.fc2 = nn.Linear(hidden_dim, action_dim) 43 | 44 | def forward(self, state): 45 | 46 | hidden_state = F.relu(self.fc1(state)) 47 | # print(self.fc2(hidden_state)) 48 | probs = F.softmax(self.fc2(hidden_state), dim=1) 49 | return probs 50 | 51 | 52 | class PPO: 53 | def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, 54 | gamma, lmbda, eps, epochs, device) -> None: 55 | self.actor = ActorNet(state_dim, hidden_dim, action_dim).to(device) 56 | self.critic = CriticNet(state_dim, hidden_dim, action_dim).to(device) 57 | self.gamma = gamma 58 | self.lmbda = lmbda 59 | self.eps = eps 60 | self.epochs = epochs 61 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), 62 | lr=actor_lr) 63 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), 64 | lr=critic_lr) 65 | self.device = device 66 | 67 | def save_model(self, save_path='./', filename='model'): 68 | torch.save(self.actor.state_dict(), f"{save_path}\\{filename}.pth") 69 | 70 | def load_model(self, load_path): 71 | self.actor.load_state_dict(torch.load(load_path)) 72 | 73 | def take_action(self, state): 74 | state = torch.tensor(np.array([state]), dtype=torch.float).to(self.device) 75 | probs = self.actor(state) 76 | action_dist = torch.distributions.Categorical(probs) 77 | action = action_dist.sample() 78 | return action.item() 79 | 80 | def update(self, transition_dict): 81 | rewards = torch.tensor(transition_dict['rewards'], 82 | 
dtype=torch.float).view(-1, 1).to(self.device) 83 | states = torch.tensor(np.array(transition_dict['states']), 84 | dtype=torch.float).to(self.device) 85 | actions = torch.tensor(transition_dict['actions']).view(-1, 1).to( 86 | self.device) 87 | next_states = torch.tensor(transition_dict['next_states'], 88 | dtype=torch.float).to(self.device) 89 | next_actions = torch.tensor(transition_dict['next_actions']).view( 90 | -1, 1).to(self.device) 91 | dones = torch.tensor(transition_dict['dones'], 92 | dtype=torch.float).to(self.device).view(-1, 1) 93 | 94 | v_now = self.critic(states).view(-1, 1) 95 | v_next = self.critic(next_states).view(-1, 1) 96 | y_now = (self.gamma * v_next * (1 - dones) + rewards).view(-1, 1) 97 | td_delta = y_now - v_now 98 | advantage = compute_advantage(self.gamma, self.lmbda, 99 | td_delta.cpu()).to(self.device) 100 | old_log_probs = torch.log(self.actor(states).gather(1, 101 | actions)).detach() 102 | for _ in range(self.epochs): 103 | log_probs = torch.log(self.actor(states).gather(1, actions)) 104 | ratio = torch.exp(log_probs - old_log_probs) 105 | surr1 = ratio * advantage 106 | surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage 107 | actor_loss = torch.mean(-torch.min(surr1, surr2)) 108 | critic_loss = torch.mean( 109 | F.mse_loss(self.critic(states), y_now.detach())) 110 | 111 | self.actor_optimizer.zero_grad() 112 | self.critic_optimizer.zero_grad() 113 | 114 | actor_loss.backward() 115 | critic_loss.backward() 116 | 117 | self.actor_optimizer.step() 118 | self.critic_optimizer.step() 119 | 120 | 121 | if __name__ == "__main__": 122 | 123 | gamma = 0.9 124 | algorithm_name = "PPOdis" 125 | num_episodes = 2000 126 | actor_lr = 1e-3 127 | critic_lr = 2e-3 128 | lmbda = 0.9 129 | eps = 0.2 130 | epochs = 10 131 | device = torch.device('cuda') 132 | env_name = 'Pendulum-v1' #'CartPole-v0' 133 | max_step = 500 134 | 135 | env = gym.make(env_name) 136 | random_seed=3407 137 | random.seed(random_seed) 138 | np.random.seed(random_seed) 139 | torch.manual_seed(random_seed) 140 | 141 | state_dim = env.observation_space.shape[0] 142 | hidden_dim = 256 143 | try: 144 | action_dim = env.action_space.n 145 | except: 146 | action_dim=15 147 | agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, 148 | lmbda, eps, epochs, device) 149 | 150 | return_list = [] 151 | max_reward = 0 152 | 153 | lower_limit = -2 # 下限 154 | upper_limit = 2 # 上限 155 | # 计算每个等份的大小 156 | partition_size = (upper_limit - lower_limit) / (action_dim-1) 157 | # 使用lambda表达式生成等份的起始和结束位置 158 | partitions = [ [lower_limit + partition_size*i] for i in range(action_dim)] 159 | print(partitions) 160 | 161 | 162 | for i in range(10): 163 | with tqdm(total=int(num_episodes / 10), 164 | desc='Iteration %d' % i) as pbar: 165 | for i_episodes in range(int(num_episodes / 10)): 166 | episode_return = 0 167 | state = env.reset(seed=random_seed) 168 | done = False 169 | 170 | transition_dict = { 171 | 'states': [], 172 | 'actions': [], 173 | 'next_states': [], 174 | 'next_actions': [], 175 | 'rewards': [], 176 | 'dones': [] 177 | } 178 | 179 | for _ in range(max_step): 180 | if done: 181 | break 182 | action_index = agent.take_action(state) 183 | action=partitions[action_index] 184 | next_state, reward, done, _ = env.step(action) 185 | next_action_index = agent.take_action(next_state) 186 | transition_dict['states'].append(state) 187 | transition_dict['actions'].append(action_index) 188 | transition_dict['next_states'].append(next_state) 189 | 
/ppo/runs/BipedalWalker-v3__ppo_continuous__1__1706597051/events.out.tfevents.1706597051.SKY-20230422NIZ.12216.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/BipedalWalker-v3__ppo_continuous__1__1706597051/events.out.tfevents.1706597051.SKY-20230422NIZ.12216.0
--------------------------------------------------------------------------------
/ppo/runs/BipedalWalker-v3__ppo_continuous__1__1706771659/events.out.tfevents.1706771659.DESKTOP-VAT73EM.12564.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/BipedalWalker-v3__ppo_continuous__1__1706771659/events.out.tfevents.1706771659.DESKTOP-VAT73EM.12564.0
--------------------------------------------------------------------------------
/ppo/runs/BipedalWalker-v3__ppo_multienv__1__1706755654/events.out.tfevents.1706755654.DESKTOP-VAT73EM.4120.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/BipedalWalker-v3__ppo_multienv__1__1706755654/events.out.tfevents.1706755654.DESKTOP-VAT73EM.4120.0
--------------------------------------------------------------------------------
/ppo/runs/BipedalWalker-v3__ppo_multienv__1__1706774718/events.out.tfevents.1706774718.DESKTOP-VAT73EM.11980.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/BipedalWalker-v3__ppo_multienv__1__1706774718/events.out.tfevents.1706774718.DESKTOP-VAT73EM.11980.0
--------------------------------------------------------------------------------
/ppo/runs/BipedalWalkerHardcore-v3__ppo_multienv__1__1706780430/events.out.tfevents.1706780430.DESKTOP-VAT73EM.12616.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/BipedalWalkerHardcore-v3__ppo_multienv__1__1706780430/events.out.tfevents.1706780430.DESKTOP-VAT73EM.12616.0
--------------------------------------------------------------------------------
/ppo/runs/Follow__ppo_sharing__1__1708769402/events.out.tfevents.1708769402.LAPTOP-K5GRC2HU.20764.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/Follow__ppo_sharing__1__1708769402/events.out.tfevents.1708769402.LAPTOP-K5GRC2HU.20764.0 -------------------------------------------------------------------------------- /ppo/runs/Snake-v0__ppo_discretemask__1__1706672792/events.out.tfevents.1706672792.SKY-20230422NIZ.11492.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/Snake-v0__ppo_discretemask__1__1706672792/events.out.tfevents.1706672792.SKY-20230422NIZ.11492.0 -------------------------------------------------------------------------------- /ppo/runs/Snake-v0__ppo_discretemask__1__1707018799/events.out.tfevents.1707018799.DESKTOP-VAT73EM.4348.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/Snake-v0__ppo_discretemask__1__1707018799/events.out.tfevents.1707018799.DESKTOP-VAT73EM.4348.0 -------------------------------------------------------------------------------- /ppo/runs/Snake-v0__ppo_snake__1__1706623864/events.out.tfevents.1706623864.SKY-20230422NIZ.676.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/Snake-v0__ppo_snake__1__1706623864/events.out.tfevents.1706623864.SKY-20230422NIZ.676.0 -------------------------------------------------------------------------------- /ppo/runs/Snake-v0__ppo_snake__1__1708424259/events.out.tfevents.1708424259.LAPTOP-K5GRC2HU.22668.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/Snake-v0__ppo_snake__1__1708424259/events.out.tfevents.1708424259.LAPTOP-K5GRC2HU.22668.0 -------------------------------------------------------------------------------- /ppo/runs/Walker__ppomultidiscrete__1__1706597666/events.out.tfevents.1706597666.SKY-20230422NIZ.10160.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/Walker__ppomultidiscrete__1__1706597666/events.out.tfevents.1706597666.SKY-20230422NIZ.10160.0 -------------------------------------------------------------------------------- /ppo/runs/runs.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/runs/runs.rar -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_30_22_14_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_30_22_14_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_30_22_19_Snake-v0.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_30_22_19_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_30_22_22_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_30_22_22_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_30_22_36_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_30_22_36_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_30_22_49_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_30_22_49_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_31_19_26_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_31_19_26_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_31_19_27_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_31_19_27_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0/2024_01_31_19_28_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0/2024_01_31_19_28_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0mask/2024_01_31_11_48_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_11_48_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0mask/2024_01_31_11_49_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_11_49_Snake-v0.pth -------------------------------------------------------------------------------- /ppo/snake-v0mask/2024_01_31_11_51_Snake-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_11_51_Snake-v0.pth -------------------------------------------------------------------------------- 
/ppo/snake-v0mask/2024_01_31_11_55_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_11_55_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_12_00_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_12_00_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_12_09_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_12_09_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_12_13_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_12_13_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_12_31_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_12_31_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_12_48_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_12_48_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_17_20_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_17_20_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/snake-v0mask/2024_01_31_17_21_Snake-v0.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/ppo/snake-v0mask/2024_01_31_17_21_Snake-v0.pth
--------------------------------------------------------------------------------
/ppo/test_follow.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | def plot_agent_positions(target_info):
5 | num_agents = target_info["num_agents"]
6 | 
7 | # The main (leader) position
8 | main_position = np.array([0, 0])
9 | 
10 | # Initialise the array of agent positions
11 | agent_positions = np.zeros((num_agents, 2))
12 | 
13 | # Process each agent in turn
14 | for i in range(num_agents):
15 | distance, angle = target_info[str(i)]
16 | # Convert (distance, angle) into a position relative to the main position
17 | agent_positions[i, 0] = distance * np.cos(angle)
18 | agent_positions[i, 1] = distance * np.sin(angle)
19 | 
20 | # Plot the agent positions and the main position
21 | plt.scatter(agent_positions[:, 0], agent_positions[:, 1], label="Agents", marker='o', s=100)
22 | plt.scatter(main_position[0], main_position[1], color='red', label="Main Position", marker='s', s=200)
23 | 
24 | # Add annotations
25 | for i, values in target_info.items():
26 | if i != "num_agents":
27 | distance, angle = values
28 | plt.annotate(f"Agent {i}\nDistance: {distance}\nAngle: {angle:.2f}",
29 | xy=(agent_positions[int(i), 0], agent_positions[int(i), 1]),
30 | xytext=(agent_positions[int(i), 0]+1, agent_positions[int(i), 1]+1),
31 | arrowprops=dict(facecolor='black', shrink=0.05),
32 | fontsize=8, ha='center', va='bottom')
33 | 
34 | # Set figure properties
35 | plt.title("Agent Positions Relative to Main")
36 | plt.xlabel("X Coordinate")
37 | plt.ylabel("Y Coordinate")
38 | plt.legend()
39 | plt.grid(True)
40 | plt.axis('equal')  # equal aspect ratio for both axes
41 | plt.show()
42 | 
43 | # Example data
44 | pi = np.pi
45 | target_info = {"num_agents": 4, "0": (5, pi/6), "1": (5, pi/3), "2": (10, pi/6), "3": (10, pi/3)}
46 | 
47 | 
48 | # Call the function to draw the plot
49 | plot_agent_positions(target_info)
50 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gym==0.25.2
2 | matplotlib==3.5.3
3 | numpy==1.24.2
4 | pygame==2.1.2
5 | PyYAML==6.0.1
6 | torch==1.10.0+cu113
7 | tqdm==4.65.0
8 | 
--------------------------------------------------------------------------------
/result/gif/BipedalWalker-v3/BipedalWalker-v3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalker-v3/BipedalWalker-v3.gif
--------------------------------------------------------------------------------
/result/gif/BipedalWalker-v3/SAC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalker-v3/SAC.png
--------------------------------------------------------------------------------
/result/gif/BipedalWalker-v3/SAC.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalker-v3/SAC.pth
--------------------------------------------------------------------------------
/result/gif/BipedalWalker-v3/SACtrick.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalker-v3/SACtrick.png
--------------------------------------------------------------------------------
/result/gif/BipedalWalker-v3/SACtrick.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalker-v3/SACtrick.pth
--------------------------------------------------------------------------------
/result/gif/BipedalWalkerHardcore-v3/BipedalWalkerHardcore-v3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalkerHardcore-v3/BipedalWalkerHardcore-v3.gif -------------------------------------------------------------------------------- /result/gif/BipedalWalkerHardcore-v3/SAC10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalkerHardcore-v3/SAC10000.png -------------------------------------------------------------------------------- /result/gif/BipedalWalkerHardcore-v3/SAC10000.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalkerHardcore-v3/SAC10000.pth -------------------------------------------------------------------------------- /result/gif/BipedalWalkerHardcore-v3/SAC3000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalkerHardcore-v3/SAC3000.png -------------------------------------------------------------------------------- /result/gif/BipedalWalkerHardcore-v3/SAC3000.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/BipedalWalkerHardcore-v3/SAC3000.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/A2C.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/A2C.gif -------------------------------------------------------------------------------- /result/gif/Snake-v0/A2C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/A2C.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/A2C.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/A2C.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/A2C.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | gamma: 0.99 3 | algorithm_name: "A2C" 4 | num_episodes: 5000 5 | actor_lr: 1.0e-3 6 | critic_lr: 3.0e-3 7 | env_name: 'Snake-v0' 8 | hidden_dim: 128 9 | 10 | test: 11 | gamma: 0.99 12 | algorithm_name: "A2C" 13 | num_episodes: 5000 14 | actor_lr: 1.0e-3 15 | critic_lr: 3.0e-3 16 | env_name: 'Snake-v0' 17 | hidden_dim: 128 -------------------------------------------------------------------------------- /result/gif/Snake-v0/A2C_TARGET.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/A2C_TARGET.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/A2C_TARGET.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/A2C_TARGET.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/AC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/AC.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/AC.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/AC.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/AC_TARGET.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/AC_TARGET.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/AC_TARGET.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/AC_TARGET.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/DDQN.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DDQN.gif -------------------------------------------------------------------------------- /result/gif/Snake-v0/DDQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DDQN.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/DDQN.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DDQN.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/DDRQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DDRQN.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/DDRQN.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DDRQN.pth 
-------------------------------------------------------------------------------- /result/gif/Snake-v0/DQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DQN.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/DQN.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DQN.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/DRQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DRQN.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/DRQN.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/DRQN.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/PPO.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/PPO.gif -------------------------------------------------------------------------------- /result/gif/Snake-v0/PPO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/PPO.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/PPO.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/PPO.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/REINFORCE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/REINFORCE.png -------------------------------------------------------------------------------- /result/gif/Snake-v0/REINFORCE.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/REINFORCE.pth -------------------------------------------------------------------------------- /result/gif/Snake-v0/REINFORCE_baseline.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/REINFORCE_baseline.gif -------------------------------------------------------------------------------- /result/gif/Snake-v0/REINFORCE_baseline.png: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/REINFORCE_baseline.png
--------------------------------------------------------------------------------
/result/gif/Snake-v0/REINFORCE_baseline.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v0/REINFORCE_baseline.pth
--------------------------------------------------------------------------------
/result/gif/Snake-v1/A2C.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v1/A2C.gif
--------------------------------------------------------------------------------
/result/gif/Snake-v1/DDQN.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v1/DDQN.gif
--------------------------------------------------------------------------------
/result/gif/Snake-v1/PPO.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v1/PPO.gif
--------------------------------------------------------------------------------
/result/gif/Snake-v1/REINFORCE_baseline.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunwuzhou03/reinforcement-learning-lab/714b812fb704f11c9a34b3a8bb2b6234f15c9b5e/result/gif/Snake-v1/REINFORCE_baseline.gif
--------------------------------------------------------------------------------
/rl_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import gym
4 | import pygame
5 | import collections
6 | import random
7 | import torch.nn.functional as F
8 | from tqdm import tqdm
9 | import matplotlib.pyplot as plt
10 | import time
11 | import torch.nn as nn
12 | import os
13 | import imageio
14 | 
15 | 
16 | def save_video(frames, directory="./", filename="video", mode="gif", fps=30):
17 | height, width, _ = frames[0].shape
18 | 
19 | # Create the target directory if it does not exist
20 | os.makedirs(directory, exist_ok=True)
21 | 
22 | # Join the file name and the extension
23 | filename = f"{filename}.{mode}"
24 | # Alternatively:
25 | # filename = "{}.{}".format(filename, mode)
26 | 
27 | # Build the full file path
28 | filepath = os.path.join(directory, filename)
29 | 
30 | # Create the video writer
31 | writer = imageio.get_writer(filepath, fps=fps)
32 | 
33 | # Write every frame to the video
34 | for frame in frames:
35 | writer.append_data(frame)
36 | 
37 | # Close the video writer
38 | writer.close()
39 | 
40 | 
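# plot_smooth_reward() below saves a PNG of the raw episode returns together
# with a moving average computed via np.convolve in 'valid' mode, so the
# smoothed curve is window_size - 1 points shorter than the raw one.
# Illustrative call matching how the training scripts use it (directory and
# filename are chosen by the caller):
# plot_smooth_reward(return_list, window_size=100, directory="Pendulum-v1", filename="PPOdis")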
41 | def plot_smooth_reward(rewards,
42 | window_size=100,
43 | directory="./",
44 | filename="smooth_reward_plot"):
45 | 
46 | # Create the target directory if it does not exist
47 | os.makedirs(directory, exist_ok=True)
48 | 
49 | # Join the file name and the extension
50 | filename = f"{filename}.png"
51 | 
52 | # Build the full file path
53 | filepath = os.path.join(directory, filename)
54 | 
55 | # Compute the sliding-window average
56 | smoothed_rewards = np.convolve(rewards,
57 | np.ones(window_size) / window_size,
58 | mode='valid')
59 | 
60 | # Plot the raw and smoothed reward curves
61 | plt.plot(rewards, label='Raw Reward')
62 | plt.plot(smoothed_rewards, label='Smoothed Reward')
63 | 
64 | # Set the legend, title and axis labels
65 | plt.legend()
66 | plt.title('Smoothed Reward')
67 | plt.xlabel('Episode')
68 | plt.ylabel('Reward')
69 | 
70 | # Save the figure
71 | plt.savefig(filepath)
72 | 
73 | # Close the figure
74 | plt.close()
75 | 
--------------------------------------------------------------------------------
/test_agent.py:
--------------------------------------------------------------------------------
1 | from ddqn import DDQN, plot_smooth_reward, ReplayBuffer
2 | from a2c import A2C
3 | from ppo import PPO
4 | from reinforce_baseline import REINFORCE_BASELINE
5 | import numpy as np
6 | import torch
7 | import gym
8 | import pygame
9 | import collections
10 | import random
11 | import torch.nn.functional as F
12 | from tqdm import tqdm
13 | import matplotlib.pyplot as plt
14 | import time
15 | import pygame
16 | import datetime
17 | from rl_utils import save_video
18 | 
19 | gamma = 0.99
20 | num_episodes = 5000
21 | hidden_dim = 128
22 | buffersize = 10000
23 | minimal_size = 500
24 | batch_size = 32
25 | epsilon = 0.01
26 | target_update = 50
27 | learning_rate = 2e-3
28 | max_step = 500
29 | device = torch.device('cuda')
30 | 
31 | env_name = 'Snake-v0'
32 | # Register the custom environment
33 | gym.register(id='Snake-v0', entry_point='snake_env:SnakeEnv')
34 | 
35 | env = gym.make(env_name, width=10, height=10)
36 | random.seed(0)
37 | np.random.seed(0)
38 | env.seed(0)
39 | torch.manual_seed(0)
40 | replay_buffer = ReplayBuffer(buffersize)
41 | 
42 | state_dim = env.observation_space.shape[0]
43 | action_dim = env.action_space.n
44 | 
45 | load_path = 'Snake-v0/DDQN.pth'  # each load_path/agent pair below overrides the previous one; only the last pair (A2C) is actually evaluated
46 | agent = DDQN(state_dim, hidden_dim, action_dim, learning_rate, gamma, epsilon,
47 | target_update, device)
48 | 
49 | load_path = 'Snake-v0/REINFORCE_baseline.pth'
50 | agent = REINFORCE_BASELINE(state_dim, hidden_dim, action_dim, learning_rate,
51 | gamma, device)
52 | 
53 | load_path = 'Snake-v0/PPO.pth' #5best
54 | agent = PPO(state_dim, hidden_dim, action_dim, 1e-3, 1e-3, gamma, 0.95, 0.2, 5,
55 | device)
56 | 
57 | load_path = 'Snake-v0/A2C.pth' #5best
58 | agent = A2C(state_dim, hidden_dim, action_dim, 1e-3, 1e-3, gamma, device)
59 | 
60 | # Load the saved state dict (not used directly; load_model below reads the file itself)
61 | state_dict = torch.load(load_path)
62 | 
63 | # Load the weights into the agent
64 | 
65 | agent.load_model(load_path)
66 | 
67 | state = env.reset()
68 | done = False
69 | frame_list = []
70 | while not done:
71 | action = agent.take_action(state)  # select an action with the trained policy
72 | state, reward, done, info = env.step(action)
73 | rendered_image = env.render('rgb_array')
74 | frame_list.append(rendered_image)
75 | save_video(frame_list)
--------------------------------------------------------------------------------
/test_env.py:
--------------------------------------------------------------------------------
1 | import gym
2 | 
3 | env = gym.make('Pendulum-v1')
4 | 
5 | observation = env.reset()
6 | done = False
7 | 
8 | while not done:
9 | action = env.action_space.sample()  # sample a random action
10 | print(action)
11 | observation, reward, done, info = env.step(action)  # step the environment and collect the observation, reward, etc.
12 | print('Observation:', observation)
13 | print('Reward:', reward)
14 | 
--------------------------------------------------------------------------------
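As a quick end-to-end check of the helpers above, the snippet below (an illustrative sketch, not a repository file) records one random Pendulum-v1 rollout to a GIF with rl_utils.save_video, using the same gym==0.25.2 API as test_env.py; the output filename is arbitrary.

import gym
from rl_utils import save_video

env = gym.make('Pendulum-v1')
observation = env.reset()
done = False
frames = []
while not done:
    action = env.action_space.sample()  # random policy, as in test_env.py
    observation, reward, done, info = env.step(action)
    frames.append(env.render(mode='rgb_array'))  # collect RGB frames for the GIF
save_video(frames, directory='./', filename='pendulum_random', mode='gif', fps=30)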