├── 1.REINFORCE ├── README.md ├── REINFORCE.py ├── REINFORCE_baseline.py ├── data_train │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy │ ├── REINFORCE_env_CartPole-v0_number_2_seed_0.npy │ ├── REINFORCE_env_CartPole-v0_number_2_seed_10.npy │ ├── REINFORCE_env_CartPole-v0_number_2_seed_100.npy │ ├── REINFORCE_env_CartPole-v1_number_2_seed_0.npy │ ├── REINFORCE_env_CartPole-v1_number_2_seed_10.npy │ └── REINFORCE_env_CartPole-v1_number_2_seed_100.npy ├── runs │ └── REINFORCE │ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_0 │ │ └── events.out.tfevents.1648121668.李智.23156.0 │ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_10 │ │ └── events.out.tfevents.1648121786.李智.23156.1 │ │ ├── REINFORCE_baseline_env_CartPole-v0_number_2_seed_100 │ │ └── events.out.tfevents.1648121899.李智.23156.2 │ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_0 │ │ └── events.out.tfevents.1648121670.李智.15096.0 │ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_10 │ │ └── events.out.tfevents.1648121797.李智.15096.1 │ │ ├── REINFORCE_baseline_env_CartPole-v1_number_2_seed_100 │ │ └── events.out.tfevents.1648121918.李智.15096.2 │ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_0 │ │ └── events.out.tfevents.1648121512.李智.32424.0 │ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_10 │ │ └── events.out.tfevents.1648121583.李智.32424.1 │ │ ├── REINFORCE_env_CartPole-v0_number_2_seed_100 │ │ └── events.out.tfevents.1648121655.李智.32424.2 │ │ ├── REINFORCE_env_CartPole-v1_number_2_seed_0 │ │ └── events.out.tfevents.1648121526.李智.11256.0 │ │ ├── REINFORCE_env_CartPole-v1_number_2_seed_10 │ │ └── events.out.tfevents.1648121607.李智.11256.1 │ │ └── REINFORCE_env_CartPole-v1_number_2_seed_100 │ │ └── events.out.tfevents.1648121688.李智.11256.2 └── training results.png ├── 2.Actor-Critic ├── A2C.py ├── A2C_results.png ├── README.md ├── data_train │ ├── A2C_env_CartPole-v0_number_9_seed_0.npy │ ├── A2C_env_CartPole-v0_number_9_seed_10.npy │ ├── A2C_env_CartPole-v0_number_9_seed_100.npy │ ├── A2C_env_CartPole-v1_number_9_seed_0.npy │ ├── A2C_env_CartPole-v1_number_9_seed_10.npy │ └── A2C_env_CartPole-v1_number_9_seed_100.npy └── runs │ └── A2C │ ├── A2C_env_CartPole-v0_number_9_seed_0 │ └── events.out.tfevents.1648553119.李智.62564.0 │ ├── A2C_env_CartPole-v0_number_9_seed_10 │ └── events.out.tfevents.1648553543.李智.62564.1 │ ├── A2C_env_CartPole-v0_number_9_seed_100 │ └── events.out.tfevents.1648554019.李智.62564.2 │ ├── A2C_env_CartPole-v1_number_9_seed_0 │ └── events.out.tfevents.1648553122.李智.63460.0 │ ├── A2C_env_CartPole-v1_number_9_seed_10 │ └── events.out.tfevents.1648553561.李智.63460.1 │ └── A2C_env_CartPole-v1_number_9_seed_100 │ └── events.out.tfevents.1648554055.李智.63460.2 ├── 3.Rainbow_DQN ├── README.md ├── Rainbow_DQN_main.py ├── __pycache__ │ ├── network.cpython-37.pyc │ ├── rainbow_dqn.cpython-37.pyc │ ├── replay_buffer.cpython-37.pyc │ └── sum_tree.cpython-37.pyc ├── data_train │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy │ ├── 
DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy │ └── Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy ├── drawing_Rainbow_DQN.py ├── network.py ├── rainbow_dqn.py ├── rainbow_dqn_result.png ├── replay_buffer.py ├── runs │ └── DQN │ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0 │ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1 │ │ ├── DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2 │ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0 │ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1 │ │ ├── DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2 │ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0 │ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1 │ │ ├── DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2 │ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0 │ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1 │ │ ├── DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2 │ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0 │ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1 │ │ ├── DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2 │ │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_0 │ │ └── events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0 │ │ ├── Rainbow_DQN_env_LunarLander-v2_number_1_seed_10 │ │ └── events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0 │ │ └── Rainbow_DQN_env_LunarLander-v2_number_1_seed_100 │ │ └── events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0 └── sum_tree.py ├── 
4.PPO-discrete ├── PPO_discrete_main.py ├── README.md ├── __pycache__ │ ├── normalization.cpython-37.pyc │ ├── ppo_discrete.cpython-37.pyc │ └── replaybuffer.cpython-37.pyc ├── data_train │ └── readme.txt ├── normalization.py ├── ppo_discrete.py ├── replaybuffer.py ├── runs │ └── readme.txt └── training_result.png ├── 5.PPO-continuous ├── PPO_continuous_main.py ├── README.md ├── __pycache__ │ ├── normalization.cpython-37.pyc │ ├── ppo_continuous.cpython-37.pyc │ └── replaybuffer.cpython-37.pyc ├── data_train │ └── readme.txt ├── normalization.py ├── ppo_continuous.py ├── replaybuffer.py ├── runs │ └── readme.txt └── training_result.png ├── 6.DDPG └── DDPG.py ├── 7.TD3 ├── README.md ├── TD3.py ├── TD3_result.png ├── data_train │ ├── TD3_env_BipedalWalker-v3_number_1_seed_0.npy │ ├── TD3_env_BipedalWalker-v3_number_1_seed_10.npy │ ├── TD3_env_BipedalWalker-v3_number_1_seed_100.npy │ ├── TD3_env_HalfCheetah-v2_number_1_seed_0.npy │ ├── TD3_env_HalfCheetah-v2_number_1_seed_10.npy │ ├── TD3_env_HalfCheetah-v2_number_1_seed_100.npy │ ├── TD3_env_Hopper-v2_number_1_seed_0.npy │ ├── TD3_env_Hopper-v2_number_1_seed_10.npy │ ├── TD3_env_Hopper-v2_number_1_seed_100.npy │ ├── TD3_env_Pendulum-v1_number_1_seed_0.npy │ ├── TD3_env_Pendulum-v1_number_1_seed_10.npy │ ├── TD3_env_Pendulum-v1_number_1_seed_100.npy │ ├── TD3_env_Walker2d-v2_number_1_seed_0.npy │ ├── TD3_env_Walker2d-v2_number_1_seed_10.npy │ └── TD3_env_Walker2d-v2_number_1_seed_100.npy └── runs │ └── TD3 │ ├── TD3_env_BipedalWalker-v3_number_1_seed_0 │ └── events.out.tfevents.1648952137.李智.93956.0 │ ├── TD3_env_BipedalWalker-v3_number_1_seed_10 │ └── events.out.tfevents.1648882414.李智.81744.0 │ ├── TD3_env_BipedalWalker-v3_number_1_seed_100 │ └── events.out.tfevents.1648925401.李智.81744.1 │ ├── TD3_env_HalfCheetah-v2_number_1_seed_0 │ └── events.out.tfevents.1648909506.李智.60360.2 │ ├── TD3_env_HalfCheetah-v2_number_1_seed_10 │ └── events.out.tfevents.1648800524.李智.60360.0 │ ├── TD3_env_HalfCheetah-v2_number_1_seed_100 │ └── events.out.tfevents.1648852975.李智.60360.1 │ ├── TD3_env_Hopper-v2_number_1_seed_0 │ └── events.out.tfevents.1649010066.李智.85868.2 │ ├── TD3_env_Hopper-v2_number_1_seed_10 │ └── events.out.tfevents.1648901654.李智.85868.0 │ ├── TD3_env_Hopper-v2_number_1_seed_100 │ └── events.out.tfevents.1648956951.李智.85868.1 │ ├── TD3_env_Pendulum-v1_number_1_seed_0 │ └── events.out.tfevents.1649065960.李智.18392.2 │ ├── TD3_env_Pendulum-v1_number_1_seed_10 │ └── events.out.tfevents.1649057339.李智.18392.0 │ ├── TD3_env_Pendulum-v1_number_1_seed_100 │ └── events.out.tfevents.1649061632.李智.18392.1 │ ├── TD3_env_Walker2d-v2_number_1_seed_0 │ └── events.out.tfevents.1648846023.李智.76672.2 │ ├── TD3_env_Walker2d-v2_number_1_seed_10 │ └── events.out.tfevents.1648735005.李智.76672.0 │ └── TD3_env_Walker2d-v2_number_1_seed_100 │ └── events.out.tfevents.1648793243.李智.76672.1 ├── 8.SAC └── SAC-continuous.py ├── 9.PPO-discrete-RNN ├── PPO+RNN.png ├── PPO_discrete_rnn_main.py ├── README.md ├── __pycache__ │ ├── normalization.cpython-37.pyc │ ├── ppo_discrete_rnn.cpython-37.pyc │ └── replaybuffer.cpython-37.pyc ├── data_train │ ├── PPO_env_CartPole-v1_number_3_seed_0.npy │ ├── PPO_env_CartPole-v1_number_3_seed_10.npy │ ├── PPO_env_CartPole-v1_number_3_seed_100.npy │ ├── PPO_env_CartPole-v1_number_5_seed_0.npy │ ├── PPO_env_CartPole-v1_number_5_seed_10.npy │ ├── PPO_env_CartPole-v1_number_5_seed_100.npy │ ├── PPO_env_LunarLander-v2_number_3_seed_0.npy │ ├── PPO_env_LunarLander-v2_number_3_seed_10.npy │ ├── PPO_env_LunarLander-v2_number_3_seed_100.npy │ ├── 
PPO_env_LunarLander-v2_number_5_seed_0.npy │ ├── PPO_env_LunarLander-v2_number_5_seed_10.npy │ └── PPO_env_LunarLander-v2_number_5_seed_100.npy ├── normalization.py ├── ppo_discrete_rnn.py ├── replaybuffer.py └── runs │ └── PPO_discrete │ ├── env_CartPole-v1_number_3_seed_0 │ └── events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0 │ ├── env_CartPole-v1_number_3_seed_10 │ └── events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1 │ ├── env_CartPole-v1_number_3_seed_100 │ └── events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2 │ ├── env_CartPole-v1_number_5_seed_0 │ └── events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0 │ ├── env_CartPole-v1_number_5_seed_10 │ └── events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1 │ ├── env_CartPole-v1_number_5_seed_100 │ └── events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2 │ ├── env_LunarLander-v2_number_3_seed_0 │ └── events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0 │ ├── env_LunarLander-v2_number_3_seed_10 │ └── events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1 │ ├── env_LunarLander-v2_number_3_seed_100 │ └── events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2 │ ├── env_LunarLander-v2_number_5_seed_0 │ └── events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0 │ ├── env_LunarLander-v2_number_5_seed_10 │ └── events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0 │ └── env_LunarLander-v2_number_5_seed_100 │ └── events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0 ├── LICENSE └── README.md /1.REINFORCE/README.md: -------------------------------------------------------------------------------- 1 | # REINFORCE 2 | This is a concise Pytorch implementation of REINFORCE.
3 | REINFORCE.py is an implementation of REINFORCE without a baseline.
4 | REINFORCE_baseline.py is an implementation of REINFORCE with a baseline.
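For orientation, here is a minimal sketch of the per-step losses the two scripts implement, condensed from their learn() methods reproduced later in this file. The standalone helper function, its signature, and the example values are illustrative only, not part of the repo:

```python
import torch

def reinforce_losses(gamma, t, g, a_prob, a, v_s=None):
    """Per-step REINFORCE losses; g is the Monte-Carlo return from step t."""
    log_prob = torch.log(a_prob[a])
    if v_s is None:                                # REINFORCE.py: plain return
        return -gamma ** t * g * log_prob, None
    # REINFORCE_baseline.py: subtract the learned baseline V(s_t) from the
    # return, and fit the baseline itself by squared-error regression.
    policy_loss = -gamma ** t * (g - v_s).detach() * log_prob
    value_loss = (g - v_s) ** 2
    return policy_loss, value_loss

# Example with dummy numbers: two actions, return g=1.0, baseline estimate 0.4.
probs = torch.tensor([0.7, 0.3])
policy_loss, value_loss = reinforce_losses(
    gamma=0.99, t=0, g=1.0, a_prob=probs, a=0,
    v_s=torch.tensor(0.4, requires_grad=True))
```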
5 | 6 | ## How to use my code? 7 | You can directly run REINFORCE.py and REINFORCE_baseline.py in your own IDE.
8 | 9 | ### Training environments 10 | You can set 'env_index' in the code to change the environment (see the snippet below).
11 | env_index=0 represents 'CartPole-v0'
12 | env_index=1 represents 'CartPole-v1'
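For example, the environment selection at the top of the training script (taken from the `__main__` block of REINFORCE.py reproduced below) looks like this:

```python
import gym

env_name = ['CartPole-v0', 'CartPole-v1']
env_index = 0                                   # 0 -> CartPole-v0, 1 -> CartPole-v1
env = gym.make(env_name[env_index])             # training environment
env_evaluate = gym.make(env_name[env_index])    # separate copy used for evaluation
```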
13 | 14 | ### How to see the training results? 15 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
16 | The reward data are saved as numpy arrays in the 'data_train' folder.
17 | The training curves shown below are smoothed by averaging over a window of 10 steps.
18 | The solid line and the shaded area represent the mean and the standard deviation, respectively, over three random seeds (seed=0, 10, 100).
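For reference, one way to inspect the saved results outside of an IDE (the .npy file name follows the pattern used in data_train/, and TensorBoard is assumed to be installed):

```python
import numpy as np

# Evaluation rewards saved by REINFORCE.py, one entry per evaluation.
rewards = np.load('./data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy')
print(rewards.shape, rewards[-5:])

# The TensorBoard event files can be viewed with:
#   tensorboard --logdir runs/REINFORCE
```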
19 | 20 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/1.REINFORCE/training%20results.png) 21 | -------------------------------------------------------------------------------- /1.REINFORCE/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | class Policy(nn.Module): 10 | def __init__(self, state_dim, action_dim, hidden_width): 11 | super(Policy, self).__init__() 12 | self.l1 = nn.Linear(state_dim, hidden_width) 13 | self.l2 = nn.Linear(hidden_width, action_dim) 14 | 15 | def forward(self, s): 16 | s = F.relu(self.l1(s)) 17 | a_prob = F.softmax(self.l2(s), dim=1) 18 | return a_prob 19 | 20 | 21 | class REINFORCE(object): 22 | def __init__(self, state_dim, action_dim): 23 | self.state_dim = state_dim 24 | self.action_dim = action_dim 25 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network 26 | self.lr = 4e-4 # learning rate 27 | self.GAMMA = 0.99 # discount factor 28 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 29 | 30 | self.policy = Policy(state_dim, action_dim, self.hidden_width) 31 | self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr) 32 | 33 | def choose_action(self, s, deterministic): 34 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 35 | prob_weights = self.policy(s).detach().numpy().flatten() # probability distribution(numpy) 36 | if deterministic: # We use the deterministic policy during the evaluating 37 | a = np.argmax(prob_weights) # Select the action with the highest probability 38 | return a 39 | else: # We use the stochastic policy during the training 40 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution 41 | return a 42 | 43 | def store(self, s, a, r): 44 | self.episode_s.append(s) 45 | self.episode_a.append(a) 46 | self.episode_r.append(r) 47 | 48 | def learn(self, ): 49 | G = [] 50 | g = 0 51 | for r in reversed(self.episode_r): # calculate the return G reversely 52 | g = self.GAMMA * g + r 53 | G.insert(0, g) 54 | 55 | for t in range(len(self.episode_r)): 56 | s = torch.unsqueeze(torch.tensor(self.episode_s[t], dtype=torch.float), 0) 57 | a = self.episode_a[t] 58 | g = G[t] 59 | 60 | a_prob = self.policy(s).flatten() 61 | policy_loss = -pow(self.GAMMA, t) * g * torch.log(a_prob[a]) 62 | self.policy_optimizer.zero_grad() 63 | policy_loss.backward() 64 | self.policy_optimizer.step() 65 | 66 | # Clean the buffer 67 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 68 | 69 | 70 | def evaluate_policy(env, agent): 71 | times = 3 # Perform three evaluations and calculate the average 72 | evaluate_reward = 0 73 | for _ in range(times): 74 | s = env.reset() 75 | done = False 76 | episode_reward = 0 77 | while not done: 78 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 79 | s_, r, done, _ = env.step(a) 80 | episode_reward += r 81 | s = s_ 82 | evaluate_reward += episode_reward 83 | 84 | return int(evaluate_reward / times) 85 | 86 | 87 | if __name__ == '__main__': 88 | env_name = ['CartPole-v0', 'CartPole-v1'] 89 | env_index = 0 # The index of the environments above 90 | env = gym.make(env_name[env_index]) 91 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an 
environment 92 | number = 1 93 | seed = 0 94 | env.seed(seed) 95 | env_evaluate.seed(seed) 96 | np.random.seed(seed) 97 | torch.manual_seed(seed) 98 | 99 | state_dim = env.observation_space.shape[0] 100 | action_dim = env.action_space.n 101 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 102 | print("state_dim={}".format(state_dim)) 103 | print("action_dim={}".format(action_dim)) 104 | print("max_episode_steps={}".format(max_episode_steps)) 105 | 106 | agent = REINFORCE(state_dim, action_dim) 107 | writer = SummaryWriter(log_dir='runs/REINFORCE/REINFORCE_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # build a tensorboard 108 | 109 | max_train_steps = 1e5 # Maximum number of training steps 110 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 111 | evaluate_num = 0 # Record the number of evaluations 112 | evaluate_rewards = [] # Record the rewards during the evaluating 113 | total_steps = 0 # Record the total steps during the training 114 | 115 | while total_steps < max_train_steps: 116 | episode_steps = 0 117 | s = env.reset() 118 | done = False 119 | while not done: 120 | episode_steps += 1 121 | a = agent.choose_action(s, deterministic=False) 122 | s_, r, done, _ = env.step(a) 123 | agent.store(s, a, r) 124 | s = s_ 125 | 126 | # Evaluate the policy every 'evaluate_freq' steps 127 | if (total_steps + 1) % evaluate_freq == 0: 128 | evaluate_num += 1 129 | evaluate_reward = evaluate_policy(env_evaluate, agent) 130 | evaluate_rewards.append(evaluate_reward) 131 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 132 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 133 | if evaluate_num % 10 == 0: 134 | np.save('./data_train/REINFORCE_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 135 | 136 | total_steps += 1 137 | 138 | # An episode is over,then update 139 | agent.learn() 140 | -------------------------------------------------------------------------------- /1.REINFORCE/REINFORCE_baseline.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | class Policy(nn.Module): 10 | def __init__(self, state_dim, action_dim, hidden_width): 11 | super(Policy, self).__init__() 12 | self.l1 = nn.Linear(state_dim, hidden_width) 13 | self.l2 = nn.Linear(hidden_width, action_dim) 14 | 15 | def forward(self, s): 16 | s = F.relu(self.l1(s)) 17 | a_prob = F.softmax(self.l2(s), dim=1) 18 | return a_prob 19 | 20 | 21 | class Value(nn.Module): 22 | def __init__(self, state_dim, hidden_width): 23 | super(Value, self).__init__() 24 | self.l1 = nn.Linear(state_dim, hidden_width) 25 | self.l2 = nn.Linear(hidden_width, 1) 26 | 27 | def forward(self, s): 28 | s = F.relu(self.l1(s)) 29 | v_s = self.l2(s) 30 | return v_s 31 | 32 | 33 | class REINFORCE(object): 34 | def __init__(self, state_dim, action_dim): 35 | self.state_dim = state_dim 36 | self.action_dim = action_dim 37 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network 38 | self.lr = 4e-4 # learning rate 39 | self.GAMMA = 0.99 # discount factor 40 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 41 | 42 | self.policy = Policy(state_dim, action_dim, self.hidden_width) 43 | 
self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr) 44 | 45 | self.value = Value(state_dim, self.hidden_width) 46 | self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=self.lr) 47 | 48 | def choose_action(self, s, deterministic): 49 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 50 | prob_weights = self.policy(s).detach().numpy().flatten() # probability distribution(numpy) 51 | if deterministic: # We use the deterministic policy during the evaluating 52 | a = np.argmax(prob_weights) # Select the action with the highest probability 53 | return a 54 | else: # We use the stochastic policy during the training 55 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution 56 | return a 57 | 58 | def store(self, s, a, r): 59 | self.episode_s.append(s) 60 | self.episode_a.append(a) 61 | self.episode_r.append(r) 62 | 63 | def learn(self, ): 64 | G = [] 65 | g = 0 66 | for r in reversed(self.episode_r): # calculate the return G reversely 67 | g = self.GAMMA * g + r 68 | G.insert(0, g) 69 | 70 | for t in range(len(self.episode_r)): 71 | s = torch.unsqueeze(torch.tensor(self.episode_s[t], dtype=torch.float), 0) 72 | a = self.episode_a[t] 73 | g = G[t] 74 | v_s = self.value(s).flatten() 75 | 76 | # Update policy 77 | a_prob = self.policy(s).flatten() 78 | policy_loss = -pow(self.GAMMA, t) * ((g - v_s).detach()) * torch.log(a_prob[a]) 79 | self.policy_optimizer.zero_grad() 80 | policy_loss.backward() 81 | self.policy_optimizer.step() 82 | 83 | # Update value function 84 | value_loss = (g - v_s) ** 2 85 | self.value_optimizer.zero_grad() 86 | value_loss.backward() 87 | self.value_optimizer.step() 88 | 89 | # Clean the buffer 90 | self.episode_s, self.episode_a, self.episode_r = [], [], [] 91 | 92 | 93 | def evaluate_policy(env, agent): 94 | times = 3 # Perform three evaluations and calculate the average 95 | evaluate_reward = 0 96 | for _ in range(times): 97 | s = env.reset() 98 | done = False 99 | episode_reward = 0 100 | while not done: 101 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 102 | s_, r, done, _ = env.step(a) 103 | episode_reward += r 104 | s = s_ 105 | evaluate_reward += episode_reward 106 | 107 | return int(evaluate_reward / times) 108 | 109 | 110 | if __name__ == '__main__': 111 | env_name = ['CartPole-v0', 'CartPole-v1'] 112 | env_index = 0 # The index of the environments above 113 | env = gym.make(env_name[env_index]) 114 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 115 | number = 1 116 | seed = 0 117 | env.seed(seed) 118 | env_evaluate.seed(seed) 119 | np.random.seed(seed) 120 | torch.manual_seed(seed) 121 | 122 | state_dim = env.observation_space.shape[0] 123 | action_dim = env.action_space.n 124 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 125 | print("state_dim={}".format(state_dim)) 126 | print("action_dim={}".format(action_dim)) 127 | print("max_episode_steps={}".format(max_episode_steps)) 128 | 129 | agent = REINFORCE(state_dim, action_dim) 130 | writer = SummaryWriter(log_dir='runs/REINFORCE/REINFORCE_baseline_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # build a tensorboard 131 | 132 | max_train_steps = 1e5 # Maximum number of training steps 133 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 134 | evaluate_num = 0 # Record the number of evaluations 
135 | evaluate_rewards = [] # Record the rewards during the evaluating 136 | total_steps = 0 # Record the total steps during the training 137 | 138 | while total_steps < max_train_steps: 139 | episode_steps = 0 140 | s = env.reset() 141 | done = False 142 | while not done: 143 | episode_steps += 1 144 | a = agent.choose_action(s, deterministic=False) 145 | s_, r, done, _ = env.step(a) 146 | agent.store(s, a, r) 147 | s = s_ 148 | 149 | # Evaluate the policy every 'evaluate_freq' steps 150 | if (total_steps + 1) % evaluate_freq == 0: 151 | evaluate_num += 1 152 | evaluate_reward = evaluate_policy(env_evaluate, agent) 153 | evaluate_rewards.append(evaluate_reward) 154 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 155 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 156 | if evaluate_num % 10 == 0: 157 | np.save('./data_train/REINFORCE_baseline_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 158 | 159 | total_steps += 1 160 | 161 | # An episode is over,then update 162 | agent.learn() 163 | -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v0_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_0.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_10.npy -------------------------------------------------------------------------------- /1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/data_train/REINFORCE_env_CartPole-v1_number_2_seed_100.npy -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121668.李智.23156.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121668.李智.23156.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121786.李智.23156.1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121786.李智.23156.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121899.李智.23156.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121899.李智.23156.2 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121670.李智.15096.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121670.李智.15096.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121797.李智.15096.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121797.李智.15096.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121918.李智.15096.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_baseline_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121918.李智.15096.2 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121512.李智.32424.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_0/events.out.tfevents.1648121512.李智.32424.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121583.李智.32424.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_10/events.out.tfevents.1648121583.李智.32424.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121655.李智.32424.2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v0_number_2_seed_100/events.out.tfevents.1648121655.李智.32424.2 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121526.李智.11256.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_0/events.out.tfevents.1648121526.李智.11256.0 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121607.李智.11256.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_10/events.out.tfevents.1648121607.李智.11256.1 -------------------------------------------------------------------------------- /1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121688.李智.11256.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/runs/REINFORCE/REINFORCE_env_CartPole-v1_number_2_seed_100/events.out.tfevents.1648121688.李智.11256.2 -------------------------------------------------------------------------------- /1.REINFORCE/training results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/1.REINFORCE/training results.png -------------------------------------------------------------------------------- /2.Actor-Critic/A2C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | # The network of the actor 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim, action_dim, hidden_width): 12 | super(Actor, self).__init__() 13 | self.l1 = nn.Linear(state_dim, hidden_width) 14 | self.l2 = nn.Linear(hidden_width, action_dim) 15 | 16 | def forward(self, s): 17 | s = F.relu(self.l1(s)) 18 | a_prob = F.softmax(self.l2(s), dim=1) 19 | return a_prob 20 | 21 | 22 | # The network of the critic 23 | class Critic(nn.Module): 24 | def __init__(self, state_dim, hidden_width): 25 | super(Critic, self).__init__() 26 | self.l1 = nn.Linear(state_dim, hidden_width) 27 | self.l2 = nn.Linear(hidden_width, 1) 28 | 29 | def forward(self, s): 30 | s = F.relu(self.l1(s)) 31 | v_s = self.l2(s) 32 | return v_s 33 | 34 | 35 | class A2C(object): 36 | def __init__(self, state_dim, action_dim): 37 | self.state_dim = state_dim 38 | self.action_dim = action_dim 39 | self.hidden_width = 64 # The number of neurons in hidden layers of the neural network 40 | self.lr = 5e-4 # learning rate 41 | self.GAMMA = 0.99 # discount factor 42 | self.I = 1 43 | 44 | self.actor = Actor(state_dim, action_dim, self.hidden_width) 45 | self.actor_optimizer = 
torch.optim.Adam(self.actor.parameters(), lr=self.lr) 46 | 47 | self.critic = Critic(state_dim, self.hidden_width) 48 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 49 | 50 | def choose_action(self, s, deterministic): 51 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 52 | prob_weights = self.actor(s).detach().numpy().flatten() # probability distribution(numpy) 53 | if deterministic: # We use the deterministic policy during the evaluating 54 | a = np.argmax(prob_weights) # Select the action with the highest probability 55 | return a 56 | else: # We use the stochastic policy during the training 57 | a = np.random.choice(range(self.action_dim), p=prob_weights) # Sample the action according to the probability distribution 58 | return a 59 | 60 | def learn(self, s, a, r, s_, dw): 61 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 62 | s_ = torch.unsqueeze(torch.tensor(s_, dtype=torch.float), 0) 63 | v_s = self.critic(s).flatten() # v(s) 64 | v_s_ = self.critic(s_).flatten() # v(s') 65 | 66 | with torch.no_grad(): # td_target has no gradient 67 | td_target = r + self.GAMMA * (1 - dw) * v_s_ 68 | 69 | # Update actor 70 | log_pi = torch.log(self.actor(s).flatten()[a]) # log pi(a|s) 71 | actor_loss = -self.I * ((td_target - v_s).detach()) * log_pi # Only calculate the derivative of log_pi 72 | self.actor_optimizer.zero_grad() 73 | actor_loss.backward() 74 | self.actor_optimizer.step() 75 | 76 | # Update critic 77 | critic_loss = (td_target - v_s) ** 2 # Only calculate the derivative of v(s) 78 | self.critic_optimizer.zero_grad() 79 | critic_loss.backward() 80 | self.critic_optimizer.step() 81 | 82 | self.I *= self.GAMMA # Represent the gamma^t in th policy gradient theorem 83 | 84 | 85 | def evaluate_policy(env, agent): 86 | times = 3 # Perform three evaluations and calculate the average 87 | evaluate_reward = 0 88 | for _ in range(times): 89 | s = env.reset() 90 | done = False 91 | episode_reward = 0 92 | while not done: 93 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 94 | s_, r, done, _ = env.step(a) 95 | episode_reward += r 96 | s = s_ 97 | evaluate_reward += episode_reward 98 | 99 | return int(evaluate_reward / times) 100 | 101 | 102 | if __name__ == '__main__': 103 | env_name = ['CartPole-v0', 'CartPole-v1'] 104 | env_index = 0 105 | env = gym.make(env_name[env_index]) 106 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 107 | number = 9 108 | # Set random seed 109 | seed = 0 110 | env.seed(seed) 111 | env.action_space.seed(seed) 112 | env_evaluate.seed(seed) 113 | env_evaluate.action_space.seed(seed) 114 | np.random.seed(seed) 115 | torch.manual_seed(seed) 116 | 117 | state_dim = env.observation_space.shape[0] 118 | action_dim = env.action_space.n 119 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 120 | print("state_dim={}".format(state_dim)) 121 | print("action_dim={}".format(action_dim)) 122 | print("max_episode_steps={}".format(max_episode_steps)) 123 | 124 | agent = A2C(state_dim, action_dim) 125 | writer = SummaryWriter(log_dir='runs/A2C/A2C_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) # Build a tensorboard 126 | 127 | max_train_steps = 3e5 # Maximum number of training steps 128 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 129 | evaluate_rewards = [] # Record the rewards during the evaluating 130 | evaluate_num = 0 # 
Record the number of evaluations 131 | total_steps = 0 # Record the total steps during the training 132 | 133 | while total_steps < max_train_steps: 134 | episode_steps = 0 135 | s = env.reset() 136 | done = False 137 | agent.I = 1 138 | while not done: 139 | episode_steps += 1 140 | a = agent.choose_action(s, deterministic=False) 141 | s_, r, done, _ = env.step(a) 142 | 143 | # When dead or win or reaching the max_epsiode_steps, done will be Ture, we need to distinguish them; 144 | # dw means dead or win,there is no next state s'; 145 | # but when reaching the max_episode_steps,there is a next state s' actually. 146 | if done and episode_steps != max_episode_steps: 147 | dw = True 148 | else: 149 | dw = False 150 | 151 | agent.learn(s, a, r, s_, dw) 152 | s = s_ 153 | 154 | # Evaluate the policy every 'evaluate_freq' steps 155 | if (total_steps + 1) % evaluate_freq == 0: 156 | evaluate_num += 1 157 | evaluate_reward = evaluate_policy(env_evaluate, agent) 158 | evaluate_rewards.append(evaluate_reward) 159 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 160 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 161 | # Save the rewards 162 | if evaluate_num % 10 == 0: 163 | np.save('./data_train/A2C_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 164 | 165 | total_steps += 1 166 | -------------------------------------------------------------------------------- /2.Actor-Critic/A2C_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/A2C_results.png -------------------------------------------------------------------------------- /2.Actor-Critic/README.md: -------------------------------------------------------------------------------- 1 | # Actor-Critic(A2C) 2 | This is a concise Pytorch implementation of Advantage Actor-Critic(A2C).
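For orientation, here is the one-step actor-critic update that A2C.py performs after every environment step, condensed from the learn() method reproduced above. The standalone function and its example values are illustrative only, not part of the repo:

```python
import torch

def a2c_losses(gamma, I, r, dw, v_s, v_s_next, log_pi):
    """One-step A2C losses; I is gamma**t, dw marks a true terminal state."""
    with torch.no_grad():                        # the TD target carries no gradient
        td_target = r + gamma * (1 - dw) * v_s_next
    actor_loss = -I * (td_target - v_s).detach() * log_pi   # policy-gradient term
    critic_loss = (td_target - v_s) ** 2                     # squared TD error
    return actor_loss, critic_loss

# Example with dummy values; in A2C.py, I starts at 1 and is multiplied by gamma
# after each step to represent gamma**t from the policy gradient theorem.
actor_loss, critic_loss = a2c_losses(
    gamma=0.99, I=1.0, r=1.0, dw=False,
    v_s=torch.tensor(0.5, requires_grad=True),
    v_s_next=torch.tensor(0.6, requires_grad=True),
    log_pi=torch.log(torch.tensor(0.7, requires_grad=True)))
```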
3 | 4 | ## How to use my code? 5 | You can directly run A2C.py in your own IDE.
6 | 7 | ### Training environments 8 | You can set 'env_index' in the code to change the environment.
9 | env_index=0 represents 'CartPole-v0'
10 | env_index=1 represents 'CartPole-v1'
11 | 12 | ### How to see the training results? 13 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
14 | The reward data are saved as numpy arrays in the 'data_train' folder.
15 | The training curves shown below are smoothed by averaging over a window of 10 steps.
16 | The solid line and the shaded area represent the mean and the standard deviation, respectively, over three random seeds (seed=0, 10, 100).
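The plotting script itself is not included in this folder; a minimal sketch of the 10-step moving-average smoothing described above (the file name is taken from the data_train listing) could look like this:

```python
import numpy as np

rewards = np.load('./data_train/A2C_env_CartPole-v0_number_9_seed_0.npy')

window = 10                                        # averaging window from the text
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
print(len(rewards), len(smoothed))
```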
17 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/2.Actor-Critic/A2C_results.png) 18 | -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_0.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_10.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v0_number_9_seed_100.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_0.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_10.npy -------------------------------------------------------------------------------- /2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/data_train/A2C_env_CartPole-v1_number_9_seed_100.npy -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_0/events.out.tfevents.1648553119.李智.62564.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_0/events.out.tfevents.1648553119.李智.62564.0 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_10/events.out.tfevents.1648553543.李智.62564.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_10/events.out.tfevents.1648553543.李智.62564.1 -------------------------------------------------------------------------------- 
/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_100/events.out.tfevents.1648554019.李智.62564.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v0_number_9_seed_100/events.out.tfevents.1648554019.李智.62564.2 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_0/events.out.tfevents.1648553122.李智.63460.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_0/events.out.tfevents.1648553122.李智.63460.0 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_10/events.out.tfevents.1648553561.李智.63460.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_10/events.out.tfevents.1648553561.李智.63460.1 -------------------------------------------------------------------------------- /2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_100/events.out.tfevents.1648554055.李智.63460.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/2.Actor-Critic/runs/A2C/A2C_env_CartPole-v1_number_9_seed_100/events.out.tfevents.1648554055.李智.63460.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/README.md: -------------------------------------------------------------------------------- 1 | # Rainbow DQN 2 | This is a concise Pytorch implementation of Rainbow DQN, including Double Q-learning, Dueling network, Noisy network, PER and n-steps Q-learning.
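Each of the five extensions can be toggled independently in Rainbow_DQN_main.py (reproduced further below), and the ablation files in data_train/ are named after the components that remain enabled. A small standalone mirror of that run-naming logic:

```python
def algorithm_name(use_double, use_dueling, use_noisy, use_per, use_n_steps):
    """Mirror of the run-naming logic in Rainbow_DQN_main.py (see further below)."""
    algorithm = 'DQN'
    if use_double and use_dueling and use_noisy and use_per and use_n_steps:
        return 'Rainbow_' + algorithm              # all five extensions enabled
    if use_double:
        algorithm += '_Double'
    if use_dueling:
        algorithm += '_Dueling'
    if use_noisy:
        algorithm += '_Noisy'
    if use_per:
        algorithm += '_PER'
    if use_n_steps:
        algorithm += '_N_steps'
    return algorithm

print(algorithm_name(True, True, True, True, True))   # Rainbow_DQN
print(algorithm_name(True, True, True, False, True))  # DQN_Double_Dueling_Noisy_N_steps
```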
3 | 4 | ## How to use my code? 5 | You can directly run Rainbow_DQN_main.py in your own IDE.
6 | 7 | ### Training environments 8 | You can set 'env_index' in the code to change the environment.
9 | env_index=0 represents 'CartPole-v1'
10 | env_index=1 represents 'LunarLander-v2'
11 | 12 | ### How to see the training results? 13 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.
14 | The reward data are saved as numpy arrays in the 'data_train' folder.
15 | The training curves are shown below.
16 | The right plot is smoothed by averaging over a window of 10 steps. The solid line and the shaded area represent the mean and the standard deviation, respectively, over three random seeds (seed=0, 10, 100).
17 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/3.Rainbow_DQN/rainbow_dqn_result.png) 18 | 19 | ## Reference 20 | [1] Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep reinforcement learning[J]. Nature, 2015, 518(7540): 529-533.
21 | [2] Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double q-learning[C]//Proceedings of the AAAI conference on artificial intelligence. 2016, 30(1).
22 | [3] Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep reinforcement learning[C]//International conference on machine learning. PMLR, 2016: 1995-2003.
23 | [4] Fortunato M, Azar M G, Piot B, et al. Noisy networks for exploration[J]. arXiv preprint arXiv:1706.10295, 2017.
24 | [5] Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv preprint arXiv:1511.05952, 2015.
25 | [6] Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining improvements in deep reinforcement learning[C]//Thirty-second AAAI conference on artificial intelligence. 2018.
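For readers new to two of the components listed above, the standard n-step target and prioritized-sampling formulas behind the `--n_steps`, `--alpha`, and `--beta_init` hyperparameters in Rainbow_DQN_main.py are stated generically below; the exact implementation lives in replay_buffer.py and rainbow_dqn.py, which are not reproduced in this section, and with Double Q-learning the maximizing action is chosen by the online network:

```latex
y_t = \sum_{k=0}^{n-1} \gamma^{k} r_{t+k} + \gamma^{n} (1 - d_{t+n}) \max_{a'} Q_{\text{target}}(s_{t+n}, a'),
\qquad
P(i) = \frac{p_i^{\alpha}}{\sum_{k} p_k^{\alpha}},
\qquad
w_i = \left(N \cdot P(i)\right)^{-\beta}.
```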
26 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/Rainbow_DQN_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import gym 4 | from torch.utils.tensorboard import SummaryWriter 5 | from replay_buffer import * 6 | from rainbow_dqn import DQN 7 | import argparse 8 | 9 | 10 | class Runner: 11 | def __init__(self, args, env_name, number, seed): 12 | self.args = args 13 | self.env_name = env_name 14 | self.number = number 15 | self.seed = seed 16 | 17 | self.env = gym.make(env_name) 18 | self.env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment 19 | self.env.seed(seed) 20 | self.env.action_space.seed(seed) 21 | self.env_evaluate.seed(seed) 22 | self.env_evaluate.action_space.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | 26 | self.args.state_dim = self.env.observation_space.shape[0] 27 | self.args.action_dim = self.env.action_space.n 28 | self.args.episode_limit = self.env._max_episode_steps # Maximum number of steps per episode 29 | print("env={}".format(self.env_name)) 30 | print("state_dim={}".format(self.args.state_dim)) 31 | print("action_dim={}".format(self.args.action_dim)) 32 | print("episode_limit={}".format(self.args.episode_limit)) 33 | 34 | if args.use_per and args.use_n_steps: 35 | self.replay_buffer = N_Steps_Prioritized_ReplayBuffer(args) 36 | elif args.use_per: 37 | self.replay_buffer = Prioritized_ReplayBuffer(args) 38 | elif args.use_n_steps: 39 | self.replay_buffer = N_Steps_ReplayBuffer(args) 40 | else: 41 | self.replay_buffer = ReplayBuffer(args) 42 | self.agent = DQN(args) 43 | 44 | self.algorithm = 'DQN' 45 | if args.use_double and args.use_dueling and args.use_noisy and args.use_per and args.use_n_steps: 46 | self.algorithm = 'Rainbow_' + self.algorithm 47 | else: 48 | if args.use_double: 49 | self.algorithm += '_Double' 50 | if args.use_dueling: 51 | self.algorithm += '_Dueling' 52 | if args.use_noisy: 53 | self.algorithm += '_Noisy' 54 | if args.use_per: 55 | self.algorithm += '_PER' 56 | if args.use_n_steps: 57 | self.algorithm += "_N_steps" 58 | 59 | self.writer = SummaryWriter(log_dir='runs/DQN/{}_env_{}_number_{}_seed_{}'.format(self.algorithm, env_name, number, seed)) 60 | 61 | self.evaluate_num = 0 # Record the number of evaluations 62 | self.evaluate_rewards = [] # Record the rewards during the evaluating 63 | self.total_steps = 0 # Record the total steps during the training 64 | if args.use_noisy: # 如果使用Noisy net,就不需要epsilon贪心策略了 65 | self.epsilon = 0 66 | else: 67 | self.epsilon = self.args.epsilon_init 68 | self.epsilon_min = self.args.epsilon_min 69 | self.epsilon_decay = (self.args.epsilon_init - self.args.epsilon_min) / self.args.epsilon_decay_steps 70 | 71 | def run(self, ): 72 | self.evaluate_policy() 73 | while self.total_steps < self.args.max_train_steps: 74 | state = self.env.reset() 75 | done = False 76 | episode_steps = 0 77 | while not done: 78 | action = self.agent.choose_action(state, epsilon=self.epsilon) 79 | next_state, reward, done, _ = self.env.step(action) 80 | episode_steps += 1 81 | self.total_steps += 1 82 | 83 | if not self.args.use_noisy: # Decay epsilon 84 | self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon - self.epsilon_decay > self.epsilon_min else self.epsilon_min 85 | 86 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 87 | # terminal means dead or win,there is 
no next state s'; 88 | # but when reaching the max_episode_steps,there is a next state s' actually. 89 | if done and episode_steps != self.args.episode_limit: 90 | if self.env_name == 'LunarLander-v2': 91 | if reward <= -100: reward = -1 # good for LunarLander 92 | terminal = True 93 | else: 94 | terminal = False 95 | 96 | self.replay_buffer.store_transition(state, action, reward, next_state, terminal, done) # Store the transition 97 | state = next_state 98 | 99 | if self.replay_buffer.current_size >= self.args.batch_size: 100 | self.agent.learn(self.replay_buffer, self.total_steps) 101 | 102 | if self.total_steps % self.args.evaluate_freq == 0: 103 | self.evaluate_policy() 104 | # Save reward 105 | np.save('./data_train/{}_env_{}_number_{}_seed_{}.npy'.format(self.algorithm, self.env_name, self.number, self.seed), np.array(self.evaluate_rewards)) 106 | 107 | def evaluate_policy(self, ): 108 | evaluate_reward = 0 109 | self.agent.net.eval() 110 | for _ in range(self.args.evaluate_times): 111 | state = self.env_evaluate.reset() 112 | done = False 113 | episode_reward = 0 114 | while not done: 115 | action = self.agent.choose_action(state, epsilon=0) 116 | next_state, reward, done, _ = self.env_evaluate.step(action) 117 | episode_reward += reward 118 | state = next_state 119 | evaluate_reward += episode_reward 120 | self.agent.net.train() 121 | evaluate_reward /= self.args.evaluate_times 122 | self.evaluate_rewards.append(evaluate_reward) 123 | print("total_steps:{} \t evaluate_reward:{} \t epsilon:{}".format(self.total_steps, evaluate_reward, self.epsilon)) 124 | self.writer.add_scalar('step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps) 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = argparse.ArgumentParser("Hyperparameter Setting for DQN") 129 | parser.add_argument("--max_train_steps", type=int, default=int(4e5), help=" Maximum number of training steps") 130 | parser.add_argument("--evaluate_freq", type=float, default=1e3, help="Evaluate the policy every 'evaluate_freq' steps") 131 | parser.add_argument("--evaluate_times", type=float, default=3, help="Evaluate times") 132 | 133 | parser.add_argument("--buffer_capacity", type=int, default=int(1e5), help="The maximum replay-buffer capacity ") 134 | parser.add_argument("--batch_size", type=int, default=256, help="batch size") 135 | parser.add_argument("--hidden_dim", type=int, default=256, help="The number of neurons in hidden layers of the neural network") 136 | parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate of actor") 137 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 138 | parser.add_argument("--epsilon_init", type=float, default=0.5, help="Initial epsilon") 139 | parser.add_argument("--epsilon_min", type=float, default=0.1, help="Minimum epsilon") 140 | parser.add_argument("--epsilon_decay_steps", type=int, default=int(1e5), help="How many steps before the epsilon decays to the minimum") 141 | parser.add_argument("--tau", type=float, default=0.005, help="soft update the target network") 142 | parser.add_argument("--use_soft_update", type=bool, default=True, help="Whether to use soft update") 143 | parser.add_argument("--target_update_freq", type=int, default=200, help="Update frequency of the target network(hard update)") 144 | parser.add_argument("--n_steps", type=int, default=5, help="n_steps") 145 | parser.add_argument("--alpha", type=float, default=0.6, help="PER parameter") 146 | parser.add_argument("--beta_init", 
type=float, default=0.4, help="Important sampling parameter in PER") 147 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Learning rate Decay") 148 | parser.add_argument("--grad_clip", type=float, default=10.0, help="Gradient clip") 149 | 150 | parser.add_argument("--use_double", type=bool, default=True, help="Whether to use double Q-learning") 151 | parser.add_argument("--use_dueling", type=bool, default=True, help="Whether to use dueling network") 152 | parser.add_argument("--use_noisy", type=bool, default=True, help="Whether to use noisy network") 153 | parser.add_argument("--use_per", type=bool, default=True, help="Whether to use PER") 154 | parser.add_argument("--use_n_steps", type=bool, default=True, help="Whether to use n_steps Q-learning") 155 | 156 | args = parser.parse_args() 157 | 158 | env_names = ['CartPole-v1', 'LunarLander-v2'] 159 | env_index = 1 160 | for seed in [0, 10, 100]: 161 | runner = Runner(args=args, env_name=env_names[env_index], number=1, seed=seed) 162 | runner.run() 163 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/network.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/network.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/rainbow_dqn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/rainbow_dqn.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/__pycache__/sum_tree.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/__pycache__/sum_tree.cpython-37.pyc -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- 
/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/data_train/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /3.Rainbow_DQN/drawing_Rainbow_DQN.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | 5 | 6 | def smooth(reward): 7 | smooth_reward = [] 8 | for i in range(reward.shape[0]): 9 | if i == 0: 10 | smooth_reward.append(reward[i]) 11 | else: 12 | smooth_reward.append(smooth_reward[-1] * 0.9 + reward[i] * 0.1) 13 | return np.array(smooth_reward) 14 | 15 | 16 | env_name = ['CartPole-v1', 'LunarLander-v2'] 17 | colors = ['r', 'darkorange', 'dodgerblue', 'limegreen', 'yellow', 'magenta', 'chocolate', 'indigo', 'gray', 'aqua', 'g', 'black'] 18 | 19 | 20 | def get_data(algorithm, env_index, number): 21 | reward1 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_0.npy'.format(algorithm, env_name[env_index], number))) 22 | reward2 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_10.npy'.format(algorithm, env_name[env_index], number))) 23 | reward3 = smooth(np.load('./data_train/{}_env_{}_number_{}_seed_100.npy'.format(algorithm, env_name[env_index], number))) 24 | reward = np.stack((reward1, reward2, reward3), axis=0) 25 | len = reward1.shape[0] 26 | 27 | return reward, len 28 | 29 | 30 | def drawing_CP(plt, algorithm, number, color, label): 31 | reward, len = get_data(algorithm=algorithm, env_index=0, number=number) 32 | sns.tsplot(time=np.arange(len), data=reward, color=color, linestyle='-') # color=darkorange dodgerblue 33 | plt.plot(reward.mean(0), color=color, label=label) 34 | plt.title("CartPole-v1", size=14) 35 | plt.xlabel("Steps", size=14) 36 | plt.ylabel("Reward", size=14) 37 | plt.xticks([0, 50, 100, 150], ['0', '50k', '100k', '150k'], size=14) 38 | plt.yticks(size=14) 39 | plt.ylim([0, 510]) 40 | plt.legend(loc='lower right', fontsize=14) 41 | 42 | 43 | def drawing_LL(plt, algorithm, number, color, label): 44 | reward, len = get_data(algorithm=algorithm, env_index=1, number=number) 45 | sns.tsplot(time=np.arange(len), data=reward, color=color, linestyle='-') # color=darkorange dodgerblue 46 | plt.plot(reward.mean(0), color=color, label=label) 47 | plt.title("LunarLander-v2", size=14) 48 | plt.xlabel("Steps", size=14) 49 | plt.ylabel("Reward", size=14) 50 | plt.xticks([0, 100, 200, 300, 400], ['0', '100k', '200k', '300k', '400k'], size=14) 51 | plt.yticks(size=14) 52 | plt.ylim([-300, 300]) 53 | plt.legend(loc='lower right', fontsize=14) 54 | 55 | 56 | sns.set_style('darkgrid') 57 | plt.figure() 58 | drawing_LL(plt, algorithm='Rainbow_DQN', number=1, color=colors[0], label='Rainbow_DQN') 59 | 60 | drawing_LL(plt, algorithm='DQN_dueling_Noisy_PER_N_steps', number=1, color=colors[1], label='Rainbow_DQN without Double') 61 | 62 | drawing_LL(plt, algorithm='DQN_double_Noisy_PER_N_steps', number=1, color=colors[2], label='Rainbow_DQN without Dueling') 63 | 64 | drawing_LL(plt, algorithm='DQN_double_dueling_Noisy_N_steps', number=1, color=colors[3], label='Rainbow DQN without PER') 65 | 66 | drawing_LL(plt, algorithm='DQN_double_dueling_Noisy_PER', number=1, color=colors[4], label='Rainbow_DQN without N-steps') 67 | 68 | drawing_LL(plt, algorithm='DQN_double_dueling_PER_N_steps', number=1, color=colors[9], label='Rainbow_DQN 
without Noisy') 69 | 70 | 71 | plt.show() 72 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | 7 | class Dueling_Net(nn.Module): 8 | def __init__(self, args): 9 | super(Dueling_Net, self).__init__() 10 | self.fc1 = nn.Linear(args.state_dim, args.hidden_dim) 11 | self.fc2 = nn.Linear(args.hidden_dim, args.hidden_dim) 12 | if args.use_noisy: 13 | self.V = NoisyLinear(args.hidden_dim, 1) 14 | self.A = NoisyLinear(args.hidden_dim, args.action_dim) 15 | else: 16 | self.V = nn.Linear(args.hidden_dim, 1) 17 | self.A = nn.Linear(args.hidden_dim, args.action_dim) 18 | 19 | def forward(self, s): 20 | s = torch.relu(self.fc1(s)) 21 | s = torch.relu(self.fc2(s)) 22 | V = self.V(s) # batch_size X 1 23 | A = self.A(s) # batch_size X action_dim 24 | Q = V + (A - torch.mean(A, dim=-1, keepdim=True)) # Q(s,a)=V(s)+A(s,a)-mean(A(s,a)) 25 | return Q 26 | 27 | 28 | class Net(nn.Module): 29 | def __init__(self, args): 30 | super(Net, self).__init__() 31 | self.fc1 = nn.Linear(args.state_dim, args.hidden_dim) 32 | self.fc2 = nn.Linear(args.hidden_dim, args.hidden_dim) 33 | if args.use_noisy: 34 | self.fc3 = NoisyLinear(args.hidden_dim, args.action_dim) 35 | else: 36 | self.fc3 = nn.Linear(args.hidden_dim, args.action_dim) 37 | 38 | def forward(self, s): 39 | s = torch.relu(self.fc1(s)) 40 | s = torch.relu(self.fc2(s)) 41 | Q = self.fc3(s) 42 | return Q 43 | 44 | 45 | class NoisyLinear(nn.Module): 46 | def __init__(self, in_features, out_features, sigma_init=0.5): 47 | super(NoisyLinear, self).__init__() 48 | self.in_features = in_features 49 | self.out_features = out_features 50 | self.sigma_init = sigma_init 51 | 52 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 53 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 54 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 55 | 56 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 57 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features)) 58 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 59 | 60 | self.reset_parameters() 61 | self.reset_noise() 62 | 63 | def forward(self, x): 64 | if self.training: 65 | self.reset_noise() 66 | weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon) # mul是对应元素相乘 67 | bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon) 68 | 69 | else: 70 | weight = self.weight_mu 71 | bias = self.bias_mu 72 | 73 | return F.linear(x, weight, bias) 74 | 75 | def reset_parameters(self): 76 | mu_range = 1 / math.sqrt(self.in_features) 77 | self.weight_mu.data.uniform_(-mu_range, mu_range) 78 | self.bias_mu.data.uniform_(-mu_range, mu_range) 79 | 80 | self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features)) 81 | self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.out_features)) # 这里要除以out_features 82 | 83 | def reset_noise(self): 84 | epsilon_i = self.scale_noise(self.in_features) 85 | epsilon_j = self.scale_noise(self.out_features) 86 | self.weight_epsilon.copy_(torch.ger(epsilon_j, epsilon_i)) 87 | self.bias_epsilon.copy_(epsilon_j) 88 | 89 | def scale_noise(self, size): 90 | x = torch.randn(size) # torch.randn产生标准高斯分布 91 | x = x.sign().mul(x.abs().sqrt()) 92 | return x 93 | 
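# A minimal, illustrative sanity check of NoisyLinear (not part of training; it only
# demonstrates the behaviour implemented above): in training mode the layer resamples its
# factorized Gaussian noise on every forward pass, while in eval mode only the deterministic
# mu parameters are used, so repeated calls return identical outputs.
if __name__ == '__main__':
    torch.manual_seed(0)
    layer = NoisyLinear(in_features=4, out_features=2)
    x = torch.randn(1, 4)
    layer.train()
    out_a, out_b = layer(x), layer(x)   # expected: different (noise resampled each call)
    layer.eval()
    out_c, out_d = layer(x), layer(x)   # expected: identical (noise disabled in eval mode)
    print(torch.allclose(out_a, out_b), torch.allclose(out_c, out_d))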
-------------------------------------------------------------------------------- /3.Rainbow_DQN/rainbow_dqn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import copy 4 | from network import Dueling_Net, Net 5 | 6 | 7 | class DQN(object): 8 | def __init__(self, args): 9 | self.action_dim = args.action_dim 10 | self.batch_size = args.batch_size # batch size 11 | self.max_train_steps = args.max_train_steps 12 | self.lr = args.lr # learning rate 13 | self.gamma = args.gamma # discount factor 14 | self.tau = args.tau # Soft update 15 | self.use_soft_update = args.use_soft_update 16 | self.target_update_freq = args.target_update_freq # hard update 17 | self.update_count = 0 18 | 19 | self.grad_clip = args.grad_clip 20 | self.use_lr_decay = args.use_lr_decay 21 | self.use_double = args.use_double 22 | self.use_dueling = args.use_dueling 23 | self.use_per = args.use_per 24 | self.use_n_steps = args.use_n_steps 25 | if self.use_n_steps: 26 | self.gamma = self.gamma ** args.n_steps 27 | 28 | if self.use_dueling: # Whether to use the 'dueling network' 29 | self.net = Dueling_Net(args) 30 | else: 31 | self.net = Net(args) 32 | 33 | self.target_net = copy.deepcopy(self.net) # Copy the online_net to the target_net 34 | 35 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr) 36 | 37 | def choose_action(self, state, epsilon): 38 | with torch.no_grad(): 39 | state = torch.unsqueeze(torch.tensor(state, dtype=torch.float), 0) 40 | q = self.net(state) 41 | if np.random.uniform() > epsilon: 42 | action = q.argmax(dim=-1).item() 43 | else: 44 | action = np.random.randint(0, self.action_dim) 45 | return action 46 | 47 | def learn(self, replay_buffer, total_steps): 48 | batch, batch_index, IS_weight = replay_buffer.sample(total_steps) 49 | 50 | with torch.no_grad(): # q_target has no gradient 51 | if self.use_double: # Whether to use the 'double q-learning' 52 | # Use online_net to select the action 53 | a_argmax = self.net(batch['next_state']).argmax(dim=-1, keepdim=True) # shape:(batch_size,1) 54 | # Use target_net to estimate the q_target 55 | q_target = batch['reward'] + self.gamma * (1 - batch['terminal']) * self.target_net(batch['next_state']).gather(-1, a_argmax).squeeze(-1) # shape:(batch_size,) 56 | else: 57 | q_target = batch['reward'] + self.gamma * (1 - batch['terminal']) * self.target_net(batch['next_state']).max(dim=-1)[0] # shape:(batch_size,) 58 | 59 | q_current = self.net(batch['state']).gather(-1, batch['action']).squeeze(-1) # shape:(batch_size,) 60 | td_errors = q_current - q_target # shape:(batch_size,) 61 | 62 | if self.use_per: 63 | loss = (IS_weight * (td_errors ** 2)).mean() 64 | replay_buffer.update_batch_priorities(batch_index, td_errors.detach().numpy()) 65 | else: 66 | loss = (td_errors ** 2).mean() 67 | 68 | self.optimizer.zero_grad() 69 | loss.backward() 70 | if self.grad_clip: 71 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_clip) 72 | self.optimizer.step() 73 | 74 | if self.use_soft_update: # soft update 75 | for param, target_param in zip(self.net.parameters(), self.target_net.parameters()): 76 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 77 | else: # hard update 78 | self.update_count += 1 79 | if self.update_count % self.target_update_freq == 0: 80 | self.target_net.load_state_dict(self.net.state_dict()) 81 | 82 | if self.use_lr_decay: # learning rate Decay 83 | self.lr_decay(total_steps) 84 | 85 | def lr_decay(self, 
total_steps): 86 | lr_now = 0.9 * self.lr * (1 - total_steps / self.max_train_steps) + 0.1 * self.lr 87 | for p in self.optimizer.param_groups: 88 | p['lr'] = lr_now 89 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/rainbow_dqn_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/rainbow_dqn_result.png -------------------------------------------------------------------------------- /3.Rainbow_DQN/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from collections import deque 4 | from sum_tree import SumTree 5 | 6 | 7 | class ReplayBuffer(object): 8 | def __init__(self, args): 9 | self.batch_size = args.batch_size 10 | self.buffer_capacity = args.buffer_capacity 11 | self.current_size = 0 12 | self.count = 0 13 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 14 | 'action': np.zeros((self.buffer_capacity, 1)), 15 | 'reward': np.zeros(self.buffer_capacity), 16 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 17 | 'terminal': np.zeros(self.buffer_capacity), 18 | } 19 | 20 | def store_transition(self, state, action, reward, next_state, terminal, done): 21 | self.buffer['state'][self.count] = state 22 | self.buffer['action'][self.count] = action 23 | self.buffer['reward'][self.count] = reward 24 | self.buffer['next_state'][self.count] = next_state 25 | self.buffer['terminal'][self.count] = terminal 26 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0. 
27 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 28 | 29 | def sample(self, total_steps): 30 | index = np.random.randint(0, self.current_size, size=self.batch_size) 31 | batch = {} 32 | for key in self.buffer.keys(): # numpy->tensor 33 | if key == 'action': 34 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.long) 35 | else: 36 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.float32) 37 | 38 | return batch, None, None 39 | 40 | 41 | class N_Steps_ReplayBuffer(object): 42 | def __init__(self, args): 43 | self.gamma = args.gamma 44 | self.batch_size = args.batch_size 45 | self.buffer_capacity = args.buffer_capacity 46 | self.current_size = 0 47 | self.count = 0 48 | self.n_steps = args.n_steps 49 | self.n_steps_deque = deque(maxlen=self.n_steps) 50 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 51 | 'action': np.zeros((self.buffer_capacity, 1)), 52 | 'reward': np.zeros(self.buffer_capacity), 53 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 54 | 'terminal': np.zeros(self.buffer_capacity), 55 | } 56 | 57 | def store_transition(self, state, action, reward, next_state, terminal, done): 58 | transition = (state, action, reward, next_state, terminal, done) 59 | self.n_steps_deque.append(transition) 60 | if len(self.n_steps_deque) == self.n_steps: 61 | state, action, n_steps_reward, next_state, terminal = self.get_n_steps_transition() 62 | self.buffer['state'][self.count] = state 63 | self.buffer['action'][self.count] = action 64 | self.buffer['reward'][self.count] = n_steps_reward 65 | self.buffer['next_state'][self.count] = next_state 66 | self.buffer['terminal'][self.count] = terminal 67 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0. 
68 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 69 | 70 | def get_n_steps_transition(self): 71 | state, action = self.n_steps_deque[0][:2] 72 | next_state, terminal = self.n_steps_deque[-1][3:5] 73 | n_steps_reward = 0 74 | for i in reversed(range(self.n_steps)): 75 | r, s_, ter, d = self.n_steps_deque[i][2:] 76 | n_steps_reward = r + self.gamma * (1 - d) * n_steps_reward 77 | if d: 78 | next_state, terminal = s_, ter 79 | 80 | return state, action, n_steps_reward, next_state, terminal 81 | 82 | def sample(self, total_steps): 83 | index = np.random.randint(0, self.current_size, size=self.batch_size) 84 | batch = {} 85 | for key in self.buffer.keys(): # numpy->tensor 86 | if key == 'action': 87 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.long) 88 | else: 89 | batch[key] = torch.tensor(self.buffer[key][index], dtype=torch.float32) 90 | 91 | return batch, None, None 92 | 93 | 94 | class Prioritized_ReplayBuffer(object): 95 | def __init__(self, args): 96 | self.max_train_steps = args.max_train_steps 97 | self.alpha = args.alpha 98 | self.beta_init = args.beta_init 99 | self.beta = args.beta_init 100 | self.batch_size = args.batch_size 101 | self.buffer_capacity = args.buffer_capacity 102 | self.sum_tree = SumTree(self.buffer_capacity) 103 | self.current_size = 0 104 | self.count = 0 105 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 106 | 'action': np.zeros((self.buffer_capacity, 1)), 107 | 'reward': np.zeros(self.buffer_capacity), 108 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 109 | 'terminal': np.zeros(self.buffer_capacity), 110 | } 111 | 112 | def store_transition(self, state, action, reward, next_state, terminal, done): 113 | self.buffer['state'][self.count] = state 114 | self.buffer['action'][self.count] = action 115 | self.buffer['reward'][self.count] = reward 116 | self.buffer['next_state'][self.count] = next_state 117 | self.buffer['terminal'][self.count] = terminal 118 | # 如果是第一条经验,初始化优先级为1.0;否则,对于新存入的经验,指定为当前最大的优先级 119 | priority = 1.0 if self.current_size == 0 else self.sum_tree.priority_max 120 | self.sum_tree.update(data_index=self.count, priority=priority) # 更新当前经验在sum_tree中的优先级 121 | self.count = (self.count + 1) % self.buffer_capacity # When the 'count' reaches buffer_capacity, it will be reset to 0. 
122 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 123 | 124 | def sample(self, total_steps): 125 | batch_index, IS_weight = self.sum_tree.get_batch_index(current_size=self.current_size, batch_size=self.batch_size, beta=self.beta) 126 | self.beta = self.beta_init + (1 - self.beta_init) * (total_steps / self.max_train_steps) # beta:beta_init->1.0 127 | batch = {} 128 | for key in self.buffer.keys(): # numpy->tensor 129 | if key == 'action': 130 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.long) 131 | else: 132 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.float32) 133 | 134 | return batch, batch_index, IS_weight 135 | 136 | def update_batch_priorities(self, batch_index, td_errors): # 根据传入的td_error,更新batch_index所对应数据的priorities 137 | priorities = (np.abs(td_errors) + 0.01) ** self.alpha 138 | for index, priority in zip(batch_index, priorities): 139 | self.sum_tree.update(data_index=index, priority=priority) 140 | 141 | 142 | class N_Steps_Prioritized_ReplayBuffer(object): 143 | def __init__(self, args): 144 | self.max_train_steps = args.max_train_steps 145 | self.alpha = args.alpha 146 | self.beta_init = args.beta_init 147 | self.beta = args.beta_init 148 | self.gamma = args.gamma 149 | self.batch_size = args.batch_size 150 | self.buffer_capacity = args.buffer_capacity 151 | self.sum_tree = SumTree(self.buffer_capacity) 152 | self.n_steps = args.n_steps 153 | self.n_steps_deque = deque(maxlen=self.n_steps) 154 | self.buffer = {'state': np.zeros((self.buffer_capacity, args.state_dim)), 155 | 'action': np.zeros((self.buffer_capacity, 1)), 156 | 'reward': np.zeros(self.buffer_capacity), 157 | 'next_state': np.zeros((self.buffer_capacity, args.state_dim)), 158 | 'terminal': np.zeros(self.buffer_capacity), 159 | } 160 | self.current_size = 0 161 | self.count = 0 162 | 163 | def store_transition(self, state, action, reward, next_state, terminal, done): 164 | transition = (state, action, reward, next_state, terminal, done) 165 | self.n_steps_deque.append(transition) 166 | if len(self.n_steps_deque) == self.n_steps: 167 | state, action, n_steps_reward, next_state, terminal = self.get_n_steps_transition() 168 | self.buffer['state'][self.count] = state 169 | self.buffer['action'][self.count] = action 170 | self.buffer['reward'][self.count] = n_steps_reward 171 | self.buffer['next_state'][self.count] = next_state 172 | self.buffer['terminal'][self.count] = terminal 173 | # 如果是buffer中的第一条经验,那么指定priority为1.0;否则对于新存入的经验,指定为当前最大的priority 174 | priority = 1.0 if self.current_size == 0 else self.sum_tree.priority_max 175 | self.sum_tree.update(data_index=self.count, priority=priority) # 更新当前经验在sum_tree中的优先级 176 | self.count = (self.count + 1) % self.buffer_capacity # When 'count' reaches buffer_capacity, it will be reset to 0. 
177 | self.current_size = min(self.current_size + 1, self.buffer_capacity) 178 | 179 | def sample(self, total_steps): 180 | batch_index, IS_weight = self.sum_tree.get_batch_index(current_size=self.current_size, batch_size=self.batch_size, beta=self.beta) 181 | self.beta = self.beta_init + (1 - self.beta_init) * (total_steps / self.max_train_steps) # beta:beta_init->1.0 182 | batch = {} 183 | for key in self.buffer.keys(): # numpy->tensor 184 | if key == 'action': 185 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.long) 186 | else: 187 | batch[key] = torch.tensor(self.buffer[key][batch_index], dtype=torch.float32) 188 | 189 | return batch, batch_index, IS_weight 190 | 191 | def get_n_steps_transition(self): 192 | state, action = self.n_steps_deque[0][:2] # 获取deque中第一个transition的s和a 193 | next_state, terminal = self.n_steps_deque[-1][3:5] # 获取deque中最后一个transition的s'和terminal 194 | n_steps_reward = 0 195 | for i in reversed(range(self.n_steps)): # 逆序计算n_steps_reward 196 | r, s_, ter, d = self.n_steps_deque[i][2:] 197 | n_steps_reward = r + self.gamma * (1 - d) * n_steps_reward 198 | if d: # 如果done=True,说明一个回合结束,保存deque中当前这个transition的s'和terminal作为这个n_steps_transition的next_state和terminal 199 | next_state, terminal = s_, ter 200 | 201 | return state, action, n_steps_reward, next_state, terminal 202 | 203 | def update_batch_priorities(self, batch_index, td_errors): # 根据传入的td_error,更新batch_index所对应数据的priorities 204 | priorities = (np.abs(td_errors) + 0.01) ** self.alpha 205 | for index, priority in zip(batch_index, priorities): 206 | self.sum_tree.update(data_index=index, priority=priority) 207 | -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494478.DESKTOP-LMKC0MO.1408.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658507126.DESKTOP-LMKC0MO.1408.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658520541.DESKTOP-LMKC0MO.1408.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494481.DESKTOP-LMKC0MO.9316.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658512436.DESKTOP-LMKC0MO.9316.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_Noisy_PER_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658531515.DESKTOP-LMKC0MO.9316.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494475.DESKTOP-LMKC0MO.5976.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511615.DESKTOP-LMKC0MO.5976.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Dueling_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658528978.DESKTOP-LMKC0MO.5976.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494473.DESKTOP-LMKC0MO.2144.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658511489.DESKTOP-LMKC0MO.2144.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Double_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658529336.DESKTOP-LMKC0MO.2144.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658494471.DESKTOP-LMKC0MO.9964.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658510515.DESKTOP-LMKC0MO.9964.1 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/DQN_Dueling_Noisy_PER_N_steps_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658526626.DESKTOP-LMKC0MO.9964.2 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_0/events.out.tfevents.1658479209.DESKTOP-LMKC0MO.1228.0 
-------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_10/events.out.tfevents.1658479212.DESKTOP-LMKC0MO.10500.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/3.Rainbow_DQN/runs/DQN/Rainbow_DQN_env_LunarLander-v2_number_1_seed_100/events.out.tfevents.1658479214.DESKTOP-LMKC0MO.9512.0 -------------------------------------------------------------------------------- /3.Rainbow_DQN/sum_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class SumTree(object): 5 | """ 6 | Story data with its priority in the tree. 7 | Tree structure and array storage: 8 | 9 | Tree index: 10 | 0 -> storing priority sum 11 | / \ 12 | 1 2 13 | / \ / \ 14 | 3 4 5 6 -> storing priority for transitions 15 | 16 | Array type for storing: 17 | [0,1,2,3,4,5,6] 18 | """ 19 | 20 | def __init__(self, buffer_capacity): 21 | self.buffer_capacity = buffer_capacity # buffer的容量 22 | self.tree_capacity = 2 * buffer_capacity - 1 # sum_tree的容量 23 | self.tree = np.zeros(self.tree_capacity) 24 | 25 | def update(self, data_index, priority): 26 | # data_index表示当前数据在buffer中的index 27 | # tree_index表示当前数据在sum_tree中的index 28 | tree_index = data_index + self.buffer_capacity - 1 # 把当前数据在buffer中的index转换为在sum_tree中的index 29 | change = priority - self.tree[tree_index] # 当前数据的priority的改变量 30 | self.tree[tree_index] = priority # 更新树的最后一层叶子节点的优先级 31 | # then propagate the change through the tree 32 | while tree_index != 0: # 更新上层节点的优先级,一直传播到最顶端 33 | tree_index = (tree_index - 1) // 2 34 | self.tree[tree_index] += change 35 | 36 | def get_index(self, v): 37 | parent_idx = 0 # 从树的顶端开始 38 | while True: 39 | child_left_idx = 2 * parent_idx + 1 # 父节点下方的左右两个子节点的index 40 | child_right_idx = child_left_idx + 1 41 | if child_left_idx >= self.tree_capacity: # reach bottom, end search 42 | tree_index = parent_idx # tree_index表示采样到的数据在sum_tree中的index 43 | break 44 | else: # downward search, always search for a higher priority node 45 | if v <= self.tree[child_left_idx]: 46 | parent_idx = child_left_idx 47 | else: 48 | v -= self.tree[child_left_idx] 49 | parent_idx = child_right_idx 50 | 51 | data_index = tree_index - self.buffer_capacity + 1 # tree_index->data_index 52 | return data_index, self.tree[tree_index] # 返回采样到的data在buffer中的index,以及相对应的priority 53 | 54 | def get_batch_index(self, current_size, batch_size, beta): 55 | batch_index = np.zeros(batch_size, dtype=np.long) 56 | IS_weight = torch.zeros(batch_size, dtype=torch.float32) 57 | segment = self.priority_sum / batch_size # 把[0,priority_sum]等分成batch_size个区间,在每个区间均匀采样一个数 58 | for i in range(batch_size): 59 | a = segment * i 60 | b = segment * (i + 1) 61 | v = np.random.uniform(a, b) 62 | index, priority = self.get_index(v) 63 | batch_index[i] = index 64 | prob = priority / 
self.priority_sum # 当前数据被采样的概率 65 | IS_weight[i] = (current_size * prob) ** (-beta) 66 | IS_weight /= IS_weight.max() # normalization 67 | 68 | return batch_index, IS_weight 69 | 70 | @property 71 | def priority_sum(self): 72 | return self.tree[0] # 树的顶端保存了所有priority之和 73 | 74 | @property 75 | def priority_max(self): 76 | return self.tree[self.buffer_capacity - 1:].max() # 树的最后一层叶节点,保存的才是每个数据对应的priority 77 | -------------------------------------------------------------------------------- /4.PPO-discrete/PPO_discrete_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.tensorboard import SummaryWriter 4 | import gym 5 | import argparse 6 | from normalization import Normalization, RewardScaling 7 | from replaybuffer import ReplayBuffer 8 | from ppo_discrete import PPO_discrete 9 | 10 | 11 | def evaluate_policy(args, env, agent, state_norm): 12 | times = 3 13 | evaluate_reward = 0 14 | for _ in range(times): 15 | s = env.reset() 16 | if args.use_state_norm: # During the evaluating,update=False 17 | s = state_norm(s, update=False) 18 | done = False 19 | episode_reward = 0 20 | while not done: 21 | a = agent.evaluate(s) # We use the deterministic policy during the evaluating 22 | s_, r, done, _ = env.step(a) 23 | if args.use_state_norm: 24 | s_ = state_norm(s_, update=False) 25 | episode_reward += r 26 | s = s_ 27 | evaluate_reward += episode_reward 28 | 29 | return evaluate_reward / times 30 | 31 | 32 | def main(args, env_name, number, seed): 33 | env = gym.make(env_name) 34 | env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment 35 | # Set random seed 36 | env.seed(seed) 37 | env.action_space.seed(seed) 38 | env_evaluate.seed(seed) 39 | env_evaluate.action_space.seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | args.state_dim = env.observation_space.shape[0] 44 | args.action_dim = env.action_space.n 45 | args.max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 46 | print("env={}".format(env_name)) 47 | print("state_dim={}".format(args.state_dim)) 48 | print("action_dim={}".format(args.action_dim)) 49 | print("max_episode_steps={}".format(args.max_episode_steps)) 50 | 51 | evaluate_num = 0 # Record the number of evaluations 52 | evaluate_rewards = [] # Record the rewards during the evaluating 53 | total_steps = 0 # Record the total steps during the training 54 | 55 | replay_buffer = ReplayBuffer(args) 56 | agent = PPO_discrete(args) 57 | 58 | # Build a tensorboard 59 | writer = SummaryWriter(log_dir='runs/PPO_discrete/env_{}_number_{}_seed_{}'.format(env_name, number, seed)) 60 | 61 | state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization 62 | if args.use_reward_norm: # Trick 3:reward normalization 63 | reward_norm = Normalization(shape=1) 64 | elif args.use_reward_scaling: # Trick 4:reward scaling 65 | reward_scaling = RewardScaling(shape=1, gamma=args.gamma) 66 | 67 | while total_steps < args.max_train_steps: 68 | s = env.reset() 69 | if args.use_state_norm: 70 | s = state_norm(s) 71 | if args.use_reward_scaling: 72 | reward_scaling.reset() 73 | episode_steps = 0 74 | done = False 75 | while not done: 76 | episode_steps += 1 77 | a, a_logprob = agent.choose_action(s) # Action and the corresponding log probability 78 | s_, r, done, _ = env.step(a) 79 | 80 | if args.use_state_norm: 81 | s_ = state_norm(s_) 82 | if args.use_reward_norm: 83 | r = reward_norm(r) 84 | elif 
args.use_reward_scaling: 85 | r = reward_scaling(r) 86 | 87 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 88 | # dw means dead or win,there is no next state s'; 89 | # but when reaching the max_episode_steps,there is a next state s' actually. 90 | if done and episode_steps != args.max_episode_steps: 91 | dw = True 92 | else: 93 | dw = False 94 | 95 | replay_buffer.store(s, a, a_logprob, r, s_, dw, done) 96 | s = s_ 97 | total_steps += 1 98 | 99 | # When the number of transitions in buffer reaches batch_size,then update 100 | if replay_buffer.count == args.batch_size: 101 | agent.update(replay_buffer, total_steps) 102 | replay_buffer.count = 0 103 | 104 | # Evaluate the policy every 'evaluate_freq' steps 105 | if total_steps % args.evaluate_freq == 0: 106 | evaluate_num += 1 107 | evaluate_reward = evaluate_policy(args, env_evaluate, agent, state_norm) 108 | evaluate_rewards.append(evaluate_reward) 109 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 110 | writer.add_scalar('step_rewards_{}'.format(env_name), evaluate_rewards[-1], global_step=total_steps) 111 | # Save the rewards 112 | if evaluate_num % args.save_freq == 0: 113 | np.save('./data_train/PPO_discrete_env_{}_number_{}_seed_{}.npy'.format(env_name, number, seed), np.array(evaluate_rewards)) 114 | 115 | 116 | if __name__ == '__main__': 117 | parser = argparse.ArgumentParser("Hyperparameter Setting for PPO-discrete") 118 | parser.add_argument("--max_train_steps", type=int, default=int(2e5), help=" Maximum number of training steps") 119 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps") 120 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency") 121 | parser.add_argument("--batch_size", type=int, default=2048, help="Batch size") 122 | parser.add_argument("--mini_batch_size", type=int, default=64, help="Minibatch size") 123 | parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network") 124 | parser.add_argument("--lr_a", type=float, default=3e-4, help="Learning rate of actor") 125 | parser.add_argument("--lr_c", type=float, default=3e-4, help="Learning rate of critic") 126 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 127 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter") 128 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter") 129 | parser.add_argument("--K_epochs", type=int, default=10, help="PPO parameter") 130 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization") 131 | parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2:state normalization") 132 | parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3:reward normalization") 133 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling") 134 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy") 135 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay") 136 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip") 137 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization") 138 | 
parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5") 139 | parser.add_argument("--use_tanh", type=float, default=True, help="Trick 10: tanh activation function") 140 | 141 | args = parser.parse_args() 142 | 143 | env_name = ['CartPole-v1', 'LunarLander-v2'] 144 | env_index = 1 145 | main(args, env_name=env_name[env_index], number=1, seed=0) 146 | -------------------------------------------------------------------------------- /4.PPO-discrete/README.md: -------------------------------------------------------------------------------- 1 | # PPO-discrete 2 | This is a concise Pytorch implementation of PPO on discrete action space with 10 tricks.
3 | 4 | ## 10 tricks 5 | Trick 1: Advantage Normalization (see the sketch after this list).
6 | Trick 2: State Normalization.
7 | Trick 3 & Trick 4: Reward Normalization & Reward Scaling.
8 | Trick 5: Policy Entropy.
9 | Trick 6: Learning Rate Decay.
10 | Trick 7: Gradient Clip.
11 | Trick 8: Orthogonal Initialization.
12 | Trick 9: Adam Optimizer Epsilon Parameter.
13 | Trick 10: Tanh Activation Function.
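A minimal sketch of Trick 1, assuming a tensor `adv` of GAE advantages; `normalize_advantage` is an illustrative helper name, and the actual one-liner lives in `ppo_discrete.py`:

```python
import torch

def normalize_advantage(adv: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Standardize the advantages within each update before building the PPO surrogate loss.
    return (adv - adv.mean()) / (adv.std() + eps)
```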
14 | 15 | ## How to use my code? 16 | You can directly run 'PPO_discrete_main.py' in your own IDE.
17 | 18 | ## Training environments 19 | You can set 'env_index' in the code to change the environment. Here, we train in 2 environments, as shown below.
20 | env_index=0 selects 'CartPole-v1'
21 | env_index=1 selects 'LunarLander-v2'
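For example, to train on 'CartPole-v1' instead, change the index at the bottom of 'PPO_discrete_main.py' (this mirrors the existing `__main__` block of that script):

```python
env_name = ['CartPole-v1', 'LunarLander-v2']
env_index = 0  # 0 -> 'CartPole-v1', 1 -> 'LunarLander-v2'
main(args, env_name=env_name[env_index], number=1, seed=0)
```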
22 | 23 | ## Training result 24 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/4.PPO-discrete/training_result.png) 25 | 26 | ## Tutorial 27 | If you can read Chinese, you can get more information from this blog.https://zhuanlan.zhihu.com/p/512327050 28 | -------------------------------------------------------------------------------- /4.PPO-discrete/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /4.PPO-discrete/__pycache__/ppo_discrete.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/ppo_discrete.cpython-37.pyc -------------------------------------------------------------------------------- /4.PPO-discrete/__pycache__/replaybuffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/__pycache__/replaybuffer.cpython-37.pyc -------------------------------------------------------------------------------- /4.PPO-discrete/data_train/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to store the training reward data. -------------------------------------------------------------------------------- /4.PPO-discrete/normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd: 5 | # Dynamically calculate mean and std 6 | def __init__(self, shape): # shape:the dimension of input data 7 | self.n = 0 8 | self.mean = np.zeros(shape) 9 | self.S = np.zeros(shape) 10 | self.std = np.sqrt(self.S) 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.n += 1 15 | if self.n == 1: 16 | self.mean = x 17 | self.std = x 18 | else: 19 | old_mean = self.mean.copy() 20 | self.mean = old_mean + (x - old_mean) / self.n 21 | self.S = self.S + (x - old_mean) * (x - self.mean) 22 | self.std = np.sqrt(self.S / self.n) 23 | 24 | 25 | class Normalization: 26 | def __init__(self, shape): 27 | self.running_ms = RunningMeanStd(shape=shape) 28 | 29 | def __call__(self, x, update=True): 30 | # Whether to update the mean and std,during the evaluating,update=False 31 | if update: 32 | self.running_ms.update(x) 33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8) 34 | 35 | return x 36 | 37 | 38 | class RewardScaling: 39 | def __init__(self, shape, gamma): 40 | self.shape = shape # reward shape=1 41 | self.gamma = gamma # discount factor 42 | self.running_ms = RunningMeanStd(shape=self.shape) 43 | self.R = np.zeros(self.shape) 44 | 45 | def __call__(self, x): 46 | self.R = self.gamma * self.R + x 47 | self.running_ms.update(self.R) 48 | x = x / (self.running_ms.std + 1e-8) # Only divided std 49 | return x 50 | 51 | def reset(self): # When an episode is done,we should reset 'self.R' 52 | self.R = np.zeros(self.shape) 53 | -------------------------------------------------------------------------------- /4.PPO-discrete/ppo_discrete.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 6 | from torch.distributions import Categorical 7 | 8 | 9 | # Trick 8: orthogonal initialization 10 | def orthogonal_init(layer, gain=1.0): 11 | nn.init.orthogonal_(layer.weight, gain=gain) 12 | nn.init.constant_(layer.bias, 0) 13 | 14 | 15 | class Actor(nn.Module): 16 | def __init__(self, args): 17 | super(Actor, self).__init__() 18 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 19 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 20 | self.fc3 = nn.Linear(args.hidden_width, args.action_dim) 21 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 22 | 23 | if args.use_orthogonal_init: 24 | print("------use_orthogonal_init------") 25 | orthogonal_init(self.fc1) 26 | orthogonal_init(self.fc2) 27 | orthogonal_init(self.fc3, gain=0.01) 28 | 29 | def forward(self, s): 30 | s = self.activate_func(self.fc1(s)) 31 | s = self.activate_func(self.fc2(s)) 32 | a_prob = torch.softmax(self.fc3(s), dim=1) 33 | return a_prob 34 | 35 | 36 | class Critic(nn.Module): 37 | def __init__(self, args): 38 | super(Critic, self).__init__() 39 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 40 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 41 | self.fc3 = nn.Linear(args.hidden_width, 1) 42 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 43 | 44 | if args.use_orthogonal_init: 45 | print("------use_orthogonal_init------") 46 | orthogonal_init(self.fc1) 47 | orthogonal_init(self.fc2) 48 | orthogonal_init(self.fc3) 49 | 50 | def forward(self, s): 51 | s = self.activate_func(self.fc1(s)) 52 | s = self.activate_func(self.fc2(s)) 53 | v_s = self.fc3(s) 54 | return v_s 55 | 56 | 57 | class PPO_discrete: 58 | def __init__(self, args): 59 | self.batch_size = args.batch_size 60 | self.mini_batch_size = args.mini_batch_size 61 | self.max_train_steps = args.max_train_steps 62 | self.lr_a = args.lr_a # Learning rate of actor 63 | self.lr_c = args.lr_c # Learning rate of critic 64 | self.gamma = args.gamma # Discount factor 65 | self.lamda = args.lamda # GAE parameter 66 | self.epsilon = args.epsilon # PPO clip parameter 67 | self.K_epochs = args.K_epochs # PPO parameter 68 | self.entropy_coef = args.entropy_coef # Entropy coefficient 69 | self.set_adam_eps = args.set_adam_eps 70 | self.use_grad_clip = args.use_grad_clip 71 | self.use_lr_decay = args.use_lr_decay 72 | self.use_adv_norm = args.use_adv_norm 73 | 74 | self.actor = Actor(args) 75 | self.critic = Critic(args) 76 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5 77 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a, eps=1e-5) 78 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c, eps=1e-5) 79 | else: 80 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a) 81 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c) 82 | 83 | def evaluate(self, s): # When evaluating the policy, we select the action with the highest probability 84 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 85 | a_prob = self.actor(s).detach().numpy().flatten() 86 | a = np.argmax(a_prob) 87 | return a 88 | 89 | def choose_action(self, s): 90 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 91 | with 
torch.no_grad(): 92 | dist = Categorical(probs=self.actor(s)) 93 | a = dist.sample() 94 | a_logprob = dist.log_prob(a) 95 | return a.numpy()[0], a_logprob.numpy()[0] 96 | 97 | def update(self, replay_buffer, total_steps): 98 | s, a, a_logprob, r, s_, dw, done = replay_buffer.numpy_to_tensor() # Get training data 99 | """ 100 | Calculate the advantage using GAE 101 | 'dw=True' means dead or win, there is no next state s' 102 | 'done=True' represents the terminal of an episode(dead or win or reaching the max_episode_steps). When calculating the adv, if done=True, gae=0 103 | """ 104 | adv = [] 105 | gae = 0 106 | with torch.no_grad(): # adv and v_target have no gradient 107 | vs = self.critic(s) 108 | vs_ = self.critic(s_) 109 | deltas = r + self.gamma * (1.0 - dw) * vs_ - vs 110 | for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())): 111 | gae = delta + self.gamma * self.lamda * gae * (1.0 - d) 112 | adv.insert(0, gae) 113 | adv = torch.tensor(adv, dtype=torch.float).view(-1, 1) 114 | v_target = adv + vs 115 | if self.use_adv_norm: # Trick 1:advantage normalization 116 | adv = ((adv - adv.mean()) / (adv.std() + 1e-5)) 117 | 118 | # Optimize policy for K epochs: 119 | for _ in range(self.K_epochs): 120 | # Random sampling and no repetition. 'False' indicates that training will continue even if the number of samples in the last time is less than mini_batch_size 121 | for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.mini_batch_size, False): 122 | dist_now = Categorical(probs=self.actor(s[index])) 123 | dist_entropy = dist_now.entropy().view(-1, 1) # shape(mini_batch_size X 1) 124 | a_logprob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1) # shape(mini_batch_size X 1) 125 | # a/b=exp(log(a)-log(b)) 126 | ratios = torch.exp(a_logprob_now - a_logprob[index]) # shape(mini_batch_size X 1) 127 | 128 | surr1 = ratios * adv[index] # Only calculate the gradient of 'a_logprob_now' in ratios 129 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index] 130 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # shape(mini_batch_size X 1) 131 | # Update actor 132 | self.optimizer_actor.zero_grad() 133 | actor_loss.mean().backward() 134 | if self.use_grad_clip: # Trick 7: Gradient clip 135 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) 136 | self.optimizer_actor.step() 137 | 138 | v_s = self.critic(s[index]) 139 | critic_loss = F.mse_loss(v_target[index], v_s) 140 | # Update critic 141 | self.optimizer_critic.zero_grad() 142 | critic_loss.backward() 143 | if self.use_grad_clip: # Trick 7: Gradient clip 144 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 145 | self.optimizer_critic.step() 146 | 147 | if self.use_lr_decay: # Trick 6:learning rate Decay 148 | self.lr_decay(total_steps) 149 | 150 | def lr_decay(self, total_steps): 151 | lr_a_now = self.lr_a * (1 - total_steps / self.max_train_steps) 152 | lr_c_now = self.lr_c * (1 - total_steps / self.max_train_steps) 153 | for p in self.optimizer_actor.param_groups: 154 | p['lr'] = lr_a_now 155 | for p in self.optimizer_critic.param_groups: 156 | p['lr'] = lr_c_now 157 | -------------------------------------------------------------------------------- /4.PPO-discrete/replaybuffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ReplayBuffer: 6 | def __init__(self, args): 7 | self.s = np.zeros((args.batch_size, 
args.state_dim)) 8 | self.a = np.zeros((args.batch_size, 1)) 9 | self.a_logprob = np.zeros((args.batch_size, 1)) 10 | self.r = np.zeros((args.batch_size, 1)) 11 | self.s_ = np.zeros((args.batch_size, args.state_dim)) 12 | self.dw = np.zeros((args.batch_size, 1)) 13 | self.done = np.zeros((args.batch_size, 1)) 14 | self.count = 0 15 | 16 | def store(self, s, a, a_logprob, r, s_, dw, done): 17 | self.s[self.count] = s 18 | self.a[self.count] = a 19 | self.a_logprob[self.count] = a_logprob 20 | self.r[self.count] = r 21 | self.s_[self.count] = s_ 22 | self.dw[self.count] = dw 23 | self.done[self.count] = done 24 | self.count += 1 25 | 26 | def numpy_to_tensor(self): 27 | s = torch.tensor(self.s, dtype=torch.float) 28 | a = torch.tensor(self.a, dtype=torch.long) # In discrete action space, 'a' needs to be torch.long 29 | a_logprob = torch.tensor(self.a_logprob, dtype=torch.float) 30 | r = torch.tensor(self.r, dtype=torch.float) 31 | s_ = torch.tensor(self.s_, dtype=torch.float) 32 | dw = torch.tensor(self.dw, dtype=torch.float) 33 | done = torch.tensor(self.done, dtype=torch.float) 34 | 35 | return s, a, a_logprob, r, s_, dw, done 36 | -------------------------------------------------------------------------------- /4.PPO-discrete/runs/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to save the tensorboard data. -------------------------------------------------------------------------------- /4.PPO-discrete/training_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/4.PPO-discrete/training_result.png -------------------------------------------------------------------------------- /5.PPO-continuous/PPO_continuous_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.tensorboard import SummaryWriter 4 | import gym 5 | import argparse 6 | from normalization import Normalization, RewardScaling 7 | from replaybuffer import ReplayBuffer 8 | from ppo_continuous import PPO_continuous 9 | 10 | 11 | def evaluate_policy(args, env, agent, state_norm): 12 | times = 3 13 | evaluate_reward = 0 14 | for _ in range(times): 15 | s = env.reset() 16 | if args.use_state_norm: 17 | s = state_norm(s, update=False) # During the evaluating,update=False 18 | done = False 19 | episode_reward = 0 20 | while not done: 21 | a = agent.evaluate(s) # We use the deterministic policy during the evaluating 22 | if args.policy_dist == "Beta": 23 | action = 2 * (a - 0.5) * args.max_action # [0,1]->[-max,max] 24 | else: 25 | action = a 26 | s_, r, done, _ = env.step(action) 27 | if args.use_state_norm: 28 | s_ = state_norm(s_, update=False) 29 | episode_reward += r 30 | s = s_ 31 | evaluate_reward += episode_reward 32 | 33 | return evaluate_reward / times 34 | 35 | 36 | def main(args, env_name, number, seed): 37 | env = gym.make(env_name) 38 | env_evaluate = gym.make(env_name) # When evaluating the policy, we need to rebuild an environment 39 | # Set random seed 40 | env.seed(seed) 41 | env.action_space.seed(seed) 42 | env_evaluate.seed(seed) 43 | env_evaluate.action_space.seed(seed) 44 | np.random.seed(seed) 45 | torch.manual_seed(seed) 46 | 47 | args.state_dim = env.observation_space.shape[0] 48 | args.action_dim = env.action_space.shape[0] 49 | args.max_action = float(env.action_space.high[0]) 50 | args.max_episode_steps = 
env._max_episode_steps # Maximum number of steps per episode 51 | print("env={}".format(env_name)) 52 | print("state_dim={}".format(args.state_dim)) 53 | print("action_dim={}".format(args.action_dim)) 54 | print("max_action={}".format(args.max_action)) 55 | print("max_episode_steps={}".format(args.max_episode_steps)) 56 | 57 | evaluate_num = 0 # Record the number of evaluations 58 | evaluate_rewards = [] # Record the rewards during the evaluating 59 | total_steps = 0 # Record the total steps during the training 60 | 61 | replay_buffer = ReplayBuffer(args) 62 | agent = PPO_continuous(args) 63 | 64 | # Build a tensorboard 65 | writer = SummaryWriter(log_dir='runs/PPO_continuous/env_{}_{}_number_{}_seed_{}'.format(env_name, args.policy_dist, number, seed)) 66 | 67 | state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization 68 | if args.use_reward_norm: # Trick 3:reward normalization 69 | reward_norm = Normalization(shape=1) 70 | elif args.use_reward_scaling: # Trick 4:reward scaling 71 | reward_scaling = RewardScaling(shape=1, gamma=args.gamma) 72 | 73 | while total_steps < args.max_train_steps: 74 | s = env.reset() 75 | if args.use_state_norm: 76 | s = state_norm(s) 77 | if args.use_reward_scaling: 78 | reward_scaling.reset() 79 | episode_steps = 0 80 | done = False 81 | while not done: 82 | episode_steps += 1 83 | a, a_logprob = agent.choose_action(s) # Action and the corresponding log probability 84 | if args.policy_dist == "Beta": 85 | action = 2 * (a - 0.5) * args.max_action # [0,1]->[-max,max] 86 | else: 87 | action = a 88 | s_, r, done, _ = env.step(action) 89 | 90 | if args.use_state_norm: 91 | s_ = state_norm(s_) 92 | if args.use_reward_norm: 93 | r = reward_norm(r) 94 | elif args.use_reward_scaling: 95 | r = reward_scaling(r) 96 | 97 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 98 | # dw means dead or win,there is no next state s'; 99 | # but when reaching the max_episode_steps,there is a next state s' actually. 
100 | if done and episode_steps != args.max_episode_steps: 101 | dw = True 102 | else: 103 | dw = False 104 | 105 | # Take the 'action',but store the original 'a'(especially for Beta) 106 | replay_buffer.store(s, a, a_logprob, r, s_, dw, done) 107 | s = s_ 108 | total_steps += 1 109 | 110 | # When the number of transitions in buffer reaches batch_size,then update 111 | if replay_buffer.count == args.batch_size: 112 | agent.update(replay_buffer, total_steps) 113 | replay_buffer.count = 0 114 | 115 | # Evaluate the policy every 'evaluate_freq' steps 116 | if total_steps % args.evaluate_freq == 0: 117 | evaluate_num += 1 118 | evaluate_reward = evaluate_policy(args, env_evaluate, agent, state_norm) 119 | evaluate_rewards.append(evaluate_reward) 120 | print("evaluate_num:{} \t evaluate_reward:{} \t".format(evaluate_num, evaluate_reward)) 121 | writer.add_scalar('step_rewards_{}'.format(env_name), evaluate_rewards[-1], global_step=total_steps) 122 | # Save the rewards 123 | if evaluate_num % args.save_freq == 0: 124 | np.save('./data_train/PPO_continuous_{}_env_{}_number_{}_seed_{}.npy'.format(args.policy_dist, env_name, number, seed), np.array(evaluate_rewards)) 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = argparse.ArgumentParser("Hyperparameters Setting for PPO-continuous") 129 | parser.add_argument("--max_train_steps", type=int, default=int(3e6), help=" Maximum number of training steps") 130 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps") 131 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency") 132 | parser.add_argument("--policy_dist", type=str, default="Gaussian", help="Beta or Gaussian") 133 | parser.add_argument("--batch_size", type=int, default=2048, help="Batch size") 134 | parser.add_argument("--mini_batch_size", type=int, default=64, help="Minibatch size") 135 | parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network") 136 | parser.add_argument("--lr_a", type=float, default=3e-4, help="Learning rate of actor") 137 | parser.add_argument("--lr_c", type=float, default=3e-4, help="Learning rate of critic") 138 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 139 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter") 140 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter") 141 | parser.add_argument("--K_epochs", type=int, default=10, help="PPO parameter") 142 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization") 143 | parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2:state normalization") 144 | parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3:reward normalization") 145 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling") 146 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy") 147 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay") 148 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip") 149 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization") 150 | parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5") 151 | 
parser.add_argument("--use_tanh", type=float, default=True, help="Trick 10: tanh activation function") 152 | 153 | args = parser.parse_args() 154 | 155 | env_name = ['BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 156 | env_index = 1 157 | main(args, env_name=env_name[env_index], number=1, seed=10) 158 | -------------------------------------------------------------------------------- /5.PPO-continuous/README.md: -------------------------------------------------------------------------------- 1 | # PPO-continuous 2 | This is a concise Pytorch implementation of PPO on continuous action space with 10 tricks.
3 | 4 | ## 10 tricks 5 | Trick 1—Advantage Normalization.
6 | Trick 2—State Normalization.
7 | Trick 3 & Trick 4—Reward Normalization & Reward Scaling.<br />
8 | Trick 5—Policy Entropy.
9 | Trick 6—Learning Rate Decay.
10 | Trick 7—Gradient clip.
11 | Trick 8—Orthogonal Initialization.
12 | Trick 9—Adam Optimizer Epsilon Parameter.
13 | Trick 10—Tanh Activation Function.<br />
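Most of these tricks are only a few lines of code. For example, a minimal sketch of Trick 1 (advantage normalization) and Trick 8 (orthogonal initialization), mirroring the helpers used in 'ppo_continuous.py' (function names here are illustrative), is:

```python
import torch
import torch.nn as nn

def orthogonal_init(layer, gain=1.0):
    # Trick 8: orthogonal weight initialization with zero bias
    nn.init.orthogonal_(layer.weight, gain=gain)
    nn.init.constant_(layer.bias, 0)

def normalize_advantage(adv, eps=1e-5):
    # Trick 1: zero-mean / unit-std advantage normalization within a batch
    return (adv - adv.mean()) / (adv.std() + eps)
```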
14 | 15 | ## How to use my code? 16 | You can directly run 'PPO_continuous_main.py' in your own IDE.<br />
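Note that the script supports two policy distributions, selected with '--policy_dist' ("Beta" or "Gaussian"). With the Beta policy, actions are sampled in [0, 1] and rescaled to the environment's action range before 'env.step()'; a small illustrative helper (the function name is ours, the formula is the one used in 'PPO_continuous_main.py') is:

```python
def beta_action_to_env(a, max_action):
    # Map a Beta-distributed action from [0, 1] to [-max_action, max_action]
    return 2 * (a - 0.5) * max_action
```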
17 | 18 | ## Training environments 19 | You can set the 'env_index' in the code to change the environments. Here, we train our code in 4 environments.<br />
20 | env_index=0 represents 'BipedalWalker-v3'<br />
21 | env_index=1 represents 'HalfCheetah-v2'<br />
22 | env_index=2 represents 'Hopper-v2'<br />
23 | env_index=3 represents 'Walker2d-v2'<br />
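For example, to train on 'Hopper-v2' you would change 'env_index' at the bottom of 'PPO_continuous_main.py' (an illustrative snippet mirroring the script):

```python
import gym

env_name = ['BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
env_index = 2                         # 2 -> 'Hopper-v2'
env = gym.make(env_name[env_index])   # the script also builds a separate environment for evaluation
```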
24 | 25 | ## Trainning result 26 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/5.PPO-continuous/training_result.png) 27 | 28 | ## Tutorial 29 | If you can read Chinese, you can get more information from this blog.https://zhuanlan.zhihu.com/p/512327050 30 | -------------------------------------------------------------------------------- /5.PPO-continuous/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /5.PPO-continuous/__pycache__/ppo_continuous.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/ppo_continuous.cpython-37.pyc -------------------------------------------------------------------------------- /5.PPO-continuous/__pycache__/replaybuffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/__pycache__/replaybuffer.cpython-37.pyc -------------------------------------------------------------------------------- /5.PPO-continuous/data_train/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to store the training reward data. -------------------------------------------------------------------------------- /5.PPO-continuous/normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd: 5 | # Dynamically calculate mean and std 6 | def __init__(self, shape): # shape:the dimension of input data 7 | self.n = 0 8 | self.mean = np.zeros(shape) 9 | self.S = np.zeros(shape) 10 | self.std = np.sqrt(self.S) 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.n += 1 15 | if self.n == 1: 16 | self.mean = x 17 | self.std = x 18 | else: 19 | old_mean = self.mean.copy() 20 | self.mean = old_mean + (x - old_mean) / self.n 21 | self.S = self.S + (x - old_mean) * (x - self.mean) 22 | self.std = np.sqrt(self.S / self.n) 23 | 24 | 25 | class Normalization: 26 | def __init__(self, shape): 27 | self.running_ms = RunningMeanStd(shape=shape) 28 | 29 | def __call__(self, x, update=True): 30 | # Whether to update the mean and std,during the evaluating,update=False 31 | if update: 32 | self.running_ms.update(x) 33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8) 34 | 35 | return x 36 | 37 | 38 | class RewardScaling: 39 | def __init__(self, shape, gamma): 40 | self.shape = shape # reward shape=1 41 | self.gamma = gamma # discount factor 42 | self.running_ms = RunningMeanStd(shape=self.shape) 43 | self.R = np.zeros(self.shape) 44 | 45 | def __call__(self, x): 46 | self.R = self.gamma * self.R + x 47 | self.running_ms.update(self.R) 48 | x = x / (self.running_ms.std + 1e-8) # Only divided std 49 | return x 50 | 51 | def reset(self): # When an episode is done,we should reset 'self.R' 52 | self.R = np.zeros(self.shape) 53 | -------------------------------------------------------------------------------- /5.PPO-continuous/ppo_continuous.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 4 | import torch.nn as nn 5 | from torch.distributions import Beta, Normal 6 | 7 | 8 | # Trick 8: orthogonal initialization 9 | def orthogonal_init(layer, gain=1.0): 10 | nn.init.orthogonal_(layer.weight, gain=gain) 11 | nn.init.constant_(layer.bias, 0) 12 | 13 | 14 | class Actor_Beta(nn.Module): 15 | def __init__(self, args): 16 | super(Actor_Beta, self).__init__() 17 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 18 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 19 | self.alpha_layer = nn.Linear(args.hidden_width, args.action_dim) 20 | self.beta_layer = nn.Linear(args.hidden_width, args.action_dim) 21 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 22 | 23 | if args.use_orthogonal_init: 24 | print("------use_orthogonal_init------") 25 | orthogonal_init(self.fc1) 26 | orthogonal_init(self.fc2) 27 | orthogonal_init(self.alpha_layer, gain=0.01) 28 | orthogonal_init(self.beta_layer, gain=0.01) 29 | 30 | def forward(self, s): 31 | s = self.activate_func(self.fc1(s)) 32 | s = self.activate_func(self.fc2(s)) 33 | # alpha and beta need to be larger than 1,so we use 'softplus' as the activation function and then plus 1 34 | alpha = F.softplus(self.alpha_layer(s)) + 1.0 35 | beta = F.softplus(self.beta_layer(s)) + 1.0 36 | return alpha, beta 37 | 38 | def get_dist(self, s): 39 | alpha, beta = self.forward(s) 40 | dist = Beta(alpha, beta) 41 | return dist 42 | 43 | def mean(self, s): 44 | alpha, beta = self.forward(s) 45 | mean = alpha / (alpha + beta) # The mean of the beta distribution 46 | return mean 47 | 48 | 49 | class Actor_Gaussian(nn.Module): 50 | def __init__(self, args): 51 | super(Actor_Gaussian, self).__init__() 52 | self.max_action = args.max_action 53 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 54 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 55 | self.mean_layer = nn.Linear(args.hidden_width, args.action_dim) 56 | self.log_std = nn.Parameter(torch.zeros(1, args.action_dim)) # We use 'nn.Parameter' to train log_std automatically 57 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 58 | 59 | if args.use_orthogonal_init: 60 | print("------use_orthogonal_init------") 61 | orthogonal_init(self.fc1) 62 | orthogonal_init(self.fc2) 63 | orthogonal_init(self.mean_layer, gain=0.01) 64 | 65 | def forward(self, s): 66 | s = self.activate_func(self.fc1(s)) 67 | s = self.activate_func(self.fc2(s)) 68 | mean = self.max_action * torch.tanh(self.mean_layer(s)) # [-1,1]->[-max_action,max_action] 69 | return mean 70 | 71 | def get_dist(self, s): 72 | mean = self.forward(s) 73 | log_std = self.log_std.expand_as(mean) # To make 'log_std' have the same dimension as 'mean' 74 | std = torch.exp(log_std) # The reason we train the 'log_std' is to ensure std=exp(log_std)>0 75 | dist = Normal(mean, std) # Get the Gaussian distribution 76 | return dist 77 | 78 | 79 | class Critic(nn.Module): 80 | def __init__(self, args): 81 | super(Critic, self).__init__() 82 | self.fc1 = nn.Linear(args.state_dim, args.hidden_width) 83 | self.fc2 = nn.Linear(args.hidden_width, args.hidden_width) 84 | self.fc3 = nn.Linear(args.hidden_width, 1) 85 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 86 | 87 | if args.use_orthogonal_init: 88 | 
print("------use_orthogonal_init------") 89 | orthogonal_init(self.fc1) 90 | orthogonal_init(self.fc2) 91 | orthogonal_init(self.fc3) 92 | 93 | def forward(self, s): 94 | s = self.activate_func(self.fc1(s)) 95 | s = self.activate_func(self.fc2(s)) 96 | v_s = self.fc3(s) 97 | return v_s 98 | 99 | 100 | class PPO_continuous(): 101 | def __init__(self, args): 102 | self.policy_dist = args.policy_dist 103 | self.max_action = args.max_action 104 | self.batch_size = args.batch_size 105 | self.mini_batch_size = args.mini_batch_size 106 | self.max_train_steps = args.max_train_steps 107 | self.lr_a = args.lr_a # Learning rate of actor 108 | self.lr_c = args.lr_c # Learning rate of critic 109 | self.gamma = args.gamma # Discount factor 110 | self.lamda = args.lamda # GAE parameter 111 | self.epsilon = args.epsilon # PPO clip parameter 112 | self.K_epochs = args.K_epochs # PPO parameter 113 | self.entropy_coef = args.entropy_coef # Entropy coefficient 114 | self.set_adam_eps = args.set_adam_eps 115 | self.use_grad_clip = args.use_grad_clip 116 | self.use_lr_decay = args.use_lr_decay 117 | self.use_adv_norm = args.use_adv_norm 118 | 119 | if self.policy_dist == "Beta": 120 | self.actor = Actor_Beta(args) 121 | else: 122 | self.actor = Actor_Gaussian(args) 123 | self.critic = Critic(args) 124 | 125 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5 126 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a, eps=1e-5) 127 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c, eps=1e-5) 128 | else: 129 | self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_a) 130 | self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_c) 131 | 132 | def evaluate(self, s): # When evaluating the policy, we only use the mean 133 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 134 | if self.policy_dist == "Beta": 135 | a = self.actor.mean(s).detach().numpy().flatten() 136 | else: 137 | a = self.actor(s).detach().numpy().flatten() 138 | return a 139 | 140 | def choose_action(self, s): 141 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 142 | if self.policy_dist == "Beta": 143 | with torch.no_grad(): 144 | dist = self.actor.get_dist(s) 145 | a = dist.sample() # Sample the action according to the probability distribution 146 | a_logprob = dist.log_prob(a) # The log probability density of the action 147 | else: 148 | with torch.no_grad(): 149 | dist = self.actor.get_dist(s) 150 | a = dist.sample() # Sample the action according to the probability distribution 151 | a = torch.clamp(a, -self.max_action, self.max_action) # [-max,max] 152 | a_logprob = dist.log_prob(a) # The log probability density of the action 153 | return a.numpy().flatten(), a_logprob.numpy().flatten() 154 | 155 | def update(self, replay_buffer, total_steps): 156 | s, a, a_logprob, r, s_, dw, done = replay_buffer.numpy_to_tensor() # Get training data 157 | """ 158 | Calculate the advantage using GAE 159 | 'dw=True' means dead or win, there is no next state s' 160 | 'done=True' represents the terminal of an episode(dead or win or reaching the max_episode_steps). 
When calculating the adv, if done=True, gae=0 161 | """ 162 | adv = [] 163 | gae = 0 164 | with torch.no_grad(): # adv and v_target have no gradient 165 | vs = self.critic(s) 166 | vs_ = self.critic(s_) 167 | deltas = r + self.gamma * (1.0 - dw) * vs_ - vs 168 | for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())): 169 | gae = delta + self.gamma * self.lamda * gae * (1.0 - d) 170 | adv.insert(0, gae) 171 | adv = torch.tensor(adv, dtype=torch.float).view(-1, 1) 172 | v_target = adv + vs 173 | if self.use_adv_norm: # Trick 1:advantage normalization 174 | adv = ((adv - adv.mean()) / (adv.std() + 1e-5)) 175 | 176 | # Optimize policy for K epochs: 177 | for _ in range(self.K_epochs): 178 | # Random sampling and no repetition. 'False' indicates that training will continue even if the number of samples in the last time is less than mini_batch_size 179 | for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.mini_batch_size, False): 180 | dist_now = self.actor.get_dist(s[index]) 181 | dist_entropy = dist_now.entropy().sum(1, keepdim=True) # shape(mini_batch_size X 1) 182 | a_logprob_now = dist_now.log_prob(a[index]) 183 | # a/b=exp(log(a)-log(b)) In multi-dimensional continuous action space,we need to sum up the log_prob 184 | ratios = torch.exp(a_logprob_now.sum(1, keepdim=True) - a_logprob[index].sum(1, keepdim=True)) # shape(mini_batch_size X 1) 185 | 186 | surr1 = ratios * adv[index] # Only calculate the gradient of 'a_logprob_now' in ratios 187 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index] 188 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # Trick 5: policy entropy 189 | # Update actor 190 | self.optimizer_actor.zero_grad() 191 | actor_loss.mean().backward() 192 | if self.use_grad_clip: # Trick 7: Gradient clip 193 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) 194 | self.optimizer_actor.step() 195 | 196 | v_s = self.critic(s[index]) 197 | critic_loss = F.mse_loss(v_target[index], v_s) 198 | # Update critic 199 | self.optimizer_critic.zero_grad() 200 | critic_loss.backward() 201 | if self.use_grad_clip: # Trick 7: Gradient clip 202 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 203 | self.optimizer_critic.step() 204 | 205 | if self.use_lr_decay: # Trick 6:learning rate Decay 206 | self.lr_decay(total_steps) 207 | 208 | def lr_decay(self, total_steps): 209 | lr_a_now = self.lr_a * (1 - total_steps / self.max_train_steps) 210 | lr_c_now = self.lr_c * (1 - total_steps / self.max_train_steps) 211 | for p in self.optimizer_actor.param_groups: 212 | p['lr'] = lr_a_now 213 | for p in self.optimizer_critic.param_groups: 214 | p['lr'] = lr_c_now 215 | -------------------------------------------------------------------------------- /5.PPO-continuous/replaybuffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ReplayBuffer: 6 | def __init__(self, args): 7 | self.s = np.zeros((args.batch_size, args.state_dim)) 8 | self.a = np.zeros((args.batch_size, args.action_dim)) 9 | self.a_logprob = np.zeros((args.batch_size, args.action_dim)) 10 | self.r = np.zeros((args.batch_size, 1)) 11 | self.s_ = np.zeros((args.batch_size, args.state_dim)) 12 | self.dw = np.zeros((args.batch_size, 1)) 13 | self.done = np.zeros((args.batch_size, 1)) 14 | self.count = 0 15 | 16 | def store(self, s, a, a_logprob, r, s_, dw, done): 17 | self.s[self.count] = s 18 | self.a[self.count] = a 
19 | self.a_logprob[self.count] = a_logprob 20 | self.r[self.count] = r 21 | self.s_[self.count] = s_ 22 | self.dw[self.count] = dw 23 | self.done[self.count] = done 24 | self.count += 1 25 | 26 | def numpy_to_tensor(self): 27 | s = torch.tensor(self.s, dtype=torch.float) 28 | a = torch.tensor(self.a, dtype=torch.float) 29 | a_logprob = torch.tensor(self.a_logprob, dtype=torch.float) 30 | r = torch.tensor(self.r, dtype=torch.float) 31 | s_ = torch.tensor(self.s_, dtype=torch.float) 32 | dw = torch.tensor(self.dw, dtype=torch.float) 33 | done = torch.tensor(self.done, dtype=torch.float) 34 | 35 | return s, a, a_logprob, r, s_, dw, done 36 | -------------------------------------------------------------------------------- /5.PPO-continuous/runs/readme.txt: -------------------------------------------------------------------------------- 1 | This is a file used to save the tensorboard data. -------------------------------------------------------------------------------- /5.PPO-continuous/training_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/5.PPO-continuous/training_result.png -------------------------------------------------------------------------------- /6.DDPG/DDPG.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import copy 7 | from torch.utils.tensorboard import SummaryWriter 8 | 9 | 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim, action_dim, hidden_width, max_action): 12 | super(Actor, self).__init__() 13 | self.max_action = max_action 14 | self.l1 = nn.Linear(state_dim, hidden_width) 15 | self.l2 = nn.Linear(hidden_width, hidden_width) 16 | self.l3 = nn.Linear(hidden_width, action_dim) 17 | 18 | def forward(self, s): 19 | s = F.relu(self.l1(s)) 20 | s = F.relu(self.l2(s)) 21 | a = self.max_action * torch.tanh(self.l3(s)) # [-max,max] 22 | return a 23 | 24 | 25 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a) 26 | def __init__(self, state_dim, action_dim, hidden_width): 27 | super(Critic, self).__init__() 28 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width) 29 | self.l2 = nn.Linear(hidden_width, hidden_width) 30 | self.l3 = nn.Linear(hidden_width, 1) 31 | 32 | def forward(self, s, a): 33 | q = F.relu(self.l1(torch.cat([s, a], 1))) 34 | q = F.relu(self.l2(q)) 35 | q = self.l3(q) 36 | return q 37 | 38 | 39 | class ReplayBuffer(object): 40 | def __init__(self, state_dim, action_dim): 41 | self.max_size = int(1e6) 42 | self.count = 0 43 | self.size = 0 44 | self.s = np.zeros((self.max_size, state_dim)) 45 | self.a = np.zeros((self.max_size, action_dim)) 46 | self.r = np.zeros((self.max_size, 1)) 47 | self.s_ = np.zeros((self.max_size, state_dim)) 48 | self.dw = np.zeros((self.max_size, 1)) 49 | 50 | def store(self, s, a, r, s_, dw): 51 | self.s[self.count] = s 52 | self.a[self.count] = a 53 | self.r[self.count] = r 54 | self.s_[self.count] = s_ 55 | self.dw[self.count] = dw 56 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0. 
57 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions 58 | 59 | def sample(self, batch_size): 60 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling 61 | batch_s = torch.tensor(self.s[index], dtype=torch.float) 62 | batch_a = torch.tensor(self.a[index], dtype=torch.float) 63 | batch_r = torch.tensor(self.r[index], dtype=torch.float) 64 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float) 65 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float) 66 | 67 | return batch_s, batch_a, batch_r, batch_s_, batch_dw 68 | 69 | 70 | class DDPG(object): 71 | def __init__(self, state_dim, action_dim, max_action): 72 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network 73 | self.batch_size = 256 # batch size 74 | self.GAMMA = 0.99 # discount factor 75 | self.TAU = 0.005 # Softly update the target network 76 | self.lr = 3e-4 # learning rate 77 | 78 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action) 79 | self.actor_target = copy.deepcopy(self.actor) 80 | self.critic = Critic(state_dim, action_dim, self.hidden_width) 81 | self.critic_target = copy.deepcopy(self.critic) 82 | 83 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr) 84 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 85 | 86 | self.MseLoss = nn.MSELoss() 87 | 88 | def choose_action(self, s): 89 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 90 | a = self.actor(s).data.numpy().flatten() 91 | return a 92 | 93 | def learn(self, relay_buffer): 94 | batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size) # Sample a batch 95 | 96 | # Compute the target Q 97 | with torch.no_grad(): # target_Q has no gradient 98 | Q_ = self.critic_target(batch_s_, self.actor_target(batch_s_)) 99 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * Q_ 100 | 101 | # Compute the current Q and the critic loss 102 | current_Q = self.critic(batch_s, batch_a) 103 | critic_loss = self.MseLoss(target_Q, current_Q) 104 | # Optimize the critic 105 | self.critic_optimizer.zero_grad() 106 | critic_loss.backward() 107 | self.critic_optimizer.step() 108 | 109 | # Freeze critic networks so you don't waste computational effort 110 | for params in self.critic.parameters(): 111 | params.requires_grad = False 112 | 113 | # Compute the actor loss 114 | actor_loss = -self.critic(batch_s, self.actor(batch_s)).mean() 115 | # Optimize the actor 116 | self.actor_optimizer.zero_grad() 117 | actor_loss.backward() 118 | self.actor_optimizer.step() 119 | 120 | # Unfreeze critic networks 121 | for params in self.critic.parameters(): 122 | params.requires_grad = True 123 | 124 | # Softly update the target networks 125 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 126 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 127 | 128 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 129 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 130 | 131 | 132 | def evaluate_policy(env, agent): 133 | times = 3 # Perform three evaluations and calculate the average 134 | evaluate_reward = 0 135 | for _ in range(times): 136 | s = env.reset() 137 | done = False 138 | episode_reward = 0 139 | while not done: 140 | a = agent.choose_action(s) # We do not add noise when evaluating 141 | s_, r, done, _ = env.step(a) 142 | episode_reward += r 
143 | s = s_ 144 | evaluate_reward += episode_reward 145 | 146 | return int(evaluate_reward / times) 147 | 148 | 149 | def reward_adapter(r, env_index): 150 | if env_index == 0: # Pendulum-v1 151 | r = (r + 8) / 8 152 | elif env_index == 1: # BipedalWalker-v3 153 | if r <= -100: 154 | r = -1 155 | return r 156 | 157 | 158 | if __name__ == '__main__': 159 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 160 | env_index = 0 161 | env = gym.make(env_name[env_index]) 162 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 163 | number = 1 164 | # Set random seed 165 | seed = 0 166 | env.seed(seed) 167 | env.action_space.seed(seed) 168 | env_evaluate.seed(seed) 169 | env_evaluate.action_space.seed(seed) 170 | np.random.seed(seed) 171 | torch.manual_seed(seed) 172 | 173 | state_dim = env.observation_space.shape[0] 174 | action_dim = env.action_space.shape[0] 175 | max_action = float(env.action_space.high[0]) 176 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 177 | print("env={}".format(env_name[env_index])) 178 | print("state_dim={}".format(state_dim)) 179 | print("action_dim={}".format(action_dim)) 180 | print("max_action={}".format(max_action)) 181 | print("max_episode_steps={}".format(max_episode_steps)) 182 | 183 | agent = DDPG(state_dim, action_dim, max_action) 184 | replay_buffer = ReplayBuffer(state_dim, action_dim) 185 | # Build a tensorboard 186 | writer = SummaryWriter(log_dir='runs/DDPG/DDPG_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) 187 | 188 | noise_std = 0.1 * max_action # the std of Gaussian noise for exploration 189 | max_train_steps = 3e6 # Maximum number of training steps 190 | random_steps = 25e3 # Take the random actions in the beginning for the better exploration 191 | update_freq = 50 # Take 50 steps,then update the networks 50 times 192 | evaluate_freq = 1e3 # Evaluate the policy every 'evaluate_freq' steps 193 | evaluate_num = 0 # Record the number of evaluations 194 | evaluate_rewards = [] # Record the rewards during the evaluating 195 | total_steps = 0 # Record the total steps during the training 196 | 197 | while total_steps < max_train_steps: 198 | s = env.reset() 199 | episode_steps = 0 200 | done = False 201 | while not done: 202 | episode_steps += 1 203 | if total_steps < random_steps: # Take the random actions in the beginning for the better exploration 204 | a = env.action_space.sample() 205 | else: 206 | # Add Gaussian noise to actions for exploration 207 | a = agent.choose_action(s) 208 | a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action) 209 | s_, r, done, _ = env.step(a) 210 | r = reward_adapter(r, env_index) # Adjust rewards for better performance 211 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 212 | # dw means dead or win,there is no next state s'; 213 | # but when reaching the max_episode_steps,there is a next state s' actually. 
214 | if done and episode_steps != max_episode_steps: 215 | dw = True 216 | else: 217 | dw = False 218 | replay_buffer.store(s, a, r, s_, dw) # Store the transition 219 | s = s_ 220 | 221 | # Take 50 steps,then update the networks 50 times 222 | if total_steps >= random_steps and total_steps % update_freq == 0: 223 | for _ in range(update_freq): 224 | agent.learn(replay_buffer) 225 | 226 | # Evaluate the policy every 'evaluate_freq' steps 227 | if (total_steps + 1) % evaluate_freq == 0: 228 | evaluate_num += 1 229 | evaluate_reward = evaluate_policy(env_evaluate, agent) 230 | evaluate_rewards.append(evaluate_reward) 231 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward)) 232 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 233 | # Save the rewards 234 | if evaluate_num % 10 == 0: 235 | np.save('./data_train/DDPG_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 236 | 237 | total_steps += 1 238 | -------------------------------------------------------------------------------- /7.TD3/README.md: -------------------------------------------------------------------------------- 1 | # TD3 2 | This is a concise Pytorch implementation of TD3(Twin Delayed DDPG) on continuous action space.
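Like DDPG, TD3 keeps slowly moving target copies of the actor and critic that are updated by Polyak averaging after each learning step (TAU = 0.005 in 'TD3.py'). A minimal sketch of that update, assuming two 'nn.Module' instances with matching parameters, is:

```python
def soft_update(net, target_net, tau=0.005):
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
```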
3 | 4 | 5 | ## How to use my code? 6 | You can directly run 'TD3.py' in your own IDE.<br />
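During training, after the first 'random_steps' of uniformly random actions, 'TD3.py' adds Gaussian noise to the actor's output for exploration (the noise std is 0.1 * max_action). A minimal sketch, with a function name chosen here for illustration, is:

```python
import numpy as np

def add_exploration_noise(a, max_action, noise_std):
    # Add Gaussian exploration noise and clip back to the valid action range
    return np.clip(a + np.random.normal(0, noise_std, size=a.shape), -max_action, max_action)
```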
7 | 8 | ### Training environments 9 | You can set the 'env_index' in the code to change the environments. Here, we train our code in 5 environments.<br />
10 | env_index=0 represents 'Pendulum-v1'<br />
11 | env_index=1 represents 'BipedalWalker-v3'<br />
12 | env_index=2 represents 'HalfCheetah-v2'<br />
13 | env_index=3 represents 'Hopper-v2'<br />
14 | env_index=4 represents 'Walker2d-v2'<br />
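For Pendulum-v1 and BipedalWalker-v3 the script also reshapes the raw reward before storing each transition (the 'reward_adapter' function in 'TD3.py' and 'DDPG.py'); the logic, with explanatory comments added here, is:

```python
def reward_adapter(r, env_index):
    if env_index == 0:        # Pendulum-v1: shift/scale rewards from roughly [-16, 0] to roughly [-1, 1]
        r = (r + 8) / 8
    elif env_index == 1:      # BipedalWalker-v3: soften the -100 penalty for falling
        if r <= -100:
            r = -1
    return r
```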
15 | 16 | ### How to see the training results? 17 | You can use TensorBoard to visualize the training curves, which are saved in the 'runs' folder.<br />
18 | The reward data are saved as numpy arrays in the 'data_train' folder.<br />
19 | The training curves shown below are smoothed by averaging over a window of 10 steps.<br />
20 | The solid line and the shaded region represent the mean and standard deviation, respectively, over three different random seeds (seed=0, 10, 100).<br />
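If you want to post-process the results yourself, the saved '.npy' files can be loaded and smoothed with the same 10-step moving average; a short example (the file name follows the repository's naming pattern) is:

```python
import numpy as np

rewards = np.load('./data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy')
window = 10
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')  # 10-step moving average
```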
21 | 22 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/7.TD3/TD3_result.png) 23 | 24 | ## Reference 25 | [1] Fujimoto S, van Hoof H, Meger D. Addressing Function Approximation Error in Actor-Critic Methods. International Conference on Machine Learning, PMLR, 2018: 1587-1596.<br />
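The three tricks from [1] appear directly in 'TD3.learn()' below: target policy smoothing, clipped double Q-learning, and delayed policy updates. A minimal sketch of how the first two combine into the critic target (not the file itself; the target networks and tensor shapes are assumed to match those in 'TD3.py') is:

```python
import torch

def td3_critic_target(critic_target, actor_target, s_, r, dw, gamma=0.99,
                      policy_noise=0.2, noise_clip=0.5, max_action=1.0):
    with torch.no_grad():
        # Trick 1: target policy smoothing, perturb the target action with clipped Gaussian noise
        a_ = actor_target(s_)
        noise = (torch.randn_like(a_) * policy_noise).clamp(-noise_clip, noise_clip)
        a_ = (a_ + noise).clamp(-max_action, max_action)
        # Trick 2: clipped double Q-learning, bootstrap from the smaller of the two target critics
        q1, q2 = critic_target(s_, a_)
        return r + gamma * (1 - dw) * torch.min(q1, q2)
```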
26 | -------------------------------------------------------------------------------- /7.TD3/TD3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import copy 7 | from torch.utils.tensorboard import SummaryWriter 8 | 9 | 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim, action_dim, hidden_width, max_action): 12 | super(Actor, self).__init__() 13 | self.max_action = max_action 14 | self.l1 = nn.Linear(state_dim, hidden_width) 15 | self.l2 = nn.Linear(hidden_width, hidden_width) 16 | self.l3 = nn.Linear(hidden_width, action_dim) 17 | 18 | def forward(self, s): 19 | s = F.relu(self.l1(s)) 20 | s = F.relu(self.l2(s)) 21 | a = self.max_action * torch.tanh(self.l3(s)) # [-max,max] 22 | return a 23 | 24 | 25 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a) 26 | def __init__(self, state_dim, action_dim, hidden_width): 27 | super(Critic, self).__init__() 28 | # Q1 29 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width) 30 | self.l2 = nn.Linear(hidden_width, hidden_width) 31 | self.l3 = nn.Linear(hidden_width, 1) 32 | # Q2 33 | self.l4 = nn.Linear(state_dim + action_dim, hidden_width) 34 | self.l5 = nn.Linear(hidden_width, hidden_width) 35 | self.l6 = nn.Linear(hidden_width, 1) 36 | 37 | def forward(self, s, a): 38 | s_a = torch.cat([s, a], 1) 39 | q1 = F.relu(self.l1(s_a)) 40 | q1 = F.relu(self.l2(q1)) 41 | q1 = self.l3(q1) 42 | 43 | q2 = F.relu(self.l4(s_a)) 44 | q2 = F.relu(self.l5(q2)) 45 | q2 = self.l6(q2) 46 | 47 | return q1, q2 48 | 49 | def Q1(self, s, a): 50 | s_a = torch.cat([s, a], 1) 51 | q1 = F.relu(self.l1(s_a)) 52 | q1 = F.relu(self.l2(q1)) 53 | q1 = self.l3(q1) 54 | 55 | return q1 56 | 57 | 58 | class ReplayBuffer(object): 59 | def __init__(self, state_dim, action_dim): 60 | self.max_size = int(1e6) 61 | self.count = 0 62 | self.size = 0 63 | self.s = np.zeros((self.max_size, state_dim)) 64 | self.a = np.zeros((self.max_size, action_dim)) 65 | self.r = np.zeros((self.max_size, 1)) 66 | self.s_ = np.zeros((self.max_size, state_dim)) 67 | self.dw = np.zeros((self.max_size, 1)) 68 | 69 | def store(self, s, a, r, s_, dw): 70 | self.s[self.count] = s 71 | self.a[self.count] = a 72 | self.r[self.count] = r 73 | self.s_[self.count] = s_ 74 | self.dw[self.count] = dw 75 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0. 
76 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions 77 | 78 | def sample(self, batch_size): 79 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling 80 | batch_s = torch.tensor(self.s[index], dtype=torch.float) 81 | batch_a = torch.tensor(self.a[index], dtype=torch.float) 82 | batch_r = torch.tensor(self.r[index], dtype=torch.float) 83 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float) 84 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float) 85 | 86 | return batch_s, batch_a, batch_r, batch_s_, batch_dw 87 | 88 | 89 | class TD3(object): 90 | def __init__(self, state_dim, action_dim, max_action): 91 | self.max_action = max_action 92 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network 93 | self.batch_size = 256 # batch size 94 | self.GAMMA = 0.99 # discount factor 95 | self.TAU = 0.005 # Softly update the target network 96 | self.lr = 3e-4 # learning rate 97 | self.policy_noise = 0.2 * max_action # The noise for the trick 'target policy smoothing' 98 | self.noise_clip = 0.5 * max_action # Clip the noise 99 | self.policy_freq = 2 # The frequency of policy updates 100 | self.actor_pointer = 0 101 | 102 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action) 103 | self.actor_target = copy.deepcopy(self.actor) 104 | self.critic = Critic(state_dim, action_dim, self.hidden_width) 105 | self.critic_target = copy.deepcopy(self.critic) 106 | 107 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr) 108 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 109 | 110 | def choose_action(self, s): 111 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 112 | a = self.actor(s).data.numpy().flatten() 113 | return a 114 | 115 | def learn(self, relay_buffer): 116 | self.actor_pointer += 1 117 | batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size) # Sample a batch 118 | 119 | # Compute the target Q 120 | with torch.no_grad(): # target_Q has no gradient 121 | # Trick 1:target policy smoothing 122 | # torch.randn_like can generate random numbers sampled from N(0,1),which have the same size as 'batch_a' 123 | noise = (torch.randn_like(batch_a) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) 124 | next_action = (self.actor_target(batch_s_) + noise).clamp(-self.max_action, self.max_action) 125 | 126 | # Trick 2:clipped double Q-learning 127 | target_Q1, target_Q2 = self.critic_target(batch_s_, next_action) 128 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * torch.min(target_Q1, target_Q2) 129 | 130 | # Get the current Q 131 | current_Q1, current_Q2 = self.critic(batch_s, batch_a) 132 | # Compute the critic loss 133 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 134 | # Optimize the critic 135 | self.critic_optimizer.zero_grad() 136 | critic_loss.backward() 137 | self.critic_optimizer.step() 138 | 139 | # Trick 3:delayed policy updates 140 | if self.actor_pointer % self.policy_freq == 0: 141 | # Freeze critic networks so you don't waste computational effort 142 | for params in self.critic.parameters(): 143 | params.requires_grad = False 144 | 145 | # Compute actor loss 146 | actor_loss = -self.critic.Q1(batch_s, self.actor(batch_s)).mean() # Only use Q1 147 | # Optimize the actor 148 | self.actor_optimizer.zero_grad() 149 | actor_loss.backward() 150 | self.actor_optimizer.step() 151 | 152 | # Unfreeze critic networks 153 | for params in 
self.critic.parameters(): 154 | params.requires_grad = True 155 | 156 | # Softly update the target networks 157 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 158 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 159 | 160 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 161 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 162 | 163 | 164 | def evaluate_policy(env, agent): 165 | times = 3 # Perform three evaluations and calculate the average 166 | evaluate_reward = 0 167 | for _ in range(times): 168 | s = env.reset() 169 | done = False 170 | episode_reward = 0 171 | while not done: 172 | a = agent.choose_action(s) # We do not add noise when evaluating 173 | s_, r, done, _ = env.step(a) 174 | episode_reward += r 175 | s = s_ 176 | evaluate_reward += episode_reward 177 | 178 | return int(evaluate_reward / times) 179 | 180 | 181 | def reward_adapter(r, env_index): 182 | if env_index == 0: # Pendulum-v1 183 | r = (r + 8) / 8 184 | elif env_index == 1: # BipedalWalker-v3 185 | if r <= -100: 186 | r = -1 187 | return r 188 | 189 | 190 | if __name__ == '__main__': 191 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 192 | env_index = 0 193 | env = gym.make(env_name[env_index]) 194 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 195 | number = 1 196 | # Set random seed 197 | seed = 0 198 | env.seed(seed) 199 | env.action_space.seed(seed) 200 | env_evaluate.seed(seed) 201 | env_evaluate.action_space.seed(seed) 202 | np.random.seed(seed) 203 | torch.manual_seed(seed) 204 | 205 | state_dim = env.observation_space.shape[0] 206 | action_dim = env.action_space.shape[0] 207 | max_action = float(env.action_space.high[0]) 208 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 209 | print("env={}".format(env_name[env_index])) 210 | print("state_dim={}".format(state_dim)) 211 | print("action_dim={}".format(action_dim)) 212 | print("max_action={}".format(max_action)) 213 | print("max_episode_steps={}".format(max_episode_steps)) 214 | 215 | agent = TD3(state_dim, action_dim, max_action) 216 | replay_buffer = ReplayBuffer(state_dim, action_dim) 217 | # Build a tensorboard 218 | writer = SummaryWriter(log_dir='runs/TD3/TD3_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) 219 | 220 | noise_std = 0.1 * max_action # the std of Gaussian noise for exploration 221 | max_train_steps = 3e6 # Maximum number of training steps 222 | random_steps = 25e3 # Take the random actions in the beginning for the better exploration 223 | evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps 224 | evaluate_num = 0 # Record the number of evaluations 225 | evaluate_rewards = [] # Record the rewards during the evaluating 226 | total_steps = 0 # Record the total steps during the training 227 | 228 | while total_steps < max_train_steps: 229 | s = env.reset() 230 | episode_steps = 0 231 | done = False 232 | while not done: 233 | episode_steps += 1 234 | if total_steps < random_steps: # Take random actions in the beginning for the better exploration 235 | a = env.action_space.sample() 236 | else: 237 | # Add Gaussian noise to action for exploration 238 | a = agent.choose_action(s) 239 | a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action) 240 | s_, r, done, _ = env.step(a) 
241 | r = reward_adapter(r, env_index) # Adjust rewards for better performance 242 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 243 | # dw means dead or win,there is no next state s'; 244 | # but when reaching the max_episode_steps,there is a next state s' actually. 245 | if done and episode_steps != max_episode_steps: 246 | dw = True 247 | else: 248 | dw = False 249 | replay_buffer.store(s, a, r, s_, dw) # Store the transition 250 | s = s_ 251 | 252 | # Update one step 253 | if total_steps >= random_steps: 254 | agent.learn(replay_buffer) 255 | 256 | # Evaluate the policy every 'evaluate_freq' steps 257 | if (total_steps + 1) % evaluate_freq == 0: 258 | evaluate_num += 1 259 | evaluate_reward = evaluate_policy(env_evaluate, agent) 260 | evaluate_rewards.append(evaluate_reward) 261 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward)) 262 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 263 | # Save the rewards 264 | if evaluate_num % 10 == 0: 265 | np.save('./data_train/TD3_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 266 | 267 | total_steps += 1 268 | -------------------------------------------------------------------------------- /7.TD3/TD3_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/TD3_result.png -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_10.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_BipedalWalker-v3_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_10.npy 
-------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_HalfCheetah-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Hopper-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_10.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Pendulum-v1_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_0.npy -------------------------------------------------------------------------------- /7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_10.npy -------------------------------------------------------------------------------- 
/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/data_train/TD3_env_Walker2d-v2_number_1_seed_100.npy -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_0/events.out.tfevents.1648952137.李智.93956.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_0/events.out.tfevents.1648952137.李智.93956.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_10/events.out.tfevents.1648882414.李智.81744.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_10/events.out.tfevents.1648882414.李智.81744.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_100/events.out.tfevents.1648925401.李智.81744.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_BipedalWalker-v3_number_1_seed_100/events.out.tfevents.1648925401.李智.81744.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_0/events.out.tfevents.1648909506.李智.60360.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_0/events.out.tfevents.1648909506.李智.60360.2 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_10/events.out.tfevents.1648800524.李智.60360.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_10/events.out.tfevents.1648800524.李智.60360.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_100/events.out.tfevents.1648852975.李智.60360.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_HalfCheetah-v2_number_1_seed_100/events.out.tfevents.1648852975.李智.60360.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_0/events.out.tfevents.1649010066.李智.85868.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_0/events.out.tfevents.1649010066.李智.85868.2 
-------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_10/events.out.tfevents.1648901654.李智.85868.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_10/events.out.tfevents.1648901654.李智.85868.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_100/events.out.tfevents.1648956951.李智.85868.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Hopper-v2_number_1_seed_100/events.out.tfevents.1648956951.李智.85868.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_0/events.out.tfevents.1649065960.李智.18392.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_0/events.out.tfevents.1649065960.李智.18392.2 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_10/events.out.tfevents.1649057339.李智.18392.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_10/events.out.tfevents.1649057339.李智.18392.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_100/events.out.tfevents.1649061632.李智.18392.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Pendulum-v1_number_1_seed_100/events.out.tfevents.1649061632.李智.18392.1 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_0/events.out.tfevents.1648846023.李智.76672.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_0/events.out.tfevents.1648846023.李智.76672.2 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_10/events.out.tfevents.1648735005.李智.76672.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_10/events.out.tfevents.1648735005.李智.76672.0 -------------------------------------------------------------------------------- /7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_100/events.out.tfevents.1648793243.李智.76672.1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/7.TD3/runs/TD3/TD3_env_Walker2d-v2_number_1_seed_100/events.out.tfevents.1648793243.李智.76672.1 -------------------------------------------------------------------------------- /8.SAC/SAC-continuous.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import copy 7 | from torch.utils.tensorboard import SummaryWriter 8 | from torch.distributions import Normal 9 | 10 | 11 | class Actor(nn.Module): 12 | def __init__(self, state_dim, action_dim, hidden_width, max_action): 13 | super(Actor, self).__init__() 14 | self.max_action = max_action 15 | self.l1 = nn.Linear(state_dim, hidden_width) 16 | self.l2 = nn.Linear(hidden_width, hidden_width) 17 | self.mean_layer = nn.Linear(hidden_width, action_dim) 18 | self.log_std_layer = nn.Linear(hidden_width, action_dim) 19 | 20 | def forward(self, x, deterministic=False, with_logprob=True): 21 | x = F.relu(self.l1(x)) 22 | x = F.relu(self.l2(x)) 23 | mean = self.mean_layer(x) 24 | log_std = self.log_std_layer(x) # We output the log_std to ensure that std=exp(log_std)>0 25 | log_std = torch.clamp(log_std, -20, 2) 26 | std = torch.exp(log_std) 27 | 28 | dist = Normal(mean, std) # Generate a Gaussian distribution 29 | if deterministic: # When evaluating,we use the deterministic policy 30 | a = mean 31 | else: 32 | a = dist.rsample() # reparameterization trick: mean+std*N(0,1) 33 | 34 | if with_logprob: # The method refers to Open AI Spinning up, which is more stable. 35 | log_pi = dist.log_prob(a).sum(dim=1, keepdim=True) 36 | log_pi -= (2 * (np.log(2) - a - F.softplus(-2 * a))).sum(dim=1, keepdim=True) 37 | else: 38 | log_pi = None 39 | 40 | a = self.max_action * torch.tanh(a) # Use tanh to compress the unbounded Gaussian distribution into a bounded action interval. 41 | 42 | return a, log_pi 43 | 44 | 45 | class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a) 46 | def __init__(self, state_dim, action_dim, hidden_width): 47 | super(Critic, self).__init__() 48 | # Q1 49 | self.l1 = nn.Linear(state_dim + action_dim, hidden_width) 50 | self.l2 = nn.Linear(hidden_width, hidden_width) 51 | self.l3 = nn.Linear(hidden_width, 1) 52 | # Q2 53 | self.l4 = nn.Linear(state_dim + action_dim, hidden_width) 54 | self.l5 = nn.Linear(hidden_width, hidden_width) 55 | self.l6 = nn.Linear(hidden_width, 1) 56 | 57 | def forward(self, s, a): 58 | s_a = torch.cat([s, a], 1) 59 | q1 = F.relu(self.l1(s_a)) 60 | q1 = F.relu(self.l2(q1)) 61 | q1 = self.l3(q1) 62 | 63 | q2 = F.relu(self.l4(s_a)) 64 | q2 = F.relu(self.l5(q2)) 65 | q2 = self.l6(q2) 66 | 67 | return q1, q2 68 | 69 | 70 | class ReplayBuffer(object): 71 | def __init__(self, state_dim, action_dim): 72 | self.max_size = int(1e6) 73 | self.count = 0 74 | self.size = 0 75 | self.s = np.zeros((self.max_size, state_dim)) 76 | self.a = np.zeros((self.max_size, action_dim)) 77 | self.r = np.zeros((self.max_size, 1)) 78 | self.s_ = np.zeros((self.max_size, state_dim)) 79 | self.dw = np.zeros((self.max_size, 1)) 80 | 81 | def store(self, s, a, r, s_, dw): 82 | self.s[self.count] = s 83 | self.a[self.count] = a 84 | self.r[self.count] = r 85 | self.s_[self.count] = s_ 86 | self.dw[self.count] = dw 87 | self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0. 
88 | self.size = min(self.size + 1, self.max_size) # Record the number of transitions 89 | 90 | def sample(self, batch_size): 91 | index = np.random.choice(self.size, size=batch_size) # Randomly sampling 92 | batch_s = torch.tensor(self.s[index], dtype=torch.float) 93 | batch_a = torch.tensor(self.a[index], dtype=torch.float) 94 | batch_r = torch.tensor(self.r[index], dtype=torch.float) 95 | batch_s_ = torch.tensor(self.s_[index], dtype=torch.float) 96 | batch_dw = torch.tensor(self.dw[index], dtype=torch.float) 97 | 98 | return batch_s, batch_a, batch_r, batch_s_, batch_dw 99 | 100 | 101 | class SAC(object): 102 | def __init__(self, state_dim, action_dim, max_action): 103 | self.max_action = max_action 104 | self.hidden_width = 256 # The number of neurons in hidden layers of the neural network 105 | self.batch_size = 256 # batch size 106 | self.GAMMA = 0.99 # discount factor 107 | self.TAU = 0.005 # Softly update the target network 108 | self.lr = 3e-4 # learning rate 109 | self.adaptive_alpha = True # Whether to automatically learn the temperature alpha 110 | if self.adaptive_alpha: 111 | # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper 112 | self.target_entropy = -action_dim 113 | # We learn log_alpha instead of alpha to ensure that alpha=exp(log_alpha)>0 114 | self.log_alpha = torch.zeros(1, requires_grad=True) 115 | self.alpha = self.log_alpha.exp() 116 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.lr) 117 | else: 118 | self.alpha = 0.2 119 | 120 | self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action) 121 | self.critic = Critic(state_dim, action_dim, self.hidden_width) 122 | self.critic_target = copy.deepcopy(self.critic) 123 | 124 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr) 125 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr) 126 | 127 | def choose_action(self, s, deterministic=False): 128 | s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0) 129 | a, _ = self.actor(s, deterministic, False) # When choosing actions, we do not need to compute log_pi 130 | return a.data.numpy().flatten() 131 | 132 | def learn(self, relay_buffer): 133 | batch_s, batch_a, batch_r, batch_s_, batch_dw = relay_buffer.sample(self.batch_size) # Sample a batch 134 | 135 | with torch.no_grad(): 136 | batch_a_, log_pi_ = self.actor(batch_s_) # a' from the current policy 137 | # Compute target Q 138 | target_Q1, target_Q2 = self.critic_target(batch_s_, batch_a_) 139 | target_Q = batch_r + self.GAMMA * (1 - batch_dw) * (torch.min(target_Q1, target_Q2) - self.alpha * log_pi_) 140 | 141 | # Compute current Q 142 | current_Q1, current_Q2 = self.critic(batch_s, batch_a) 143 | # Compute critic loss 144 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 145 | # Optimize the critic 146 | self.critic_optimizer.zero_grad() 147 | critic_loss.backward() 148 | self.critic_optimizer.step() 149 | 150 | # Freeze critic networks so you don't waste computational effort 151 | for params in self.critic.parameters(): 152 | params.requires_grad = False 153 | 154 | # Compute actor loss 155 | a, log_pi = self.actor(batch_s) 156 | Q1, Q2 = self.critic(batch_s, a) 157 | Q = torch.min(Q1, Q2) 158 | actor_loss = (self.alpha * log_pi - Q).mean() 159 | 160 | # Optimize the actor 161 | self.actor_optimizer.zero_grad() 162 | actor_loss.backward() 163 | self.actor_optimizer.step() 164 | 165 | # Unfreeze critic networks 166 | for params in self.critic.parameters(): 167 | 
params.requires_grad = True 168 | 169 | # Update alpha 170 | if self.adaptive_alpha: 171 | # We learn log_alpha instead of alpha to ensure that alpha=exp(log_alpha)>0 172 | alpha_loss = -(self.log_alpha.exp() * (log_pi + self.target_entropy).detach()).mean() 173 | self.alpha_optimizer.zero_grad() 174 | alpha_loss.backward() 175 | self.alpha_optimizer.step() 176 | self.alpha = self.log_alpha.exp() 177 | 178 | # Softly update target networks 179 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 180 | target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data) 181 | 182 | 183 | def evaluate_policy(env, agent): 184 | times = 3 # Perform three evaluations and calculate the average 185 | evaluate_reward = 0 186 | for _ in range(times): 187 | s = env.reset() 188 | done = False 189 | episode_reward = 0 190 | while not done: 191 | a = agent.choose_action(s, deterministic=True) # We use the deterministic policy during the evaluating 192 | s_, r, done, _ = env.step(a) 193 | episode_reward += r 194 | s = s_ 195 | evaluate_reward += episode_reward 196 | 197 | return int(evaluate_reward / times) 198 | 199 | 200 | def reward_adapter(r, env_index): 201 | if env_index == 0: # Pendulum-v1 202 | r = (r + 8) / 8 203 | elif env_index == 1: # BipedalWalker-v3 204 | if r <= -100: 205 | r = -1 206 | return r 207 | 208 | 209 | if __name__ == '__main__': 210 | env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2'] 211 | env_index = 0 212 | env = gym.make(env_name[env_index]) 213 | env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment 214 | number = 1 215 | seed = 0 216 | # Set random seed 217 | env.seed(seed) 218 | env.action_space.seed(seed) 219 | env_evaluate.seed(seed) 220 | env_evaluate.action_space.seed(seed) 221 | np.random.seed(seed) 222 | torch.manual_seed(seed) 223 | 224 | state_dim = env.observation_space.shape[0] 225 | action_dim = env.action_space.shape[0] 226 | max_action = float(env.action_space.high[0]) 227 | max_episode_steps = env._max_episode_steps # Maximum number of steps per episode 228 | print("env={}".format(env_name[env_index])) 229 | print("state_dim={}".format(state_dim)) 230 | print("action_dim={}".format(action_dim)) 231 | print("max_action={}".format(max_action)) 232 | print("max_episode_steps={}".format(max_episode_steps)) 233 | 234 | agent = SAC(state_dim, action_dim, max_action) 235 | replay_buffer = ReplayBuffer(state_dim, action_dim) 236 | # Build a tensorboard 237 | writer = SummaryWriter(log_dir='runs/SAC/SAC_env_{}_number_{}_seed_{}'.format(env_name[env_index], number, seed)) 238 | 239 | max_train_steps = 3e6 # Maximum number of training steps 240 | random_steps = 25e3 # Take the random actions in the beginning for the better exploration 241 | evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps 242 | evaluate_num = 0 # Record the number of evaluations 243 | evaluate_rewards = [] # Record the rewards during the evaluating 244 | total_steps = 0 # Record the total steps during the training 245 | 246 | while total_steps < max_train_steps: 247 | s = env.reset() 248 | episode_steps = 0 249 | done = False 250 | while not done: 251 | episode_steps += 1 252 | if total_steps < random_steps: # Take the random actions in the beginning for the better exploration 253 | a = env.action_space.sample() 254 | else: 255 | a = agent.choose_action(s) 256 | s_, r, done, _ = env.step(a) 257 | r = reward_adapter(r, env_index) # 
Adjust rewards for better performance 258 | # When dead or win or reaching the max_episode_steps, done will be Ture, we need to distinguish them; 259 | # dw means dead or win,there is no next state s'; 260 | # but when reaching the max_episode_steps,there is a next state s' actually. 261 | if done and episode_steps != max_episode_steps: 262 | dw = True 263 | else: 264 | dw = False 265 | replay_buffer.store(s, a, r, s_, dw) # Store the transition 266 | s = s_ 267 | 268 | if total_steps >= random_steps: 269 | agent.learn(replay_buffer) 270 | 271 | # Evaluate the policy every 'evaluate_freq' steps 272 | if (total_steps + 1) % evaluate_freq == 0: 273 | evaluate_num += 1 274 | evaluate_reward = evaluate_policy(env_evaluate, agent) 275 | evaluate_rewards.append(evaluate_reward) 276 | print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward)) 277 | writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps) 278 | # Save the rewards 279 | if evaluate_num % 10 == 0: 280 | np.save('./data_train/SAC_env_{}_number_{}_seed_{}.npy'.format(env_name[env_index], number, seed), np.array(evaluate_rewards)) 281 | 282 | total_steps += 1 283 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/PPO+RNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/PPO+RNN.png -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/PPO_discrete_rnn_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.tensorboard import SummaryWriter 4 | import gym 5 | import argparse 6 | from normalization import Normalization, RewardScaling 7 | from replaybuffer import ReplayBuffer 8 | from ppo_discrete_rnn import PPO_discrete_RNN 9 | 10 | 11 | class Runner: 12 | def __init__(self, args, env_name, number, seed): 13 | self.args = args 14 | self.env_name = env_name 15 | self.number = number 16 | self.seed = seed 17 | 18 | # Create env 19 | self.env = gym.make(env_name) 20 | # Set random seed 21 | np.random.seed(self.seed) 22 | torch.manual_seed(self.seed) 23 | self.env.seed(seed) 24 | self.env.action_space.seed(seed) 25 | 26 | self.args.state_dim = self.env.observation_space.shape[0] 27 | self.args.action_dim = self.env.action_space.n 28 | self.args.episode_limit = self.env._max_episode_steps # Maximum number of steps per episode 29 | print("env={}".format(env_name)) 30 | print("state_dim={}".format(args.state_dim)) 31 | print("action_dim={}".format(args.action_dim)) 32 | print("episode_limit={}".format(args.episode_limit)) 33 | 34 | self.replay_buffer = ReplayBuffer(args) 35 | self.agent = PPO_discrete_RNN(args) 36 | 37 | # Create a tensorboard 38 | self.writer = SummaryWriter(log_dir='runs/PPO_discrete/env_{}_number_{}_seed_{}'.format(env_name, number, seed)) 39 | 40 | self.evaluate_rewards = [] # Record the rewards during the evaluating 41 | self.total_steps = 0 42 | 43 | if self.args.use_state_norm: 44 | print("------use state normalization------") 45 | self.state_norm = Normalization(shape=args.state_dim) # Trick 2:state normalization 46 | if self.args.use_reward_scaling: 47 | print("------use reward scaling------") 48 | self.reward_scaling = RewardScaling(shape=1, gamma=self.args.gamma) 49 | 50 | def run(self, ): 51 | 
evaluate_num = -1 # Record the number of evaluations 52 | while self.total_steps < self.args.max_train_steps: 53 | if self.total_steps // self.args.evaluate_freq > evaluate_num: 54 | self.evaluate_policy() # Evaluate the policy every 'evaluate_freq' steps 55 | evaluate_num += 1 56 | 57 | _, episode_steps = self.run_episode() # Run an episode 58 | self.total_steps += episode_steps 59 | 60 | if self.replay_buffer.episode_num == self.args.batch_size: 61 | self.agent.train(self.replay_buffer, self.total_steps) # Training 62 | self.replay_buffer.reset_buffer() 63 | 64 | self.evaluate_policy() 65 | self.env.close() 66 | 67 | def run_episode(self, ): 68 | episode_reward = 0 69 | s = self.env.reset() 70 | if self.args.use_reward_scaling: 71 | self.reward_scaling.reset() 72 | self.agent.reset_rnn_hidden() 73 | for episode_step in range(self.args.episode_limit): 74 | if self.args.use_state_norm: 75 | s = self.state_norm(s) 76 | a, a_logprob = self.agent.choose_action(s, evaluate=False) 77 | v = self.agent.get_value(s) 78 | s_, r, done, _ = self.env.step(a) 79 | episode_reward += r 80 | 81 | if done and episode_step + 1 != self.args.episode_limit: 82 | dw = True 83 | else: 84 | dw = False 85 | if self.args.use_reward_scaling: 86 | r = self.reward_scaling(r) 87 | # Store the transition 88 | self.replay_buffer.store_transition(episode_step, s, v, a, a_logprob, r, dw) 89 | s = s_ 90 | if done: 91 | break 92 | 93 | # An episode is over, store v in the last step 94 | if self.args.use_state_norm: 95 | s = self.state_norm(s) 96 | v = self.agent.get_value(s) 97 | self.replay_buffer.store_last_value(episode_step + 1, v) 98 | 99 | return episode_reward, episode_step + 1 100 | 101 | def evaluate_policy(self, ): 102 | evaluate_reward = 0 103 | for _ in range(self.args.evaluate_times): 104 | episode_reward, done = 0, False 105 | s = self.env.reset() 106 | self.agent.reset_rnn_hidden() 107 | while not done: 108 | if self.args.use_state_norm: 109 | s = self.state_norm(s, update=False) 110 | a, a_logprob = self.agent.choose_action(s, evaluate=True) 111 | s_, r, done, _ = self.env.step(a) 112 | episode_reward += r 113 | s = s_ 114 | evaluate_reward += episode_reward 115 | 116 | evaluate_reward = evaluate_reward / self.args.evaluate_times 117 | self.evaluate_rewards.append(evaluate_reward) 118 | print("total_steps:{} \t evaluate_reward:{}".format(self.total_steps, evaluate_reward)) 119 | self.writer.add_scalar('evaluate_step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps) 120 | # Save the rewards and models 121 | np.save('./data_train/PPO_env_{}_number_{}_seed_{}.npy'.format(self.env_name, self.number, self.seed), np.array(self.evaluate_rewards)) 122 | 123 | 124 | if __name__ == '__main__': 125 | parser = argparse.ArgumentParser("Hyperparameter Setting for PPO-discrete") 126 | parser.add_argument("--max_train_steps", type=int, default=int(2e5), help=" Maximum number of training steps") 127 | parser.add_argument("--evaluate_freq", type=float, default=5e3, help="Evaluate the policy every 'evaluate_freq' steps") 128 | parser.add_argument("--save_freq", type=int, default=20, help="Save frequency") 129 | parser.add_argument("--evaluate_times", type=float, default=3, help="Evaluate times") 130 | 131 | parser.add_argument("--batch_size", type=int, default=16, help="Batch size") 132 | parser.add_argument("--mini_batch_size", type=int, default=2, help="Minibatch size") 133 | parser.add_argument("--hidden_dim", type=int, default=64, help="The number of neurons in hidden layers of the neural 
network") 134 | parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate of actor") 135 | parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") 136 | parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter") 137 | parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter") 138 | parser.add_argument("--K_epochs", type=int, default=15, help="PPO parameter") 139 | parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1:advantage normalization") 140 | parser.add_argument("--use_state_norm", type=bool, default=False, help="Trick 2:state normalization") 141 | parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4:reward scaling") 142 | parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy") 143 | parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6:learning rate Decay") 144 | parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: Gradient clip") 145 | parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization") 146 | parser.add_argument("--set_adam_eps", type=float, default=True, help="Trick 9: set Adam epsilon=1e-5") 147 | parser.add_argument("--use_tanh", type=float, default=False, help="Trick 10: tanh activation function") 148 | parser.add_argument("--use_gru", type=bool, default=True, help="Whether to use GRU") 149 | 150 | args = parser.parse_args() 151 | 152 | env_names = ['CartPole-v1', 'LunarLander-v2'] 153 | env_index = 0 154 | for seed in [0, 10, 100]: 155 | runner = Runner(args, env_name=env_names[env_index], number=3, seed=seed) 156 | runner.run() 157 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/README.md: -------------------------------------------------------------------------------- 1 | # PPO-discrete + RNN 2 | This is a concise Pytorch implementation of PPO+RNN(GRU/LSTM) on discrete action space.
3 | 4 | 5 | ## How to use my code? 6 | You can directly run 'PPO_discrete_rnn_main.py' in your own IDE.<br>
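If you prefer launching it from another script or a notebook rather than an IDE, a minimal sketch using only the Python standard library (assuming `PPO_discrete_rnn_main.py` is in the current working directory and no extra command-line arguments are passed, since the script reads `sys.argv` with argparse):

```python
# Runs PPO_discrete_rnn_main.py as if it had been executed directly (a sketch, not part of the repo).
import runpy

runpy.run_path('PPO_discrete_rnn_main.py', run_name='__main__')
```

Running `python PPO_discrete_rnn_main.py` from a terminal in the same directory is equivalent.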
7 | 8 | ## Training environments 9 | You can set the 'env_index' in the code to change the environment. Here, we train in 2 environments.<br>
10 | env_index=0 represents 'CartPole-v1'<br>
11 | env_index=1 represents 'LunarLander-v2' (switching between them is sketched below)<br>
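Concretely, the selection happens at the bottom of `PPO_discrete_rnn_main.py`; the sketch below paraphrases those lines (edit `env_index` before running):

```python
# Excerpt from the __main__ block of PPO_discrete_rnn_main.py: pick the environment
# by index and sweep three seeds. 'args' comes from the argparse parser defined above it.
env_names = ['CartPole-v1', 'LunarLander-v2']
env_index = 1  # 0 -> CartPole-v1, 1 -> LunarLander-v2
for seed in [0, 10, 100]:
    runner = Runner(args, env_name=env_names[env_index], number=3, seed=seed)
    runner.run()
```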
12 | 13 | ## Training result 14 | ![image](https://github.com/Lizhi-sjtu/DRL-code-pytorch/blob/main/9.PPO-discrete-RNN/PPO%2BRNN.png) 15 | 16 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/__pycache__/ppo_discrete_rnn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/ppo_discrete_rnn.cpython-37.pyc -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/__pycache__/replaybuffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/__pycache__/replaybuffer.cpython-37.pyc -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_3_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_100.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_CartPole-v1_number_5_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_3_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_0.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_10.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_10.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/data_train/PPO_env_LunarLander-v2_number_5_seed_100.npy -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd: 5 | # Dynamically calculate mean and std 6 | def __init__(self, shape): # shape:the dimension of input data 7 | self.n = 0 8 | self.mean = np.zeros(shape) 9 | self.S = np.zeros(shape) 10 | self.std = np.sqrt(self.S) 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.n += 1 15 | if self.n == 1: 16 | self.mean = x 17 | self.std = x 18 | else: 19 | old_mean = self.mean.copy() 20 | self.mean = old_mean + (x - old_mean) / self.n 21 | self.S = self.S + (x - old_mean) * (x - self.mean) 22 | self.std = np.sqrt(self.S / self.n) 23 | 24 | 25 | 
class Normalization: 26 | def __init__(self, shape): 27 | self.running_ms = RunningMeanStd(shape=shape) 28 | 29 | def __call__(self, x, update=True): 30 | # Whether to update the mean and std,during the evaluating,update=False 31 | if update: 32 | self.running_ms.update(x) 33 | x = (x - self.running_ms.mean) / (self.running_ms.std + 1e-8) 34 | 35 | return x 36 | 37 | 38 | class RewardScaling: 39 | def __init__(self, shape, gamma): 40 | self.shape = shape # reward shape=1 41 | self.gamma = gamma # discount factor 42 | self.running_ms = RunningMeanStd(shape=self.shape) 43 | self.R = np.zeros(self.shape) 44 | 45 | def __call__(self, x): 46 | self.R = self.gamma * self.R + x 47 | self.running_ms.update(self.R) 48 | x = x / (self.running_ms.std + 1e-8) # Only divided std 49 | return x 50 | 51 | def reset(self): # When an episode is done,we should reset 'self.R' 52 | self.R = np.zeros(self.shape) 53 | 54 | 55 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/ppo_discrete_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler, SequentialSampler 6 | from torch.distributions import Categorical 7 | import copy 8 | 9 | 10 | # Trick 8: orthogonal initialization 11 | def orthogonal_init(layer, gain=np.sqrt(2)): 12 | for name, param in layer.named_parameters(): 13 | if 'bias' in name: 14 | nn.init.constant_(param, 0) 15 | elif 'weight' in name: 16 | nn.init.orthogonal_(param, gain=gain) 17 | 18 | return layer 19 | 20 | 21 | class Actor_Critic_RNN(nn.Module): 22 | def __init__(self, args): 23 | super(Actor_Critic_RNN, self).__init__() 24 | self.use_gru = args.use_gru 25 | self.activate_func = [nn.ReLU(), nn.Tanh()][args.use_tanh] # Trick10: use tanh 26 | 27 | self.actor_rnn_hidden = None 28 | self.actor_fc1 = nn.Linear(args.state_dim, args.hidden_dim) 29 | if args.use_gru: 30 | print("------use GRU------") 31 | self.actor_rnn = nn.GRU(args.hidden_dim, args.hidden_dim, batch_first=True) 32 | else: 33 | print("------use LSTM------") 34 | self.actor_rnn = nn.LSTM(args.hidden_dim, args.hidden_dim, batch_first=True) 35 | self.actor_fc2 = nn.Linear(args.hidden_dim, args.action_dim) 36 | 37 | self.critic_rnn_hidden = None 38 | self.critic_fc1 = nn.Linear(args.state_dim, args.hidden_dim) 39 | if args.use_gru: 40 | self.critic_rnn = nn.GRU(args.hidden_dim, args.hidden_dim, batch_first=True) 41 | else: 42 | self.critic_rnn = nn.LSTM(args.hidden_dim, args.hidden_dim, batch_first=True) 43 | self.critic_fc2 = nn.Linear(args.hidden_dim, 1) 44 | 45 | if args.use_orthogonal_init: 46 | print("------use orthogonal init------") 47 | orthogonal_init(self.actor_fc1) 48 | orthogonal_init(self.actor_rnn) 49 | orthogonal_init(self.actor_fc2, gain=0.01) 50 | orthogonal_init(self.critic_fc1) 51 | orthogonal_init(self.critic_rnn) 52 | orthogonal_init(self.critic_fc2) 53 | 54 | def actor(self, s): 55 | s = self.activate_func(self.actor_fc1(s)) 56 | output, self.actor_rnn_hidden = self.actor_rnn(s, self.actor_rnn_hidden) 57 | logit = self.actor_fc2(output) 58 | return logit 59 | 60 | def critic(self, s): 61 | s = self.activate_func(self.critic_fc1(s)) 62 | output, self.critic_rnn_hidden = self.critic_rnn(s, self.critic_rnn_hidden) 63 | value = self.critic_fc2(output) 64 | return value 65 | 66 | 67 | class PPO_discrete_RNN: 68 | def __init__(self, args): 69 | self.batch_size 
= args.batch_size 70 | self.mini_batch_size = args.mini_batch_size 71 | self.max_train_steps = args.max_train_steps 72 | self.lr = args.lr # Learning rate of actor 73 | self.gamma = args.gamma # Discount factor 74 | self.lamda = args.lamda # GAE parameter 75 | self.epsilon = args.epsilon # PPO clip parameter 76 | self.K_epochs = args.K_epochs # PPO parameter 77 | self.entropy_coef = args.entropy_coef # Entropy coefficient 78 | self.set_adam_eps = args.set_adam_eps 79 | self.use_grad_clip = args.use_grad_clip 80 | self.use_lr_decay = args.use_lr_decay 81 | self.use_adv_norm = args.use_adv_norm 82 | 83 | self.ac = Actor_Critic_RNN(args) 84 | if self.set_adam_eps: # Trick 9: set Adam epsilon=1e-5 85 | self.optimizer = torch.optim.Adam(self.ac.parameters(), lr=self.lr, eps=1e-5) 86 | else: 87 | self.optimizer = torch.optim.Adam(self.ac.parameters(), lr=self.lr) 88 | 89 | def reset_rnn_hidden(self): 90 | self.ac.actor_rnn_hidden = None 91 | self.ac.critic_rnn_hidden = None 92 | 93 | def choose_action(self, s, evaluate=False): 94 | with torch.no_grad(): 95 | s = torch.tensor(s, dtype=torch.float).unsqueeze(0) 96 | logit = self.ac.actor(s) 97 | if evaluate: 98 | a = torch.argmax(logit) 99 | return a.item(), None 100 | else: 101 | dist = Categorical(logits=logit) 102 | a = dist.sample() 103 | a_logprob = dist.log_prob(a) 104 | return a.item(), a_logprob.item() 105 | 106 | def get_value(self, s): 107 | with torch.no_grad(): 108 | s = torch.tensor(s, dtype=torch.float).unsqueeze(0) 109 | value = self.ac.critic(s) 110 | return value.item() 111 | 112 | def train(self, replay_buffer, total_steps): 113 | batch = replay_buffer.get_training_data() # Get training data 114 | 115 | # Optimize policy for K epochs: 116 | for _ in range(self.K_epochs): 117 | for index in BatchSampler(SequentialSampler(range(self.batch_size)), self.mini_batch_size, False): 118 | # If use RNN, we need to reset the rnn_hidden of the actor and critic. 
119 | self.reset_rnn_hidden() 120 | logits_now = self.ac.actor(batch['s'][index]) # logits_now.shape=(mini_batch_size, max_episode_len, action_dim) 121 | values_now = self.ac.critic(batch['s'][index]).squeeze(-1) # values_now.shape=(mini_batch_size, max_episode_len) 122 | 123 | dist_now = Categorical(logits=logits_now) 124 | dist_entropy = dist_now.entropy() # shape(mini_batch_size, max_episode_len) 125 | a_logprob_now = dist_now.log_prob(batch['a'][index]) # shape(mini_batch_size, max_episode_len) 126 | # a/b=exp(log(a)-log(b)) 127 | ratios = torch.exp(a_logprob_now - batch['a_logprob'][index]) # shape(mini_batch_size, max_episode_len) 128 | 129 | # actor loss 130 | surr1 = ratios * batch['adv'][index] 131 | surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * batch['adv'][index] 132 | actor_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy # shape(mini_batch_size, max_episode_len) 133 | actor_loss = (actor_loss * batch['active'][index]).sum() / batch['active'][index].sum() 134 | 135 | # critic_loss 136 | critic_loss = (values_now - batch['v_target'][index]) ** 2 137 | critic_loss = (critic_loss * batch['active'][index]).sum() / batch['active'][index].sum() 138 | 139 | # Update 140 | self.optimizer.zero_grad() 141 | loss = actor_loss + critic_loss * 0.5 142 | loss.backward() 143 | if self.use_grad_clip: # Trick 7: Gradient clip 144 | torch.nn.utils.clip_grad_norm_(self.ac.parameters(), 0.5) 145 | self.optimizer.step() 146 | 147 | if self.use_lr_decay: # Trick 6:learning rate Decay 148 | self.lr_decay(total_steps) 149 | 150 | def lr_decay(self, total_steps): 151 | lr_now = 0.9 * self.lr * (1 - total_steps / self.max_train_steps) + 0.1 * self.lr 152 | for p in self.optimizer.param_groups: 153 | p['lr'] = lr_now 154 | 155 | def save_model(self, env_name, number, seed, total_steps): 156 | torch.save(self.ac.state_dict(), "./model/PPO_actor_env_{}_number_{}_seed_{}_step_{}k.pth".format(env_name, number, seed, int(total_steps / 1000))) 157 | 158 | def load_model(self, env_name, number, seed, step): 159 | self.ac.load_state_dict(torch.load("./model/PPO_actor_env_{}_number_{}_seed_{}_step_{}k.pth".format(env_name, number, seed, step))) 160 | 161 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/replaybuffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import copy 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, args): 8 | self.gamma = args.gamma 9 | self.lamda = args.lamda 10 | self.use_adv_norm = args.use_adv_norm 11 | self.state_dim = args.state_dim 12 | self.action_dim = args.action_dim 13 | self.episode_limit = args.episode_limit 14 | self.batch_size = args.batch_size 15 | self.episode_num = 0 16 | self.max_episode_len = 0 17 | self.buffer = None 18 | self.reset_buffer() 19 | 20 | def reset_buffer(self): 21 | self.buffer = {'s': np.zeros([self.batch_size, self.episode_limit, self.state_dim]), 22 | 'v': np.zeros([self.batch_size, self.episode_limit + 1]), 23 | 'a': np.zeros([self.batch_size, self.episode_limit]), 24 | 'a_logprob': np.zeros([self.batch_size, self.episode_limit]), 25 | 'r': np.zeros([self.batch_size, self.episode_limit]), 26 | 'dw': np.ones([self.batch_size, self.episode_limit]), # Note: We use 'np.ones' to initialize 'dw' 27 | 'active': np.zeros([self.batch_size, self.episode_limit]) 28 | } 29 | self.episode_num = 0 30 | self.max_episode_len = 0 31 | 32 | def store_transition(self, episode_step, s, v, a, 
a_logprob, r, dw): 33 | self.buffer['s'][self.episode_num][episode_step] = s 34 | self.buffer['v'][self.episode_num][episode_step] = v 35 | self.buffer['a'][self.episode_num][episode_step] = a 36 | self.buffer['a_logprob'][self.episode_num][episode_step] = a_logprob 37 | self.buffer['r'][self.episode_num][episode_step] = r 38 | self.buffer['dw'][self.episode_num][episode_step] = dw 39 | 40 | self.buffer['active'][self.episode_num][episode_step] = 1.0 41 | 42 | def store_last_value(self, episode_step, v): 43 | self.buffer['v'][self.episode_num][episode_step] = v 44 | self.episode_num += 1 45 | # Record max_episode_len 46 | if episode_step > self.max_episode_len: 47 | self.max_episode_len = episode_step 48 | 49 | def get_adv(self): 50 | # Calculate the advantage using GAE 51 | v = self.buffer['v'][:, :self.max_episode_len] 52 | v_next = self.buffer['v'][:, 1:self.max_episode_len + 1] 53 | r = self.buffer['r'][:, :self.max_episode_len] 54 | dw = self.buffer['dw'][:, :self.max_episode_len] 55 | active = self.buffer['active'][:, :self.max_episode_len] 56 | adv = np.zeros_like(r) # adv.shape=(batch_size,max_episode_len) 57 | gae = 0 58 | with torch.no_grad(): # adv and v_target have no gradient 59 | # deltas.shape=(batch_size,max_episode_len) 60 | deltas = r + self.gamma * v_next * (1 - dw) - v 61 | for t in reversed(range(self.max_episode_len)): 62 | gae = deltas[:, t] + self.gamma * self.lamda * gae # gae.shape=(batch_size) 63 | adv[:, t] = gae 64 | v_target = adv + v # v_target.shape(batch_size,max_episode_len) 65 | if self.use_adv_norm: # Trick 1:advantage normalization 66 | adv_copy = copy.deepcopy(adv) 67 | adv_copy[active == 0] = np.nan # 忽略掉active=0的那些adv 68 | adv = ((adv - np.nanmean(adv_copy)) / (np.nanstd(adv_copy) + 1e-5)) 69 | return adv, v_target 70 | 71 | def get_training_data(self): 72 | adv, v_target = self.get_adv() 73 | batch = {'s': torch.tensor(self.buffer['s'][:, :self.max_episode_len], dtype=torch.float32), 74 | 'a': torch.tensor(self.buffer['a'][:, :self.max_episode_len], dtype=torch.long), # 动作a的类型必须是long 75 | 'a_logprob': torch.tensor(self.buffer['a_logprob'][:, :self.max_episode_len], dtype=torch.float32), 76 | 'active': torch.tensor(self.buffer['active'][:, :self.max_episode_len], dtype=torch.float32), 77 | 'adv': torch.tensor(adv, dtype=torch.float32), 78 | 'v_target': torch.tensor(v_target, dtype=torch.float32)} 79 | 80 | return batch 81 | -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_0/events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_0/events.out.tfevents.1659015484.DESKTOP-LMKC0MO.6444.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_10/events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_10/events.out.tfevents.1659019387.DESKTOP-LMKC0MO.6444.1 -------------------------------------------------------------------------------- 
/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_100/events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_3_seed_100/events.out.tfevents.1659022676.DESKTOP-LMKC0MO.6444.2 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_0/events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_0/events.out.tfevents.1659083525.DESKTOP-LMKC0MO.2204.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_10/events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_10/events.out.tfevents.1659084659.DESKTOP-LMKC0MO.2204.1 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_100/events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_CartPole-v1_number_5_seed_100/events.out.tfevents.1659085747.DESKTOP-LMKC0MO.2204.2 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_0/events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_0/events.out.tfevents.1659015468.DESKTOP-LMKC0MO.13484.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_10/events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_10/events.out.tfevents.1659031613.DESKTOP-LMKC0MO.13484.1 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_100/events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_3_seed_100/events.out.tfevents.1659045291.DESKTOP-LMKC0MO.13484.2 
-------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_0/events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_0/events.out.tfevents.1659083526.DESKTOP-LMKC0MO.12096.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_10/events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_10/events.out.tfevents.1659083528.DESKTOP-LMKC0MO.11420.0 -------------------------------------------------------------------------------- /9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_100/events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lizhi-sjtu/DRL-code-pytorch/8f767b99ad44990b49f6acf3159660c5594db77e/9.PPO-discrete-RNN/runs/PPO_discrete/env_LunarLander-v2_number_5_seed_100/events.out.tfevents.1659083530.DESKTOP-LMKC0MO.11484.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Lizhi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DRL-code-pytorch 2 | Concise pytorch implementations of DRL algorithms, including REINFORCE, A2C, Rainbow DQN, PPO(discrete and continuous), DDPG, TD3, SAC, PPO-discrete-RNN(LSTM/GRU). 3 | 4 | 5 | # Dependencies 6 | python==3.7.9
7 | numpy==1.19.4
8 | pytorch==1.12.0
9 | tensorboard==0.6.0
10 | gym==0.21.0
11 | --------------------------------------------------------------------------------