├── .gitignore ├── LICENSE ├── README.md ├── algos ├── __init__.py ├── base.py ├── ddpg.py ├── ddqn.py ├── dqn.py ├── offline │ ├── __init__.py │ ├── bcq.py │ ├── bear.py │ ├── cql.py │ ├── plas.py │ ├── sac_offline.py │ └── td3_bc.py ├── ppo.py ├── sac.py └── td3.py ├── common ├── __init__.py ├── buffers.py └── networks.py ├── run ├── bcq_mujoco.py ├── bear_mujoco.py ├── cql_atari.py ├── cql_mujoco.py ├── ddpg_gym.py ├── ddpg_unity.py ├── ddqn_atari.py ├── ddqn_gym.py ├── dqn_atari.py ├── dqn_gym.py ├── plas_mujoco.py ├── ppo_gym.py ├── ppo_mujoco.py ├── sac_gym.py ├── sac_mujoco.py ├── sac_offline_mujoco.py ├── sac_unity.py ├── td3_bc_mujoco.py ├── td3_gym.py ├── td3_mujoco.py └── td3_unity.py └── utils ├── __init__.py ├── atari_preprocess.py ├── atari_wrappers.py ├── data_tools.py ├── eval_plot.py ├── log_tools.py └── train_tools.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | 3 | __pycache__/ 4 | *.pyc 5 | 6 | test/ 7 | 8 | /notes.txt 9 | /run/RunNotes.txt 10 | 11 | /run/results/ 12 | 13 | /algos/experiments -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 dragon-wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL_Algorithms 2 | A lightweight reinforcement learning algorithm library implemented by pytorch 3 | ## Supported algorithms 4 | 5 | ### Online RL 6 | 7 | Interact with the environment during training. 
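The shared interaction loop lives in `OffPolicyBase.learn` (`algos/base.py`). A rough, self-contained sketch of that loop, with a random policy standing in for an agent's `choose_action`:

```python
# Rough shape of the online loop in algos/base.py (OffPolicyBase.learn).
# A random policy stands in for agent.choose_action() so the snippet runs on its own.
import gym

env = gym.make("CartPole-v0")
obs = env.reset()
for step in range(1000):
    action = env.action_space.sample()            # agent.choose_action(obs) in the real loop
    next_obs, reward, done, _ = env.step(action)  # the transition is stored in the replay buffer
    obs = env.reset() if done else next_obs       # ...followed by one agent.train() update
env.close()
```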
8 | 9 | | algorithm | discrete control | continuous control | 10 | | ------------------------------------------------------------ | ---------------- | ------------------ | 11 | | [Deep Q-Network (DQN)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) | ✔ | ⛔ | 12 | | [Double DQN (DDQN)](https://arxiv.org/abs/1509.06461) | ✔ | ⛔ | 13 | | [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | ⛔ | ✔ | 14 | | [Proximal Policy Optimization (PPO)](https://arxiv.org/abs/1707.06347) | ✔ | ✔ | 15 | | [Soft Actor-Critic (SAC)](https://arxiv.org/abs/1812.05905) | ⛔ | ✔ | 16 | | [Twin Delayed Deep Deterministic policy gradient(TD3)](https://arxiv.org/abs/1802.09477) | ⛔ | ✔ | 17 | 18 | ### Offline RL 19 | 20 | Use the existing data set for training, and there is no interaction with the environment during training. 21 | 22 | | algorithm | discrete control | continuous control | 23 | | ------------------------------------------------------------ | ---------------- | ------------------ | 24 | | [Batch-Constrained deep Q-learning (BCQ)](https://arxiv.org/abs/1812.02900) | ⛔ | ✔ | 25 | | [Bootstrapping Error Accumulation Reduction (BEAR)](https://arxiv.org/abs/1906.00949) | ⛔ | ✔ | 26 | | [Policy in the Latent Action Space (PLAS)](https://arxiv.org/abs/2011.07213) | ⛔ | ✔ | 27 | | [Conservative Q-Learning (CQL)](https://arxiv.org/abs/2006.04779) | ✔ | ✔ | 28 | | [TD3 with behavior cloning(TD3-BC)](https://arxiv.org/abs/2106.06860) | ⛔ | ✔ | 29 | 30 | ## To do list 31 | 32 | **Online algorithm:** 33 | 34 | + [Trust Region Policy Optimization(TRPO)](https://proceedings.mlr.press/v37/schulman15.html) 35 | 36 | **Offline algorithm:** 37 | 38 | + [Discrete Batch-Constrained deep Q-Learning (BCQ-Discrete)](https://arxiv.org/abs/1910.01708) 39 | + [Behavior Regularized Actor Critic (BRAC)](https://arxiv.org/abs/1911.11361) 40 | + [Fisher-Behavior Regularized Critic(Fisher-BRC)](https://arxiv.org/abs/2103.08050) 41 | 42 | ## Requirements 43 | 44 | ``` 45 | |Python 3.7 | 46 | |Pytorch 1.7.1 | 47 | |tensorboard 2.7.0 | To view the training curve in real time, 48 | |tqdm 4.62.3 | To show progress bar. 49 | |numpy 1.21.3 | 50 | 51 | |gym 0.19.0 | 52 | |box2d-py 2.3.8 | Include Box2d env, e.g,"BipedalWalker-v2" and "LunarLander-v2". 53 | |atari-py 0.2.6 | Include Atari env, e.g, "Pong", "Breakout" and "SpaceInvaders". 54 | |mujoco-py 2.0.2.8 | Include Mujoco env, e.g, "Hopper-v2", "Ant-v2" and "HalfCheetah-v2". 55 | 56 | |d4rl 1.1 | Only used in Offline RL. Include offline dataset of Mujoco, CARLA and so on. 57 | (Can be installed in "https://github.com/rail-berkeley/d4rl") 58 | |d4rl-atari 0.1 | Only used in Offline RL. Include offline dataset of Atari. 59 | (Can be installed in "https://github.com/takuseno/d4rl-atari") 60 | |mlagents 0.27.0 | To train agents in unity's self built environment. 
61 | (Can be installed in "https://github.com/Unity-Technologies/ml-agents") 62 | ``` 63 | 64 | ## Quick start 65 | 66 | ### To train the agents on the environments 67 | 68 | ```shell 69 | git clone https://github.com/dragon-wang/RL_Algorithms.git 70 | cd RL_Algorithms/run 71 | 72 | # train DQN 73 | python dqn_gym.py --env=CartPole-v0 --train_id=dqn_test 74 | 75 | # train DDPG 76 | python ddpg_gym.py --env=Pendulum-v0 --train_id=ddpg_Pendulum-v0 77 | python ddpg_unity.py --train_id=ddpg_unity_test 78 | 79 | # train PPO 80 | python ppo_gym.py --env=CartPole-v0 --train_id=ppo_CartPole-v0 81 | python ppo_mujoco.py --env=Hopper-v2 --train_id=ppo_Hopper-v2 82 | 83 | # train SAC 84 | python sac_gym.py --env=Pendulum-v0 --train_id=sac_Pendulum-v0 85 | python sac_mujoco.py --env=Hopper-v2 --train_id=sac_Hopper-v2 --max_train_step=2000000 --auto 86 | python sac_unity.py --train_id=sac_unity_test --auto 87 | 88 | # train TD3 89 | python td3_gym.py --env=Pendulum-v0 --train_id=td3_Pendulum-v0 90 | python td3_mujoco.py --env=Hopper-v2 --train_id=td3_Hopper-v2 91 | python td3_unity.py --train_id=td3_unity_test 92 | 93 | # train BCQ 94 | python bcq_mujoco.py --train_id=bcq_hopper-mudium-v2 --env=hopper-medium-v2 --device=cuda 95 | 96 | # train PLAS 97 | python plas_mujoco.py --train_id=plas_hopper-mudium-v2 --env=hopper-medium-v2 --device=cuda 98 | 99 | # train CQL 100 | python cql_mujoco.py --train_id=cql_hopper-mudium-v2 --env=hopper-medium-v2 --auto_alpha --entropy_backup --with_lagrange --lagrange_thresh=10.0 --device=cuda 101 | 102 | # train BEAR 103 | python bear_mujoco.py --env=hopper-medium-v2 --train_id=bear_hopper-mudium-v2 --kernel_type=laplacian --seed=10 --device=cuda 104 | ``` 105 | 106 | Some command line common parameters: 107 | 108 | + `--env`: the name of environment.(`--env=xxx`) 109 | + `--capacity`: the max size of replay buffer.(`--capacity=xxx`) 110 | + `--batch_size`: the size of batch that sampled from buffer.(`--batch_size=xxx`) 111 | + `--explore_step`: the steps of exploration before train.(`--explore_step=xxx`) 112 | + `--eval_freq`: how often (time steps) we evaluate during training, and it will not evaluate if `eval_freq < 0`(but in offline algorithms, we must evaluate during training).(`--eval_freq=xxx`) 113 | + `--max_train_step`: the max train step.(`--max_train_step=xxx`) 114 | + `--log_interval`: the number of steps taken to record the model and the tensorboard.(`--log_interval=xxx`) 115 | + `--train_id`: path to save model and log tensorboard.(`--train_id=xxx`) 116 | + `--resume`: whether load the last saved model to train.(`--resume`) 117 | + `--device`: choose device.(`--device=cpu` or `--device=cuda`) 118 | + `--show`: show the trained model visually.(`--show`) 119 | + `--seed`: the random seed of env or neural network(`--seed=xxx`) 120 | 121 | The specific parameters for each algorithm can be viewed in the "xxx.py" files under the "run" folder. Of course I have also provided some default parameters. 122 | 123 | **Note that your trained model and tensorboard files are stored in the "results/your train_id" folder.** 124 | 125 | ### Use tensorboard to view the training curve 126 | 127 | ``` 128 | cd run 129 | 130 | tensorboard --logdir results 131 | ``` 132 | 133 | You can then view the training curve by typing "http://localhost:6006/" into your browser. 
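### Use the agents programmatically (optional)

The scripts in `run/` essentially build the networks and a replay buffer, construct an `*_Agent`, and call `learn()`. The sketch below does the same for DQN directly in Python. The `DQN_Agent` keyword arguments are taken from `algos/dqn.py` and `algos/base.py`; `TinyQNet` and `TinyReplayBuffer` are stand-ins written for this example that only mirror the interface the agents expect (the library's own network and buffer classes live in `common/networks.py` and `common/buffers.py`, whose constructors are not shown here). Run it from the repository root so the `algos` package is importable.

```python
# Minimal programmatic DQN training run (a sketch; the stand-in classes below are
# NOT part of this repository -- they only mimic the interfaces used in algos/dqn.py).
import random
import gym
import numpy as np
import torch
import torch.nn as nn

from algos.dqn import DQN_Agent


class TinyQNet(nn.Module):
    """Stand-in Q-network: maps an observation to one Q-value per discrete action."""
    def __init__(self, obs_dim, act_num):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, act_num))

    def forward(self, obs):
        return self.mlp(obs)


class TinyReplayBuffer:
    """Stand-in buffer exposing the add()/sample() interface the agents use."""
    def __init__(self, capacity=10000, batch_size=64):
        self.capacity, self.batch_size, self.data = capacity, batch_size, []

    def add(self, obs, act, rew, next_obs, done):
        if len(self.data) >= self.capacity:
            self.data.pop(0)
        self.data.append((np.array(obs), act, rew, np.array(next_obs), float(done)))

    def sample(self):
        batch = random.sample(self.data, min(self.batch_size, len(self.data)))
        obs, acts, rews, next_obs, done = zip(*batch)
        return {"obs": torch.FloatTensor(np.array(obs)),
                "acts": torch.LongTensor(acts).unsqueeze(1),   # shape (B, 1) for gather()
                "rews": torch.FloatTensor(rews),
                "next_obs": torch.FloatTensor(np.array(next_obs)),
                "done": torch.FloatTensor(done)}


env = gym.make("CartPole-v0")
agent = DQN_Agent(Q_net=TinyQNet(env.observation_space.shape[0], env.action_space.n),
                  replay_buffer=TinyReplayBuffer(),
                  explore_step=500,
                  env=env, gamma=0.99, eval_freq=1000, max_train_step=20000,
                  train_id="dqn_programmatic_sketch", log_interval=1000,
                  resume=False, device="cpu")
agent.learn()
```

As with the command-line scripts, checkpoints and TensorBoard logs end up in `run/results/<train_id>` (see `PolicyBase` in `algos/base.py`).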
134 | 135 | ## Continue to train from last checkpoint 136 | 137 | You just need to add `--resume` after your command line, such as: 138 | 139 | ```shell 140 | python sac_mujoco.py --env=Hopper-v2 --train_id=sac_Hopper-v2 --max_train_step=2000000 --auto --resume 141 | ``` 142 | 143 | **Note that the "train_id" must be the same as your last training id.** 144 | 145 | ## Show trained agent 146 | 147 | You can view the display of the trained agent via `--show`, such as: 148 | 149 | ```shell 150 | python sac_mujoco.py --env=Hopper-v2 --train_id=sac_Hopper-v2 --show 151 | ``` 152 | 153 | **Note that the "train_id" must be the same as the id of the agent you want to see.** 154 | -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/algos/__init__.py -------------------------------------------------------------------------------- /algos/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC, ABCMeta 2 | import torch 3 | from utils import log_tools 4 | from utils.train_tools import explore_before_train, evaluate 5 | import numpy as np 6 | import os 7 | 8 | class PolicyBase(ABC): 9 | def __init__(self, 10 | env, # RL environment object 11 | gamma, # The decay factor 12 | eval_freq, # How often (time steps) the policy is evaluated. it will not evaluate the agent during train if eval_freq < 0. 13 | max_train_step, # The max train step 14 | train_id, # The name and path to save model and log tensorboard 15 | log_interval, # The number of steps taken to record the model and the tensorboard 16 | resume, # Whether load the last saved model and continue to train 17 | device, # The device. Choose cpu or cuda 18 | ): 19 | self.env = env 20 | self.gamma = gamma 21 | self.eval_freq = eval_freq 22 | self.max_train_step = max_train_step 23 | self.train_id = train_id 24 | self.log_interval = log_interval 25 | self.resume = resume 26 | self.device = torch.device(device) 27 | 28 | self.train_step = 0 29 | 30 | self.result_dir = os.path.join(log_tools.ROOT_DIR, "run/results", self.train_id) 31 | self.checkpoint_path = os.path.join(self.result_dir, "checkpoint.pth") 32 | 33 | @abstractmethod 34 | def choose_action(self, obs, eval=False): 35 | """Select an action according to the observation 36 | 37 | Args: 38 | obs (_type_): The observation 39 | eval (bool): Whether used in evaluation 40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | def train(self): 45 | """The main body of rl algorithm 46 | """ 47 | pass 48 | 49 | @abstractmethod 50 | def learn(self): 51 | """The main loop of training process 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def store_agent_checkpoint(self): 57 | """Save training data. (e.g. neural network parameters, optimizer parameters, training steps, ...) 
58 | """ 59 | pass 60 | 61 | @abstractmethod 62 | def load_agent_checkpoint(self): 63 | """Load training data 64 | """ 65 | pass 66 | 67 | 68 | class OffPolicyBase(PolicyBase): 69 | def __init__(self, 70 | replay_buffer, # The replay buffer 71 | explore_step, # Steps to explore the environment before training 72 | **kwargs # The parameters of the parent class 73 | ): 74 | super().__init__(**kwargs) 75 | 76 | self.replay_buffer = replay_buffer 77 | self.explore_step = explore_step 78 | 79 | self.episode_num = 0 80 | 81 | def choose_action(self, obs, eval=False): 82 | raise NotImplementedError 83 | 84 | def train(self): 85 | raise NotImplementedError 86 | 87 | def learn(self): 88 | # Make the directory to save the training results that consist of checkpoint files and tensorboard files 89 | log_tools.make_dir(self.result_dir) 90 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 91 | 92 | if self.resume: 93 | self.load_agent_checkpoint() 94 | else: 95 | # delete tensorboard log file 96 | log_tools.del_all_files_in_dir(self.result_dir) 97 | 98 | explore_before_train(self.env, self.replay_buffer, self.explore_step) 99 | print("==============================start train===================================") 100 | obs = self.env.reset() 101 | 102 | episode_reward = 0 103 | episode_length = 0 104 | 105 | # The main loop of "choose action -> act action -> add buffer -> train policy" 106 | while self.train_step < self.max_train_step: 107 | action = self.choose_action(np.array(obs), eval=False) 108 | next_obs, reward, done, _ = self.env.step(action) 109 | episode_reward += reward 110 | self.replay_buffer.add(obs, action, reward, next_obs, done) 111 | obs = next_obs 112 | episode_length += 1 113 | 114 | train_summaries = self.train() 115 | 116 | if done: 117 | self.episode_num += 1 118 | obs = self.env.reset() 119 | 120 | print(f"Time Step: {self.train_step} Episode Num: {self.episode_num}" 121 | f"Episode Length: {episode_length} Episode Reward: {episode_reward:.2f}") 122 | tensorboard_writer.log_learn_data({"episode_length": episode_length, 123 | "episode_reward": episode_reward}, self.train_step) 124 | episode_reward = 0 125 | episode_length = 0 126 | 127 | if self.train_step % self.log_interval == 0: 128 | self.store_agent_checkpoint() 129 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 130 | 131 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 132 | evaluate_summaries = evaluate(agent=self, episode_num=10) 133 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 134 | 135 | def store_agent_checkpoint(self): 136 | raise NotImplementedError 137 | 138 | def load_agent_checkpoint(self): 139 | raise NotImplementedError 140 | 141 | 142 | class OfflineBase(PolicyBase): 143 | def __init__(self, data_buffer, **kwargs): 144 | super().__init__(**kwargs) 145 | self.data_buffer = data_buffer 146 | 147 | def choose_action(self, obs, eval=True): 148 | """In offline settings, 149 | since the agent does not interact with the environment during training, 150 | this function is only used during evaluation. 
151 | """ 152 | raise NotImplementedError 153 | 154 | def train(self): 155 | raise NotImplementedError 156 | 157 | def learn(self): 158 | # Make the directory to save the training results that consist of checkpoint files and tensorboard files 159 | log_tools.make_dir(self.result_dir) 160 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 161 | 162 | if self.resume: 163 | self.load_agent_checkpoint() 164 | else: 165 | # delete tensorboard log file 166 | log_tools.del_all_files_in_dir(self.result_dir) 167 | 168 | while self.train_step < self.max_train_step: 169 | train_summaries = self.train() 170 | 171 | if self.train_step % self.log_interval == 0: 172 | self.store_agent_checkpoint() 173 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 174 | 175 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 176 | evaluate_summaries = evaluate(agent=self, episode_num=10) 177 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 178 | 179 | def store_agent_checkpoint(self): 180 | raise NotImplementedError 181 | 182 | def load_agent_checkpoint(self): 183 | raise NotImplementedError -------------------------------------------------------------------------------- /algos/ddpg.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OffPolicyBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class DDPG_Agent(OffPolicyBase): 10 | """ 11 | Implementation of Deep Deterministic Policy Gradient (DDPG) 12 | https://arxiv.org/abs/1509.02971 13 | """ 14 | def __init__(self, 15 | actor_net: torch.nn.Module, 16 | critic_net: torch.nn.Module, 17 | actor_lr=1e-4, 18 | critic_lr=1e-3, 19 | tau=0.005, # used to update target network, w' = tau*w + (1-tau)*w' 20 | gaussian_noise_sigma=0.2, 21 | **kwargs 22 | ): 23 | super().__init__(**kwargs) 24 | 25 | self.action_num = self.env.action_space.shape[0] 26 | self.action_bound = self.env.action_space.high[0] 27 | 28 | # the network and optimizers 29 | self.actor_net = actor_net.to(self.device) 30 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 31 | self.critic_net = critic_net.to(self.device) 32 | self.target_critic_net = copy.deepcopy(self.critic_net).to(self.device) 33 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 34 | self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=critic_lr) 35 | 36 | self.tau = tau 37 | self.gaussian_noise_sigma = gaussian_noise_sigma 38 | 39 | def choose_action(self, obs, eval=False): 40 | """Choose an action by deterministic policy with some gaussian noise""" 41 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 42 | with torch.no_grad(): 43 | action = self.actor_net(obs).cpu().numpy().flatten() 44 | if eval: 45 | return action 46 | else: 47 | noise = np.random.normal(0, self.gaussian_noise_sigma, size=self.action_num) 48 | return (action + noise).clip(-self.action_bound, self.action_bound) 49 | 50 | def train(self): 51 | 52 | # Sample 53 | batch = self.replay_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | # Compute target Q value 61 | with torch.no_grad(): 62 | next_act = self.target_actor_net(next_obs) 63 | next_Q = 
self.target_critic_net(next_obs, next_act).squeeze(1) 64 | target_Q = rews + (1. - done) * self.gamma * next_Q 65 | 66 | # Compute current Q 67 | current_Q = self.critic_net(obs, acts).squeeze(1) 68 | 69 | # Compute critic loss 70 | critic_loss = F.mse_loss(current_Q, target_Q) 71 | 72 | # Compute actor loss 73 | actor_loss = -self.critic_net(obs, self.actor_net(obs)).mean() 74 | 75 | # Optimize actor net 76 | self.actor_optimizer.zero_grad() 77 | actor_loss.backward() 78 | self.actor_optimizer.step() 79 | 80 | # Optimize critic net 81 | self.critic_optimizer.zero_grad() 82 | critic_loss.backward() 83 | self.critic_optimizer.step() 84 | 85 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 86 | soft_target_update(self.critic_net, self.target_critic_net, tau=self.tau) 87 | 88 | self.train_step += 1 89 | 90 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 91 | "critic_loss": critic_loss.cpu().item()} 92 | return train_summaries 93 | 94 | def store_agent_checkpoint(self): 95 | checkpoint = { 96 | "actor_net": self.actor_net.state_dict(), 97 | "critic_net": self.critic_net.state_dict(), 98 | "actor_optimizer": self.actor_optimizer.state_dict(), 99 | "critic_optimizer": self.critic_optimizer.state_dict(), 100 | "train_step": self.train_step, 101 | "episode_num": self.episode_num 102 | } 103 | torch.save(checkpoint, self.checkpoint_path) 104 | 105 | def load_agent_checkpoint(self): 106 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 107 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 108 | self.target_actor_net.load_state_dict(checkpoint["actor_net"]) 109 | self.critic_net.load_state_dict(checkpoint["critic_net"]) 110 | self.target_critic_net.load_state_dict(checkpoint["critic_net"]) 111 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 112 | self.critic_optimizer.load_state_dict(checkpoint["critic_optimizer"]) 113 | self.train_step = checkpoint["train_step"] 114 | self.episode_num = checkpoint["episode_num"] 115 | print("load checkpoint from \"" + self.checkpoint_path + 116 | "\" at " + str(self.train_step) + " time step") 117 | -------------------------------------------------------------------------------- /algos/ddqn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | from algos.base import OffPolicyBase 5 | from utils.train_tools import hard_target_update 6 | 7 | 8 | class DDQN_Agent(OffPolicyBase): 9 | """ 10 | Implementation of Double DQN (DDQN) 11 | https://arxiv.org/abs/1509.06461 12 | """ 13 | def __init__(self, 14 | Q_net: torch.nn.Module, 15 | qf_lr=0.001, 16 | initial_eps=0.1, 17 | end_eps=0.001, 18 | eps_decay_period=2000, 19 | eval_eps=0.001, 20 | target_update_freq =10, 21 | **kwargs 22 | ): 23 | super().__init__(**kwargs) 24 | 25 | self.target_update_freq = target_update_freq 26 | 27 | self.Q_net = Q_net.to(self.device) 28 | self.target_Q_net = copy.deepcopy(self.Q_net).to(self.device) 29 | self.optimizer = torch.optim.Adam(self.Q_net.parameters(), lr=qf_lr) 30 | 31 | # Decay for epsilon 32 | self.initial_eps = initial_eps 33 | self.end_eps = end_eps 34 | self.slope = (self.end_eps - self.initial_eps) / eps_decay_period 35 | self.eval_eps = eval_eps 36 | 37 | def choose_action(self, obs, eval=False): 38 | eps = self.eval_eps if eval else max(self.slope * self.train_step + self.initial_eps, self.end_eps) 39 | 40 | if np.random.uniform(0, 1) > eps: 41 | with 
torch.no_grad(): 42 | obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device) 43 | return int(self.Q_net(obs).argmax(dim=1).cpu()) 44 | else: 45 | return self.env.action_space.sample() 46 | 47 | def train(self): 48 | """ 49 | Sample a batch of data from replay buffer and train 50 | """ 51 | 52 | # Sample 53 | batch = self.replay_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | # Compute target Q value (Double DQN) 61 | with torch.no_grad(): 62 | next_acts = self.Q_net(next_obs).max(dim=1)[1].unsqueeze(1) # use Q net to get next actions, rather than target Q net 63 | target_Q = self.target_Q_net(next_obs).gather(1, next_acts).squeeze(1) 64 | target_Q = rews + (1. - done) * self.gamma * target_Q 65 | 66 | # Compute current Q value 67 | current_Q = self.Q_net(obs).gather(1, acts.long()).squeeze(1) 68 | 69 | # Compute Q loss 70 | q_loss = 0.5 * (target_Q - current_Q).pow(2).mean() 71 | # q_loss = F.mse_loss(current_Q, target_Q) 72 | 73 | # Optimize the Q network 74 | self.optimizer.zero_grad() 75 | q_loss.backward() 76 | self.optimizer.step() 77 | 78 | self.train_step += 1 79 | 80 | # update target Q 81 | if self.train_step % self.target_update_freq == 0: 82 | hard_target_update(self.Q_net, self.target_Q_net) 83 | 84 | train_summaries = {"q_loss": q_loss.cpu().item()} 85 | 86 | return train_summaries 87 | 88 | def store_agent_checkpoint(self): 89 | checkpoint = { 90 | "net": self.Q_net.state_dict(), 91 | "optimizer": self.optimizer.state_dict(), 92 | "train_step": self.train_step, 93 | "episode_num": self.episode_num 94 | } 95 | torch.save(checkpoint, self.checkpoint_path) 96 | 97 | def load_agent_checkpoint(self): 98 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 99 | self.Q_net.load_state_dict(checkpoint["net"]) 100 | self.target_Q_net = copy.deepcopy(self.Q_net) 101 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 102 | self.train_step = checkpoint["train_step"] 103 | self.episode_num = checkpoint["episode_num"] 104 | print("load checkpoint from \"" + self.checkpoint_path + 105 | "\" at " + str(self.train_step) + " time step") 106 | -------------------------------------------------------------------------------- /algos/dqn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | from algos.base import OffPolicyBase 5 | from utils.train_tools import hard_target_update 6 | 7 | 8 | class DQN_Agent(OffPolicyBase): 9 | """ 10 | Implementation of Deep Q-Network (DQN) 11 | https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf 12 | """ 13 | def __init__(self, 14 | Q_net: torch.nn.Module, 15 | qf_lr=0.001, 16 | initial_eps=0.1, 17 | end_eps=0.001, 18 | eps_decay_period=2000, 19 | eval_eps=0.001, 20 | target_update_freq =10, 21 | **kwargs 22 | ): 23 | super().__init__(**kwargs) 24 | 25 | self.target_update_freq = target_update_freq 26 | 27 | self.Q_net = Q_net.to(self.device) 28 | self.target_Q_net = copy.deepcopy(self.Q_net).to(self.device) 29 | self.optimizer = torch.optim.Adam(self.Q_net.parameters(), lr=qf_lr) 30 | 31 | # Decay for epsilon 32 | self.initial_eps = initial_eps 33 | self.end_eps = end_eps 34 | self.slope = (self.end_eps - self.initial_eps) / eps_decay_period 35 | self.eval_eps = eval_eps 36 | 37 | def 
choose_action(self, obs, eval=False): 38 | eps = self.eval_eps if eval else max(self.slope * self.train_step + self.initial_eps, self.end_eps) 39 | 40 | if np.random.uniform(0, 1) > eps: 41 | with torch.no_grad(): 42 | obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device) 43 | return int(self.Q_net(obs).argmax(dim=1).cpu()) 44 | else: 45 | return self.env.action_space.sample() 46 | 47 | def train(self): 48 | """ 49 | Sample a batch of data from replay buffer and train 50 | """ 51 | 52 | # Sample 53 | batch = self.replay_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | # Compute target Q value 61 | with torch.no_grad(): 62 | target_q = rews + (1. - done) * self.gamma * self.target_Q_net(next_obs).max(dim=1)[0] 63 | 64 | # Compute current Q value 65 | current_q = self.Q_net(obs).gather(1, acts.long()).squeeze(1) 66 | 67 | # Compute Q loss 68 | q_loss = 0.5 * (target_q - current_q).pow(2).mean() 69 | # Q_loss = F.mse_loss(current_Q, target_Q) 70 | 71 | # Optimize the Q network 72 | self.optimizer.zero_grad() 73 | q_loss.backward() 74 | self.optimizer.step() 75 | 76 | self.train_step += 1 77 | 78 | # update target Q 79 | if self.train_step % self.target_update_freq == 0: 80 | hard_target_update(self.Q_net, self.target_Q_net) 81 | 82 | train_summaries = {"q_loss": q_loss.cpu().item()} 83 | 84 | return train_summaries 85 | 86 | def store_agent_checkpoint(self): 87 | checkpoint = { 88 | "net": self.Q_net.state_dict(), 89 | "optimizer": self.optimizer.state_dict(), 90 | "train_step": self.train_step, 91 | "episode_num": self.episode_num 92 | } 93 | torch.save(checkpoint, self.checkpoint_path) 94 | 95 | def load_agent_checkpoint(self): 96 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 97 | self.Q_net.load_state_dict(checkpoint["net"]) 98 | self.target_Q_net = copy.deepcopy(self.Q_net) 99 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 100 | self.train_step = checkpoint["train_step"] 101 | self.episode_num = checkpoint["episode_num"] 102 | print("load checkpoint from \"" + self.checkpoint_path + 103 | "\" at " + str(self.train_step) + " time step") -------------------------------------------------------------------------------- /algos/offline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/algos/offline/__init__.py -------------------------------------------------------------------------------- /algos/offline/bcq.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from utils.train_tools import soft_target_update 6 | 7 | 8 | class BCQ_Agent(OfflineBase): 9 | """ 10 | Implementation of Batch-Constrained deep Q-learning(BCQ) in continuous action space 11 | https://arxiv.org/abs/1812.02900 12 | """ 13 | def __init__(self, 14 | critic_net1: torch.nn.Module, 15 | critic_net2: torch.nn.Module, 16 | perturbation_net: torch.nn.Module, 17 | cvae_net: torch.nn.Module, # generation model 18 | critic_lr=1e-3, 19 | per_lr=1e-3, 20 | cvae_lr=1e-3, 21 | tau=0.005, 22 | lmbda=0.75, # used for double clipped double q-learning 23 | **kwargs 24 | ): 25 | 
super().__init__(**kwargs) 26 | 27 | self.critic_net1 = critic_net1.to(self.device) 28 | self.critic_net2 = critic_net2.to(self.device) 29 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 30 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 31 | self.perturbation_net = perturbation_net.to(self.device) 32 | self.target_perturbation_net = copy.deepcopy(self.perturbation_net).to(self.device) 33 | self.cvae_net = cvae_net.to(self.device) 34 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 35 | self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 36 | self.perturbation_optimizer = torch.optim.Adam(self.perturbation_net.parameters(), lr=per_lr) 37 | self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr) 38 | 39 | self.tau = tau 40 | self.lmbda = lmbda 41 | 42 | def choose_action(self, obs, eval=True): 43 | with torch.no_grad(): 44 | obs = torch.FloatTensor(obs).reshape(1, -1).repeat(100, 1).to(self.device) 45 | generated_action = self.cvae_net.decode(obs, z_device=self.device) 46 | perturbed_action = self.perturbation_net(obs, generated_action) 47 | q1 = self.critic_net1(obs, perturbed_action) 48 | ind = q1.argmax(dim=0) 49 | return perturbed_action[ind].cpu().data.numpy().flatten() 50 | 51 | def train(self): 52 | # Sample 53 | batch = self.data_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | """ 61 | CVAE Loss (the generation model) 62 | """ 63 | recon_action, mu, log_std = self.cvae_net(obs, acts) 64 | cvae_loss = self.cvae_net.loss_function(recon_action, acts, mu, log_std) 65 | 66 | self.cvae_optimizer.zero_grad() 67 | cvae_loss.backward() 68 | self.cvae_optimizer.step() 69 | 70 | """ 71 | Critic Loss 72 | """ 73 | with torch.no_grad(): 74 | # generate 10 actions for every next_obs 75 | next_obs = torch.repeat_interleave(next_obs, repeats=10, dim=0).to(self.device) 76 | generated_action = self.cvae_net.decode(next_obs, z_device=self.device) 77 | # perturb the generated action 78 | perturbed_action = self.target_perturbation_net(next_obs, generated_action) 79 | # compute target Q value of perturbed action 80 | target_q1 = self.target_critic_net1(next_obs, perturbed_action) 81 | target_q2 = self.target_critic_net2(next_obs, perturbed_action) 82 | # soft clipped double q-learning 83 | target_q = self.lmbda * torch.min(target_q1, target_q2) + (1. - self.lmbda) * torch.max(target_q1, target_q2) 84 | # take max over each action sampled from the generation and perturbation model 85 | target_q = target_q.reshape(obs.shape[0], 10, 1).max(1)[0].squeeze(1) 86 | target_q = rews + self.gamma * (1. 
- done) * target_q 87 | 88 | # compute current Q 89 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 90 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 91 | # compute critic loss 92 | critic_loss1 = F.mse_loss(current_q1, target_q) 93 | critic_loss2 = F.mse_loss(current_q2, target_q) 94 | 95 | self.critic_optimizer1.zero_grad() 96 | critic_loss1.backward() 97 | self.critic_optimizer1.step() 98 | 99 | self.critic_optimizer2.zero_grad() 100 | critic_loss2.backward() 101 | self.critic_optimizer2.step() 102 | 103 | """ 104 | Perturbation Loss 105 | """ 106 | generated_action_ = self.cvae_net.decode(obs, z_device=self.device) 107 | perturbed_action_ = self.perturbation_net(obs, generated_action_) 108 | perturbation_loss = -self.critic_net1(obs, perturbed_action_).mean() 109 | 110 | self.perturbation_optimizer.zero_grad() 111 | perturbation_loss.backward() 112 | self.perturbation_optimizer.step() 113 | 114 | """ 115 | Update target networks 116 | """ 117 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 118 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 119 | soft_target_update(self.perturbation_net, self.target_perturbation_net, tau=self.tau) 120 | 121 | self.train_step += 1 122 | 123 | train_summaries = {"cvae_loss": cvae_loss.cpu().item(), 124 | "critic_loss1": critic_loss1.cpu().item(), 125 | "critic_loss2": critic_loss2.cpu().item(), 126 | "perturbation_loss": perturbation_loss.cpu().item()} 127 | 128 | return train_summaries 129 | 130 | def store_agent_checkpoint(self): 131 | checkpoint = { 132 | "critic_net1": self.critic_net1.state_dict(), 133 | "critic_net2": self.critic_net2.state_dict(), 134 | "perturbation_net": self.perturbation_net.state_dict(), 135 | "cvae_net": self.cvae_net.state_dict(), 136 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 137 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 138 | "perturbation_optimizer": self.perturbation_optimizer.state_dict(), 139 | "cvae_optimizer": self.cvae_optimizer.state_dict(), 140 | "train_step": self.train_step, 141 | } 142 | torch.save(checkpoint, self.checkpoint_path) 143 | 144 | def load_agent_checkpoint(self): 145 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 146 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 147 | self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 148 | self.perturbation_net.load_state_dict(checkpoint["perturbation_net"]) 149 | self.cvae_net.load_state_dict(checkpoint["cvae_net"]) 150 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 151 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 152 | self.perturbation_optimizer.load_state_dict(checkpoint["perturbation_optimizer"]) 153 | self.cvae_optimizer.load_state_dict(checkpoint["cvae_optimizer"]) 154 | self.train_step = checkpoint["train_step"] 155 | 156 | print("load checkpoint from \"" + self.checkpoint_path + 157 | "\" at " + str(self.train_step) + " time step") 158 | -------------------------------------------------------------------------------- /algos/offline/bear.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from utils.train_tools import soft_target_update 6 | from common.networks import MLPSquashedReparamGaussianPolicy, CVAE, MLPQsaNet 7 | 8 | 9 | class BEAR_Agent(OfflineBase): 10 | 
""" 11 | Implementation of Bootstrapping Error Accumulation Reduction (BEAR) 12 | https://arxiv.org/abs/1906.00949 13 | BEAR's MMD Loss's weight alpha_prime is tuned automatically by default. 14 | 15 | Actor Loss: alpha_prime * MMD Loss + -minQ(s,a) 16 | Critic Loss: Like BCQ 17 | Alpha_prime Loss: -(alpha_prime * (MMD Loss - threshold)) 18 | """ 19 | def __init__(self, 20 | policy_net: MLPSquashedReparamGaussianPolicy, # actor 21 | q_net1: MLPQsaNet, # critic 22 | q_net2: MLPQsaNet, 23 | cvae_net: CVAE, 24 | policy_lr=1e-4, 25 | qf_lr=3e-4, 26 | cvae_lr=3e-4, 27 | tau=0.05, 28 | 29 | # BEAR 30 | lmbda=0.75, # used for double clipped double q-learning 31 | mmd_sigma=20.0, # the sigma used in mmd kernel 32 | kernel_type='gaussian', # the type of mmd kernel(gaussian or laplacian) 33 | lagrange_thresh=0.05, # the hyper-parameter used in automatic tuning alpha in cql loss 34 | n_action_samples=100, # the number of action samples to compute the best action when choose action 35 | n_target_samples=10, # the number of action samples to compute BCQ-like target value 36 | n_mmd_action_samples=4, # the number of action samples to compute MMD. 37 | warmup_step=40000, # do support matching with a warm start before policy(actor) train 38 | **kwargs 39 | ): 40 | super().__init__(**kwargs) 41 | 42 | # the network and optimizers 43 | self.policy_net = policy_net.to(self.device) 44 | self.q_net1 = q_net1.to(self.device) 45 | self.q_net2 = q_net2.to(self.device) 46 | self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device) 47 | self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device) 48 | self.cvae_net = cvae_net.to(self.device) 49 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr) 50 | self.q_optimizer1 = torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr) 51 | self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr) 52 | self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr) 53 | 54 | self.tau = tau 55 | 56 | self.lmbda = lmbda 57 | self.mmd_sigma = mmd_sigma 58 | self.kernel_type = kernel_type 59 | self.lagrange_thresh = lagrange_thresh 60 | self.n_action_samples = n_action_samples 61 | self.n_target_samples = n_target_samples 62 | self.n_mmd_action_samples = n_mmd_action_samples 63 | self.warmup_step = warmup_step 64 | 65 | # mmd loss's temperature 66 | self.log_alpha_prime = torch.zeros(1, requires_grad=True, device=self.device) 67 | self.alpha_prime_optimizer = torch.optim.Adam([self.log_alpha_prime], lr=1e-3) 68 | 69 | def choose_action(self, obs, eval=True): 70 | with torch.no_grad(): 71 | obs = torch.FloatTensor(obs).reshape(1, -1).repeat(self.n_action_samples, 1).to(self.device) 72 | action, _, _ = self.policy_net(obs) 73 | q1 = self.q_net1(obs, action) 74 | ind = q1.argmax(dim=0) 75 | return action[ind].cpu().numpy().flatten() 76 | 77 | def mmd_loss_laplacian(self, samples1, samples2, sigma=0.2): 78 | """MMD constraint with Laplacian kernel for support matching""" 79 | # sigma is set to 10.0 for hopper, cheetah and 20 for walker/ant 80 | diff_x_x = samples1.unsqueeze(2) - samples1.unsqueeze(1) # B x N x N x d 81 | diff_x_x = torch.mean((-(diff_x_x.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 82 | 83 | diff_x_y = samples1.unsqueeze(2) - samples2.unsqueeze(1) 84 | diff_x_y = torch.mean((-(diff_x_y.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 85 | 86 | diff_y_y = samples2.unsqueeze(2) - samples2.unsqueeze(1) # B x N x N x d 87 | diff_y_y = torch.mean((-(diff_y_y.abs()).sum(-1)/(2.0 * sigma)).exp(), 
dim=(1, 2)) 88 | 89 | overall_loss = (diff_x_x + diff_y_y - 2.0 * diff_x_y + 1e-6).sqrt() 90 | return overall_loss 91 | 92 | def mmd_loss_gaussian(self, samples1, samples2, sigma=0.2): 93 | """MMD constraint with Gaussian Kernel support matching""" 94 | # sigma is set to 10.0 for hopper, cheetah and 20 for walker/ant 95 | diff_x_x = samples1.unsqueeze(2) - samples1.unsqueeze(1) # B x N x N x d 96 | diff_x_x = torch.mean((-(diff_x_x.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 97 | 98 | diff_x_y = samples1.unsqueeze(2) - samples2.unsqueeze(1) 99 | diff_x_y = torch.mean((-(diff_x_y.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 100 | 101 | diff_y_y = samples2.unsqueeze(2) - samples2.unsqueeze(1) # B x N x N x d 102 | diff_y_y = torch.mean((-(diff_y_y.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 103 | 104 | overall_loss = (diff_x_x + diff_y_y - 2.0 * diff_x_y + 1e-6).sqrt() 105 | return overall_loss 106 | 107 | def train(self): 108 | # Sample 109 | batch = self.data_buffer.sample() 110 | obs = batch["obs"].to(self.device) 111 | acts = batch["acts"].to(self.device) 112 | rews = batch["rews"].to(self.device) 113 | next_obs = batch["next_obs"].to(self.device) 114 | done = batch["done"].to(self.device) 115 | 116 | """ 117 | Train the Behaviour cloning policy to be able to take more than 1 sample for MMD. 118 | Conditional VAE is used as Behaviour cloning policy in BEAR. 119 | """ 120 | recon_action, mu, log_std = self.cvae_net(obs, acts) 121 | cvae_loss = self.cvae_net.loss_function(recon_action, acts, mu, log_std) 122 | 123 | self.cvae_optimizer.zero_grad() 124 | cvae_loss.backward() 125 | self.cvae_optimizer.step() 126 | 127 | """ 128 | Critic Training 129 | """ 130 | with torch.no_grad(): 131 | # generate 10 actions for every next_obs(Same as BCQ) 132 | next_obs = torch.repeat_interleave(next_obs, repeats=self.n_target_samples, dim=0).to(self.device) 133 | # compute target Q value of generated action 134 | target_q1 = self.target_q_net1(next_obs, self.policy_net(next_obs)[0]) 135 | target_q2 = self.target_q_net2(next_obs, self.policy_net(next_obs)[0]) 136 | # soft clipped double q-learning 137 | target_q = self.lmbda * torch.min(target_q1, target_q2) + (1. - self.lmbda) * torch.max(target_q1, target_q2) 138 | # take max over each action sampled from the generation and perturbation model 139 | target_q = target_q.reshape(obs.shape[0], self.n_target_samples, 1).max(1)[0].squeeze(1) 140 | target_q = rews + self.gamma * (1. 
- done) * target_q 141 | 142 | # compute current Q 143 | current_q1 = self.q_net1(obs, acts).squeeze(1) 144 | current_q2 = self.q_net2(obs, acts).squeeze(1) 145 | # compute critic loss 146 | critic_loss1 = F.mse_loss(current_q1, target_q) 147 | critic_loss2 = F.mse_loss(current_q2, target_q) 148 | 149 | self.q_optimizer1.zero_grad() 150 | critic_loss1.backward() 151 | self.q_optimizer1.step() 152 | 153 | self.q_optimizer2.zero_grad() 154 | critic_loss2.backward() 155 | self.q_optimizer2.step() 156 | 157 | # MMD Loss 158 | # sample actions from dataset and current policy(B x N x D) 159 | raw_sampled_actions = self.cvae_net.decode_multiple_without_squash(obs, decode_num=self.n_mmd_action_samples, 160 | z_device=self.device) 161 | raw_actor_actions = self.policy_net.sample_multiple_without_squash(obs, sample_num=self.n_mmd_action_samples) 162 | if self.kernel_type == 'gaussian': 163 | mmd_loss = self.mmd_loss_gaussian(raw_sampled_actions, raw_actor_actions, sigma=self.mmd_sigma) 164 | else: 165 | mmd_loss = self.mmd_loss_laplacian(raw_sampled_actions, raw_actor_actions, sigma=self.mmd_sigma) 166 | 167 | """ 168 | Alpha prime training(lagrangian parameter update for MMD loss weight) 169 | """ 170 | alpha_prime_loss = -(self.log_alpha_prime.exp() * (mmd_loss - self.lagrange_thresh)).mean() 171 | self.alpha_prime_optimizer.zero_grad() 172 | alpha_prime_loss.backward(retain_graph=True) 173 | self.alpha_prime_optimizer.step() 174 | 175 | self.log_alpha_prime.data.clamp_(min=-5.0, max=10.0) # clip for stability 176 | 177 | """ 178 | Actor Training 179 | Actor Loss = alpha_prime * MMD Loss + -minQ(s,a) 180 | """ 181 | a, log_prob, _ = self.policy_net(obs) 182 | min_q = torch.min(self.q_net1(obs, a), self.q_net2(obs, a)).squeeze(1) 183 | # policy_loss = (self.alpha * log_prob - min_q).mean() # SAC Type 184 | policy_loss = - (min_q.mean()) 185 | 186 | # BEAR Actor Loss 187 | actor_loss = (self.log_alpha_prime.exp() * mmd_loss).mean() 188 | if self.train_step > self.warmup_step: 189 | actor_loss = policy_loss + actor_loss 190 | self.policy_optimizer.zero_grad() 191 | actor_loss.backward() # the mmd_loss will backward again in alpha_prime_loss. 
192 | self.policy_optimizer.step() 193 | 194 | soft_target_update(self.q_net1, self.target_q_net1, tau=self.tau) 195 | soft_target_update(self.q_net2, self.target_q_net2, tau=self.tau) 196 | 197 | self.train_step += 1 198 | 199 | train_summaries = {"actor_loss": policy_loss.cpu().item(), 200 | "critic_loss1": critic_loss1.cpu().item(), 201 | "critic_loss2": critic_loss2.cpu().item(), 202 | "alpha_prime_loss": alpha_prime_loss.cpu().item()} 203 | 204 | return train_summaries 205 | 206 | def store_agent_checkpoint(self): 207 | checkpoint = { 208 | "q_net1": self.q_net1.state_dict(), 209 | "q_net2": self.q_net2.state_dict(), 210 | "policy_net": self.policy_net.state_dict(), 211 | "q_optimizer1": self.q_optimizer1.state_dict(), 212 | "q_optimizer2": self.q_optimizer2.state_dict(), 213 | "policy_optimizer": self.policy_optimizer.state_dict(), 214 | "log_alpha_prime": self.log_alpha_prime, 215 | "alpha_prime_optimizer": self.alpha_prime_optimizer.state_dict(), 216 | "train_step": self.train_step, 217 | } 218 | 219 | torch.save(checkpoint, self.checkpoint_path) 220 | 221 | def load_agent_checkpoint(self): 222 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 223 | self.q_net1.load_state_dict(checkpoint["q_net1"]) 224 | self.q_net2.load_state_dict(checkpoint["q_net2"]) 225 | self.policy_net.load_state_dict(checkpoint["policy_net"]) 226 | self.q_optimizer1.load_state_dict(checkpoint["q_optimizer1"]) 227 | self.q_optimizer2.load_state_dict(checkpoint["q_optimizer2"]) 228 | self.policy_optimizer.load_state_dict(checkpoint["policy_optimizer"]) 229 | self.log_alpha_prime = checkpoint["log_alpha_prime"] 230 | self.alpha_prime_optimizer.load_state_dict(checkpoint["alpha_prime_optimizer"]) 231 | self.train_step = checkpoint["train_step"] 232 | 233 | print("load checkpoint from \"" + self.checkpoint_path + 234 | "\" at " + str(self.train_step) + " time step") 235 | -------------------------------------------------------------------------------- /algos/offline/plas.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from common.networks import MLPQsaNet, CVAE, PLAS_Actor 6 | from utils.train_tools import soft_target_update, evaluate 7 | from utils import log_tools 8 | 9 | 10 | class PLAS_Agent(OfflineBase): 11 | """ 12 | Implementation of Policy in the Latent Action Space(PLAS) in continuous action space 13 | https://arxiv.org/abs/2011.07213 14 | """ 15 | def __init__(self, 16 | critic_net1: MLPQsaNet, 17 | critic_net2: MLPQsaNet, 18 | actor_net: PLAS_Actor, 19 | cvae_net: CVAE, # generation model 20 | critic_lr=1e-3, 21 | actor_lr=1e-4, 22 | cvae_lr=1e-4, 23 | tau=0.005, 24 | lmbda=0.75, # used for double clipped double q-learning 25 | max_cvae_iterations=500000, # the num of iterations when training CVAE model 26 | **kwargs 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | self.critic_net1 = critic_net1.to(self.device) 31 | self.critic_net2 = critic_net2.to(self.device) 32 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 33 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 34 | self.actor_net = actor_net.to(self.device) 35 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 36 | self.cvae_net = cvae_net.to(self.device) 37 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 38 | 
self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 39 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 40 | self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr) 41 | 42 | self.tau = tau 43 | self.lmbda = lmbda 44 | self.max_cvae_iterations = max_cvae_iterations 45 | self.cvae_iterations= 0 46 | 47 | def choose_action(self, obs, eval=True): 48 | with torch.no_grad(): 49 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 50 | action = self.actor_net(obs, self.cvae_net.decode) 51 | return action.cpu().data.numpy().flatten() 52 | 53 | def train_cvae(self): 54 | """ 55 | Train CVAE one step 56 | """ 57 | # Sample 58 | batch = self.data_buffer.sample() 59 | obs = batch["obs"].to(self.device) 60 | acts = batch["acts"].to(self.device) 61 | 62 | recon_action, mu, log_std = self.cvae_net(obs, acts) 63 | cvae_loss = self.cvae_net.loss_function(recon_action, acts, mu, log_std) 64 | 65 | self.cvae_optimizer.zero_grad() 66 | cvae_loss.backward() 67 | self.cvae_optimizer.step() 68 | 69 | self.cvae_iterations += 1 70 | 71 | train_summaries = {"cvae_loss": cvae_loss.cpu().item()} 72 | 73 | return train_summaries 74 | 75 | def train(self): 76 | # Sample 77 | batch = self.data_buffer.sample() 78 | obs = batch["obs"].to(self.device) 79 | acts = batch["acts"].to(self.device) 80 | rews = batch["rews"].to(self.device) 81 | next_obs = batch["next_obs"].to(self.device) 82 | done = batch["done"].to(self.device) 83 | 84 | """ 85 | Train Critic 86 | """ 87 | with torch.no_grad(): 88 | decode_action_next = self.target_actor_net(next_obs, self.cvae_net.decode) 89 | 90 | target_q1 = self.target_critic_net1(next_obs, decode_action_next) 91 | target_q2 = self.target_critic_net2(next_obs, decode_action_next) 92 | 93 | target_q = (self.lmbda * torch.min(target_q1, target_q2) + (1. - self.lmbda) * torch.max(target_q1, target_q2)).squeeze(1) 94 | target_q = rews + self.gamma * (1. 
- done) * target_q 95 | 96 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 97 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 98 | 99 | critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) 100 | 101 | self.critic_optimizer1.zero_grad() 102 | self.critic_optimizer2.zero_grad() 103 | critic_loss.backward() 104 | self.critic_optimizer1.step() 105 | self.critic_optimizer2.step() 106 | 107 | """ 108 | Train Actor 109 | """ 110 | decode_action = self.actor_net(obs, self.cvae_net.decode) 111 | actor_loss = -self.critic_net1(obs, decode_action).mean() 112 | 113 | self.actor_optimizer.zero_grad() 114 | actor_loss.backward() 115 | self.actor_optimizer.step() 116 | 117 | """ 118 | Update target networks 119 | """ 120 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 121 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 122 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 123 | 124 | self.train_step += 1 125 | 126 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 127 | "critic_loss": critic_loss.cpu().item()} 128 | 129 | return train_summaries 130 | 131 | def learn(self): 132 | """Train PLAS without interacting with the environment (offline)""" 133 | 134 | log_tools.make_dir(self.result_dir) 135 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 136 | 137 | if self.resume: 138 | self.load_agent_checkpoint() 139 | else: 140 | # delete tensorboard log file 141 | log_tools.del_all_files_in_dir(self.result_dir) 142 | 143 | # Train CVAE before train agent 144 | print("==============================Start to train CVAE==============================") 145 | 146 | while self.cvae_iterations < self.max_cvae_iterations: 147 | train_summaries_cvae = self.train_cvae() 148 | if self.cvae_iterations % 1000 == 0: 149 | print("CVAE iteration:", self.cvae_iterations, "\t", "CVAE Loss:", train_summaries_cvae["cvae_loss"]) 150 | tensorboard_writer.log_train_data(train_summaries_cvae, self.cvae_iterations) 151 | 152 | # Train Agent 153 | print("==============================Start to train Agent==============================") 154 | while self.train_step < self.max_train_step: 155 | train_summaries = self.train() 156 | 157 | if self.train_step % self.log_interval == 0: 158 | self.store_agent_checkpoint() 159 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 160 | 161 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 162 | evaluate_summaries = evaluate(agent=self, episode_num=10) 163 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 164 | 165 | def store_agent_checkpoint(self): 166 | checkpoint = { 167 | "critic_net1": self.critic_net1.state_dict(), 168 | "critic_net2": self.critic_net2.state_dict(), 169 | "actor_net": self.actor_net.state_dict(), 170 | "cvae_net": self.cvae_net.state_dict(), 171 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 172 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 173 | "actor_optimizer": self.actor_optimizer.state_dict(), 174 | "cvae_optimizer": self.cvae_optimizer.state_dict(), 175 | "train_step": self.train_step, 176 | "cvae_iterations": self.cvae_iterations, 177 | } 178 | torch.save(checkpoint, self.checkpoint_path) 179 | 180 | def load_agent_checkpoint(self): 181 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 182 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 183 | 
self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 184 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 185 | self.cvae_net.load_state_dict(checkpoint["cvae_net"]) 186 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 187 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 188 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 189 | self.cvae_optimizer.load_state_dict(checkpoint["cvae_optimizer"]) 190 | self.train_step = checkpoint["train_step"] 191 | self.cvae_iterations = checkpoint["cvae_iterations"] 192 | 193 | print("load checkpoint from \"" + self.checkpoint_path + 194 | "\" at " + str(self.train_step) + " time step") 195 | -------------------------------------------------------------------------------- /algos/offline/sac_offline.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OfflineBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class SAC_Offline_Agent(OfflineBase): 10 | """ 11 | The SAC 12 | """ 13 | def __init__(self, 14 | policy_net: torch.nn.Module, # actor 15 | q_net1: torch.nn.Module, # critic 16 | q_net2: torch.nn.Module, 17 | policy_lr=3e-4, 18 | qf_lr=3e-4, 19 | tau=0.05, 20 | alpha=0.5, 21 | auto_alpha_tuning=False, 22 | **kwargs 23 | ): 24 | super().__init__(**kwargs) 25 | 26 | # the network and optimizers 27 | self.policy_net = policy_net.to(self.device) 28 | self.q_net1 = q_net1.to(self.device) 29 | self.q_net2 = q_net2.to(self.device) 30 | self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device) 31 | self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device) 32 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr) 33 | self.q_optimizer1 = torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr) 34 | self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr) 35 | 36 | self.tau = tau 37 | self.alpha = alpha 38 | self.auto_alpha_tuning = auto_alpha_tuning 39 | 40 | if self.auto_alpha_tuning: 41 | self.target_entropy = -np.prod(self.env.action_space.shape).item() 42 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 43 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=policy_lr) 44 | self.alpha = torch.exp(self.log_alpha) 45 | 46 | def choose_action(self, obs, eval=True): 47 | with torch.no_grad(): 48 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 49 | _, _, mu_action = self.policy_net(obs) 50 | 51 | return mu_action.cpu().numpy().flatten() 52 | 53 | def train(self): 54 | 55 | # Sample 56 | batch = self.data_buffer.sample() 57 | obs = batch["obs"].to(self.device) 58 | acts = batch["acts"].to(self.device) 59 | rews = batch["rews"].to(self.device) 60 | next_obs = batch["next_obs"].to(self.device) 61 | done = batch["done"].to(self.device) 62 | 63 | # compute policy Loss 64 | a, log_prob, _ = self.policy_net(obs) 65 | min_q = torch.min(self.q_net1(obs, a), self.q_net2(obs, a)).squeeze(1) 66 | policy_loss = (self.alpha * log_prob - min_q).mean() 67 | 68 | # compute Q Loss 69 | q1 = self.q_net1(obs, acts).squeeze(1) 70 | q2 = self.q_net2(obs, acts).squeeze(1) 71 | with torch.no_grad(): 72 | next_a, next_log_prob, _ = self.policy_net(next_obs) 73 | min_target_next_q = torch.min(self.target_q_net1(next_obs, next_a), self.target_q_net2(next_obs, next_a)).squeeze(1) 74 | y = rews + self.gamma * (1. 
- done) * (min_target_next_q - self.alpha * next_log_prob) 75 | 76 | q_loss1 = F.mse_loss(q1, y) 77 | q_loss2 = F.mse_loss(q2, y) 78 | 79 | # Update policy network parameter 80 | # policy network's update should be done before updating q network, or there will make some errors 81 | self.policy_optimizer.zero_grad() 82 | policy_loss.backward() 83 | self.policy_optimizer.step() 84 | 85 | # Update q network1 parameter 86 | self.q_optimizer1.zero_grad() 87 | q_loss1.backward() 88 | self.q_optimizer1.step() 89 | 90 | # Update q network2 parameter 91 | self.q_optimizer2.zero_grad() 92 | q_loss2.backward() 93 | self.q_optimizer2.step() 94 | 95 | if self.auto_alpha_tuning: 96 | alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 97 | self.alpha_optimizer.zero_grad() 98 | alpha_loss.backward() 99 | self.alpha_optimizer.step() 100 | 101 | self.alpha = self.log_alpha.exp() 102 | else: 103 | alpha_loss = torch.tensor(0) 104 | 105 | self.train_step += 1 106 | 107 | soft_target_update(self.q_net1, self.target_q_net1, tau=self.tau) 108 | soft_target_update(self.q_net2, self.target_q_net2, tau=self.tau) 109 | 110 | train_summaries = {"actor_loss": policy_loss.cpu().item(), 111 | "critic_loss1": q_loss1.cpu().item(), 112 | "critic_loss2": q_loss2.cpu().item(), 113 | "alpha_loss": alpha_loss.cpu().item()} 114 | 115 | return train_summaries 116 | 117 | def store_agent_checkpoint(self): 118 | checkpoint = { 119 | "q_net1": self.q_net1.state_dict(), 120 | "q_net2": self.q_net2.state_dict(), 121 | "policy_net": self.policy_net.state_dict(), 122 | "q_optimizer1": self.q_optimizer1.state_dict(), 123 | "q_optimizer2": self.q_optimizer2.state_dict(), 124 | "policy_optimizer": self.policy_optimizer.state_dict(), 125 | "train_step": self.train_step, 126 | } 127 | if self.auto_alpha_tuning: 128 | checkpoint["log_alpha"] = self.log_alpha 129 | checkpoint["alpha_optimizer"] = self.alpha_optimizer.state_dict() 130 | torch.save(checkpoint, self.checkpoint_path) 131 | 132 | def load_agent_checkpoint(self): 133 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 134 | self.q_net1.load_state_dict(checkpoint["q_net1"]) 135 | self.q_net2.load_state_dict(checkpoint["q_net2"]) 136 | self.policy_net.load_state_dict(checkpoint["policy_net"]) 137 | self.q_optimizer1.load_state_dict(checkpoint["q_optimizer1"]) 138 | self.q_optimizer2.load_state_dict(checkpoint["q_optimizer2"]) 139 | self.policy_optimizer.load_state_dict(checkpoint["policy_optimizer"]) 140 | self.train_step = checkpoint["train_step"] 141 | if self.auto_alpha_tuning: 142 | self.log_alpha = checkpoint["log_alpha"] 143 | self.alpha_optimizer.load_state_dict(checkpoint["alpha_optimizer"]) 144 | 145 | print("load checkpoint from \"" + self.checkpoint_path + 146 | "\" at " + str(self.train_step) + " time step") -------------------------------------------------------------------------------- /algos/offline/td3_bc.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from utils.train_tools import soft_target_update 6 | 7 | 8 | class TD3_BC_Agent(OfflineBase): 9 | """ 10 | Implementation of TD3 with behavior cloning (TD3_BC) 11 | https://arxiv.org/abs/2106.06860 12 | """ 13 | def __init__(self, 14 | actor_net: torch.nn.Module, 15 | critic_net1: torch.nn.Module, 16 | critic_net2: torch.nn.Module, 17 | actor_lr=3e-4, 18 | critic_lr=3e-4, 19 | 
tau=0.005, # used to update target network, w' = tau*w + (1-tau)*w' 20 | policy_noise=0.2, # Noise added to target policy during critic update 21 | noise_clip=0.5, # Range to clip target policy noise 22 | policy_delay=2, # Frequency of delayed policy updates 23 | alpha=2.5, # The alpha to compute lambda 24 | **kwargs 25 | ): 26 | super().__init__(**kwargs) 27 | 28 | self.action_num = self.env.action_space.shape[0] 29 | self.action_bound = self.env.action_space.high[0] 30 | 31 | # the network and optimizers 32 | self.actor_net = actor_net.to(self.device) 33 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 34 | self.critic_net1 = critic_net1.to(self.device) 35 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 36 | self.critic_net2 = critic_net2.to(self.device) 37 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 38 | 39 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 40 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 41 | self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 42 | 43 | self.tau = tau 44 | self.policy_noise = policy_noise 45 | self.noise_clip = noise_clip 46 | self.policy_delay = policy_delay 47 | self.alpha = alpha 48 | 49 | self.actor_loss = 0 50 | 51 | def choose_action(self, obs, eval=True): 52 | """Choose an action by deterministic policy with some gaussian noise""" 53 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 54 | with torch.no_grad(): 55 | action = self.actor_net(obs).cpu().numpy().flatten() 56 | return action 57 | 58 | def train(self): 59 | 60 | # Sample 61 | batch = self.data_buffer.sample() 62 | obs = batch["obs"].to(self.device) 63 | acts = batch["acts"].to(self.device) 64 | rews = batch["rews"].to(self.device) 65 | next_obs = batch["next_obs"].to(self.device) 66 | done = batch["done"].to(self.device) 67 | 68 | # Target Policy Smoothing. Add clipped noise to next actions when computing target Q. 69 | with torch.no_grad(): 70 | noise = torch.normal(mean=0, std=self.policy_noise, size=acts.size()).to(self.device) 71 | noise = noise.clamp(-self.noise_clip, self.noise_clip) 72 | next_act = self.target_actor_net(next_obs) + noise 73 | next_act = next_act.clamp(-self.action_bound, self.action_bound) 74 | 75 | # Clipped Double Q-Learning. Compute the min of target Q1 and target Q2 76 | min_target_q = torch.min(self.target_critic_net1(next_obs, next_act), 77 | self.target_critic_net2(next_obs, next_act)).squeeze(1) 78 | y = rews + self.gamma * (1. 
- done) * min_target_q 79 | 80 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 81 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 82 | 83 | # TD3 Loss 84 | critic_loss1 = F.mse_loss(current_q1, y) 85 | critic_loss2 = F.mse_loss(current_q2, y) 86 | 87 | # Optimize critic net 88 | self.critic_optimizer1.zero_grad() 89 | critic_loss1.backward() 90 | self.critic_optimizer1.step() 91 | 92 | self.critic_optimizer2.zero_grad() 93 | critic_loss2.backward() 94 | self.critic_optimizer2.step() 95 | 96 | if (self.train_step+1) % self.policy_delay == 0: 97 | # Compute actor loss 98 | pi = self.actor_net(obs) 99 | Q = self.critic_net1(obs, pi) 100 | lmbda = self.alpha / Q.abs().mean().detach() 101 | actor_loss = -lmbda * Q.mean() + F.mse_loss(pi, acts) 102 | 103 | # Optimize actor net 104 | self.actor_optimizer.zero_grad() 105 | actor_loss.backward() 106 | self.actor_optimizer.step() 107 | 108 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 109 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 110 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 111 | else: 112 | actor_loss = torch.tensor(0) 113 | 114 | self.train_step += 1 115 | 116 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 117 | "critic_loss1": critic_loss1.cpu().item(), 118 | "critic_loss2": critic_loss2.cpu().item()} 119 | 120 | return train_summaries 121 | 122 | def store_agent_checkpoint(self): 123 | checkpoint = { 124 | "actor_net": self.actor_net.state_dict(), 125 | "critic_net1": self.critic_net1.state_dict(), 126 | "critic_net2": self.critic_net2.state_dict(), 127 | "actor_optimizer": self.actor_optimizer.state_dict(), 128 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 129 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 130 | "train_step": self.train_step, 131 | } 132 | torch.save(checkpoint, self.checkpoint_path) 133 | 134 | def load_agent_checkpoint(self): 135 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 136 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 137 | self.target_actor_net.load_state_dict(checkpoint["actor_net"]) 138 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 139 | self.target_critic_net1.load_state_dict(checkpoint["critic_net1"]) 140 | self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 141 | self.target_critic_net2.load_state_dict(checkpoint["critic_net2"]) 142 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 143 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 144 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 145 | self.train_step = checkpoint["train_step"] 146 | print("load checkpoint from \"" + self.checkpoint_path + 147 | "\" at " + str(self.train_step) + " time step") 148 | -------------------------------------------------------------------------------- /algos/ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from algos.base import PolicyBase 4 | from common.buffers import TrajectoryBuffer 5 | from utils.train_tools import evaluate 6 | from utils import log_tools 7 | 8 | 9 | class PPO_Agent(PolicyBase): 10 | """ 11 | Implementation of Proximal Policy Optimization (PPO) 12 | This is the version of "PPO-Clip" 13 | https://arxiv.org/abs/1707.06347 14 | """ 15 | def __init__(self, 16 | trajectory_buffer: TrajectoryBuffer, 17 | actor_net: 
torch.nn.Module, 18 | critic_net: torch.nn.Module, 19 | actor_lr=1e-4, 20 | critic_lr=1e-3, 21 | gae_lambda=0.95, 22 | gae_normalize=False, 23 | clip_pram=0.2, 24 | trajectory_length=128, # the length of a trajectory_ 25 | train_actor_iters=10, 26 | train_critic_iters=10, 27 | **kwargs 28 | ): 29 | super().__init__(**kwargs) 30 | 31 | self.trajectory_buffer = trajectory_buffer 32 | 33 | # the network and optimizers 34 | self.actor_net = actor_net.to(self.device) 35 | self.critic_net = critic_net.to(self.device) 36 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 37 | self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=critic_lr) 38 | 39 | self.gae_lambda = gae_lambda 40 | self.gae_normalize = gae_normalize 41 | self.trajectory_length = trajectory_length 42 | self.train_actor_iters = train_actor_iters 43 | self.train_critic_iters = train_critic_iters 44 | self.clip_pram = clip_pram 45 | 46 | self.episode_num = 0 47 | 48 | def choose_action(self, obs, eval=False): 49 | with torch.no_grad(): 50 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 51 | action, log_prob, eval_action = self.actor_net(obs) 52 | if eval: 53 | action = eval_action 54 | return action.cpu().numpy().squeeze(0), log_prob.cpu().numpy()[0] 55 | 56 | def train(self): 57 | batch = self.trajectory_buffer.sample() 58 | obs = batch["obs"].to(self.device) 59 | acts = batch["acts"].to(self.device) 60 | log_probs = batch["log_probs"].to(self.device) 61 | gae_advs = batch["gae_advs"].to(self.device) 62 | rets = batch["rets"].to(self.device) 63 | 64 | # Train policy with multiple steps of gradient descent 65 | for i in range(self.train_actor_iters): 66 | _, new_log_probs, _ = self.actor_net(obs, acts.squeeze()) 67 | ratios = torch.exp(new_log_probs - log_probs) 68 | 69 | surrogate = ratios * gae_advs 70 | clipped_surrogate = torch.clamp(ratios, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * gae_advs 71 | actor_loss = -(torch.min(surrogate, clipped_surrogate)).mean() 72 | 73 | self.actor_optimizer.zero_grad() 74 | actor_loss.backward() 75 | self.actor_optimizer.step() 76 | 77 | # Train value function with multiple steps of gradient descent 78 | for i in range(self.train_critic_iters): 79 | values = self.critic_net(obs).squeeze() 80 | critic_loss = 0.5 * ((rets - values) ** 2).mean() 81 | self.critic_optimizer.zero_grad() 82 | critic_loss.backward() 83 | self.critic_optimizer.step() 84 | 85 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 86 | "critic_loss": critic_loss.cpu().item()} 87 | 88 | return train_summaries 89 | 90 | def learn(self): 91 | log_tools.make_dir(self.result_dir) 92 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 93 | 94 | if self.resume: 95 | self.load_agent_checkpoint() 96 | else: 97 | # delete tensorboard log file 98 | log_tools.del_all_files_in_dir(self.result_dir) 99 | 100 | print("==============================start train===================================") 101 | obs = self.env.reset() 102 | done = False 103 | 104 | episode_reward = 0 105 | episode_length = 0 106 | trajectory_length = 0 107 | 108 | while self.train_step < self.max_train_step: 109 | action, log_prob = self.choose_action(np.array(obs)) 110 | next_obs, reward, done, info = self.env.step(action) 111 | value = self.critic_net(torch.tensor([obs], dtype=torch.float32)).item() 112 | episode_reward += reward 113 | self.trajectory_buffer.add(obs, action, reward, done, log_prob, value) 114 | obs = next_obs 115 | episode_length += 1 116 | trajectory_length 
+= 1 117 | self.train_step += 1 118 | 119 | if done: 120 | obs = self.env.reset() 121 | self.episode_num += 1 122 | 123 | print(f"Time Step: {self.train_step} Episode Num: {self.episode_num} " 124 | f"Episode Length: {episode_length} Episode Reward: {episode_reward:.2f}") 125 | tensorboard_writer.log_learn_data({"episode_length": episode_length, 126 | "episode_reward": episode_reward}, self.train_step) 127 | episode_reward = 0 128 | episode_length = 0 129 | 130 | if trajectory_length == self.trajectory_length: 131 | last_val = self.critic_net(torch.tensor([obs], dtype=torch.float32)).item() if done else 0 132 | self.trajectory_buffer.finish_path(last_val=last_val, gamma=self.gamma, 133 | gae_lambda=self.gae_lambda, gae_normalize=self.gae_normalize) 134 | train_summaries = self.train() 135 | trajectory_length = 0 136 | 137 | if self.train_step % self.log_interval == 0: 138 | self.store_agent_checkpoint() 139 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 140 | 141 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 142 | evaluate_summaries = evaluate(agent=self, episode_num=10) 143 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 144 | 145 | def store_agent_checkpoint(self): 146 | checkpoint = { 147 | "actor_net": self.actor_net.state_dict(), 148 | "critic_net": self.critic_net.state_dict(), 149 | "actor_optimizer": self.actor_optimizer.state_dict(), 150 | "critic_optimizer": self.critic_optimizer.state_dict(), 151 | "train_step": self.train_step, 152 | "episode_num": self.episode_num 153 | } 154 | torch.save(checkpoint, self.checkpoint_path) 155 | 156 | def load_agent_checkpoint(self): 157 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 158 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 159 | self.critic_net.load_state_dict(checkpoint["critic_net"]) 160 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 161 | self.critic_optimizer.load_state_dict(checkpoint["critic_optimizer"]) 162 | self.train_step = checkpoint["train_step"] 163 | self.episode_num = checkpoint["episode_num"] 164 | print("load checkpoint from \"" + self.checkpoint_path + 165 | "\" at " + str(self.train_step) + " time step") 166 | -------------------------------------------------------------------------------- /algos/sac.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OffPolicyBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class SAC_Agent(OffPolicyBase): 10 | """ 11 | Implementation of Soft Actor-Critic (SAC) 12 | https://arxiv.org/abs/1812.05905(SAC 2019) 13 | """ 14 | def __init__(self, 15 | policy_net: torch.nn.Module, # actor 16 | q_net1: torch.nn.Module, # critic 17 | q_net2: torch.nn.Module, 18 | policy_lr=4e-3, 19 | qf_lr=4e-3, 20 | tau=0.05, 21 | alpha=0.5, 22 | auto_alpha_tuning=False, 23 | **kwargs 24 | ): 25 | super().__init__(**kwargs) 26 | 27 | # the network and optimizers 28 | self.policy_net = policy_net.to(self.device) 29 | self.q_net1 = q_net1.to(self.device) 30 | self.q_net2 = q_net2.to(self.device) 31 | self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device) 32 | self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device) 33 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr) 34 | self.q_optimizer1 = 
torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr) 35 | self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr) 36 | 37 | self.tau = tau 38 | self.alpha = alpha 39 | self.auto_alpha_tuning = auto_alpha_tuning 40 | 41 | if self.auto_alpha_tuning: 42 | self.target_entropy = -np.prod(self.env.action_space.shape).item() 43 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 44 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=policy_lr) 45 | self.alpha = torch.exp(self.log_alpha) 46 | 47 | def choose_action(self, obs, eval=False): 48 | with torch.no_grad(): 49 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 50 | action, log_prob, mu_action = self.policy_net(obs) 51 | 52 | if eval: 53 | action = mu_action # if eval, use mu as the action 54 | 55 | return action.cpu().numpy().flatten() 56 | 57 | def train(self): 58 | 59 | # Sample 60 | batch = self.replay_buffer.sample() 61 | obs = batch["obs"].to(self.device) 62 | acts = batch["acts"].to(self.device) 63 | rews = batch["rews"].to(self.device) 64 | next_obs = batch["next_obs"].to(self.device) 65 | done = batch["done"].to(self.device) 66 | 67 | # compute policy Loss 68 | a, log_prob, _ = self.policy_net(obs) 69 | min_q = torch.min(self.q_net1(obs, a), self.q_net2(obs, a)).squeeze(1) 70 | policy_loss = (self.alpha * log_prob - min_q).mean() 71 | 72 | # compute Q Loss 73 | q1 = self.q_net1(obs, acts).squeeze(1) 74 | q2 = self.q_net2(obs, acts).squeeze(1) 75 | with torch.no_grad(): 76 | next_a, next_log_prob, _ = self.policy_net(next_obs) 77 | min_target_next_q = torch.min(self.target_q_net1(next_obs, next_a), self.target_q_net2(next_obs, next_a)).squeeze(1) 78 | y = rews + self.gamma * (1. - done) * (min_target_next_q - self.alpha * next_log_prob) 79 | 80 | q_loss1 = F.mse_loss(q1, y) 81 | q_loss2 = F.mse_loss(q2, y) 82 | 83 | # Update policy network parameter 84 | # policy network's update should be done before updating q network, or there will make some errors 85 | self.policy_optimizer.zero_grad() 86 | policy_loss.backward() 87 | self.policy_optimizer.step() 88 | 89 | # Update q network1 parameter 90 | self.q_optimizer1.zero_grad() 91 | q_loss1.backward() 92 | self.q_optimizer1.step() 93 | 94 | # Update q network2 parameter 95 | self.q_optimizer2.zero_grad() 96 | q_loss2.backward() 97 | self.q_optimizer2.step() 98 | 99 | if self.auto_alpha_tuning: 100 | alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 101 | self.alpha_optimizer.zero_grad() 102 | alpha_loss.backward() 103 | self.alpha_optimizer.step() 104 | 105 | self.alpha = self.log_alpha.exp() 106 | else: 107 | alpha_loss = torch.tensor(0) 108 | 109 | self.train_step += 1 110 | 111 | soft_target_update(self.q_net1, self.target_q_net1, tau=self.tau) 112 | soft_target_update(self.q_net2, self.target_q_net2, tau=self.tau) 113 | 114 | train_summaries = {"actor_loss": policy_loss.cpu().item(), 115 | "critic_loss1": q_loss1.cpu().item(), 116 | "critic_loss2": q_loss2.cpu().item(), 117 | "alpha_loss": alpha_loss.cpu().item()} 118 | 119 | return train_summaries 120 | 121 | def store_agent_checkpoint(self): 122 | checkpoint = { 123 | "q_net1": self.q_net1.state_dict(), 124 | "q_net2": self.q_net2.state_dict(), 125 | "policy_net": self.policy_net.state_dict(), 126 | "q_optimizer1": self.q_optimizer1.state_dict(), 127 | "q_optimizer2": self.q_optimizer2.state_dict(), 128 | "policy_optimizer": self.policy_optimizer.state_dict(), 129 | "train_step": self.train_step, 130 | "episode_num": self.episode_num 131 | 
} 132 | if self.auto_alpha_tuning: 133 | checkpoint["log_alpha"] = self.log_alpha 134 | checkpoint["alpha_optimizer"] = self.alpha_optimizer.state_dict() 135 | torch.save(checkpoint, self.checkpoint_path) 136 | 137 | def load_agent_checkpoint(self): 138 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 139 | self.q_net1.load_state_dict(checkpoint["q_net1"]) 140 | self.q_net2.load_state_dict(checkpoint["q_net2"]) 141 | self.policy_net.load_state_dict(checkpoint["policy_net"]) 142 | self.q_optimizer1.load_state_dict(checkpoint["q_optimizer1"]) 143 | self.q_optimizer2.load_state_dict(checkpoint["q_optimizer2"]) 144 | self.policy_optimizer.load_state_dict(checkpoint["policy_optimizer"]) 145 | self.train_step = checkpoint["train_step"] 146 | self.episode_num = checkpoint["episode_num"] 147 | if self.auto_alpha_tuning: 148 | self.log_alpha = checkpoint["log_alpha"] 149 | self.alpha_optimizer.load_state_dict(checkpoint["alpha_optimizer"]) 150 | print("load checkpoint from \"" + self.checkpoint_path + 151 | "\" at " + str(self.train_step) + " time step") 152 | -------------------------------------------------------------------------------- /algos/td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OffPolicyBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class TD3_Agent(OffPolicyBase): 10 | """ 11 | Implementation of Twin Delayed Deep Deterministic policy gradient (TD3) 12 | https://arxiv.org/abs/1802.09477 13 | """ 14 | def __init__(self, 15 | actor_net: torch.nn.Module, 16 | critic_net1: torch.nn.Module, 17 | critic_net2: torch.nn.Module, 18 | actor_lr=1e-3, 19 | critic_lr=1e-3, 20 | tau=0.005, # used to update target network, w' = tau*w + (1-tau)*w' 21 | act_noise=0.1, # Std of Gaussian exploration noise 22 | policy_noise=0.2, # Noise added to target policy during critic update 23 | noise_clip=0.5, # Range to clip target policy noise 24 | policy_delay=2, # Frequency of delayed policy updates 25 | **kwargs 26 | ): 27 | super().__init__(**kwargs) 28 | 29 | self.action_num = self.env.action_space.shape[0] 30 | self.action_bound = self.env.action_space.high[0] 31 | 32 | # the network and optimizers 33 | self.actor_net = actor_net.to(self.device) 34 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 35 | self.critic_net1 = critic_net1.to(self.device) 36 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 37 | self.critic_net2 = critic_net2.to(self.device) 38 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 39 | 40 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 41 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 42 | self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 43 | 44 | self.tau = tau 45 | self.act_noise = act_noise 46 | self.policy_noise = policy_noise 47 | self.noise_clip = noise_clip 48 | self.policy_delay = policy_delay 49 | self.actor_loss = 0 50 | 51 | def choose_action(self, obs, eval=False): 52 | """Choose an action by deterministic policy with some gaussian noise""" 53 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 54 | with torch.no_grad(): 55 | action = self.actor_net(obs).cpu().numpy().flatten() 56 | if eval: 57 | return action 58 | else: 59 | noise = 
np.random.normal(0, self.act_noise, size=self.action_num) 60 | return (action + noise).clip(-self.action_bound, self.action_bound) 61 | 62 | def train(self): 63 | 64 | # Sample 65 | batch = self.replay_buffer.sample() 66 | obs = batch["obs"].to(self.device) 67 | acts = batch["acts"].to(self.device) 68 | rews = batch["rews"].to(self.device) 69 | next_obs = batch["next_obs"].to(self.device) 70 | done = batch["done"].to(self.device) 71 | 72 | # Target Policy Smoothing. Add clipped noise to next actions when computing target Q. 73 | with torch.no_grad(): 74 | noise = torch.normal(mean=0, std=self.policy_noise, size=acts.size()).to(self.device) 75 | noise = noise.clamp(-self.noise_clip, self.noise_clip) 76 | next_act = self.target_actor_net(next_obs) + noise 77 | next_act = next_act.clamp(-self.action_bound, self.action_bound) 78 | 79 | # Clipped Double Q-Learning. Compute the min of target Q1 and target Q2 80 | min_target_q = torch.min(self.target_critic_net1(next_obs, next_act), 81 | self.target_critic_net2(next_obs, next_act)).squeeze(1) 82 | y = rews + self.gamma * (1. - done) * min_target_q 83 | 84 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 85 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 86 | 87 | # TD3 Loss 88 | critic_loss1 = F.mse_loss(current_q1, y) 89 | critic_loss2 = F.mse_loss(current_q2, y) 90 | 91 | # Optimize critic net 92 | self.critic_optimizer1.zero_grad() 93 | critic_loss1.backward() 94 | self.critic_optimizer1.step() 95 | 96 | self.critic_optimizer2.zero_grad() 97 | critic_loss2.backward() 98 | self.critic_optimizer2.step() 99 | 100 | if (self.train_step+1) % self.policy_delay == 0: 101 | # Compute actor loss 102 | actor_loss = -self.critic_net1(obs, self.actor_net(obs)).mean() 103 | # Optimize actor net 104 | self.actor_optimizer.zero_grad() 105 | actor_loss.backward() 106 | self.actor_optimizer.step() 107 | 108 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 109 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 110 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 111 | else: 112 | actor_loss = torch.tensor(0) 113 | 114 | self.train_step += 1 115 | 116 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 117 | "critic_loss1": critic_loss1.cpu().item(), 118 | "critic_loss2": critic_loss2.cpu().item()} 119 | 120 | return train_summaries 121 | 122 | def store_agent_checkpoint(self): 123 | checkpoint = { 124 | "actor_net": self.actor_net.state_dict(), 125 | "critic_net1": self.critic_net1.state_dict(), 126 | "critic_net2": self.critic_net2.state_dict(), 127 | "actor_optimizer": self.actor_optimizer.state_dict(), 128 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 129 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 130 | "train_step": self.train_step, 131 | "episode_num": self.episode_num 132 | } 133 | torch.save(checkpoint, self.checkpoint_path) 134 | 135 | def load_agent_checkpoint(self): 136 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 137 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 138 | self.target_actor_net.load_state_dict(checkpoint["actor_net"]) 139 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 140 | self.target_critic_net1.load_state_dict(checkpoint["critic_net1"]) 141 | self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 142 | self.target_critic_net2.load_state_dict(checkpoint["critic_net2"]) 143 | 
self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 144 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 145 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 146 | self.train_step = checkpoint["train_step"] 147 | self.episode_num = checkpoint["episode_num"] 148 | print("load checkpoint from \"" + self.checkpoint_path + 149 | "\" at " + str(self.train_step) + " time step") -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/common/__init__.py -------------------------------------------------------------------------------- /common/buffers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | from typing import Sequence, Type, Optional, List, Union 5 | 6 | 7 | class ReplayBuffer: 8 | def __init__(self, obs_dim: Union[int, Sequence[int]], act_dim: int, capacity: int, batch_size: int): 9 | 10 | # Transfer the "int" observation dimension to "list" 11 | if isinstance(obs_dim, int): 12 | self.obs_dim = [obs_dim] 13 | else: 14 | self.obs_dim = list(obs_dim) 15 | 16 | self.act_dim = act_dim 17 | self.max_size = capacity 18 | self.batch_size = batch_size 19 | self.ptr = 0 # Point to the current position in the buffer 20 | self.crt_size = 0 # The current size of the buffer 21 | 22 | # Use numpy.ndarray to initialize the replay buffer 23 | self.obs = np.zeros(shape=[self.max_size] + self.obs_dim, dtype=np.float32) 24 | self.acts = np.zeros((self.max_size, self.act_dim), dtype=np.float32) 25 | self.rews = np.zeros(self.max_size, dtype=np.float32) 26 | self.next_obs = np.zeros(shape=[self.max_size] + self.obs_dim, dtype=np.float32) 27 | self.done = np.zeros(self.max_size, dtype=np.float32) 28 | 29 | def add(self, obs, act, rew, next_obs, done): 30 | self.obs[self.ptr] = obs 31 | self.acts[self.ptr] = act 32 | self.rews[self.ptr] = rew 33 | self.next_obs[self.ptr] = next_obs 34 | self.done[self.ptr] = float(done) 35 | 36 | self.ptr = (self.ptr + 1) % self.max_size 37 | self.crt_size = min(self.crt_size + 1, self.max_size) 38 | 39 | def sample(self): 40 | ind = np.random.choice(self.crt_size, size=self.batch_size, replace=True) # replace=False will make sample be slow 41 | return dict(obs=torch.FloatTensor(self.obs[ind]), 42 | acts=torch.FloatTensor(self.acts[ind]), 43 | rews=torch.FloatTensor(self.rews[ind]), # 1D 44 | next_obs=torch.FloatTensor(self.next_obs[ind]), 45 | done=torch.FloatTensor(self.done[ind])) # 1D 46 | 47 | 48 | class TrajectoryBuffer: 49 | """ 50 | Used to store experiences for a trajectory (e.g., in PPO) 51 | """ 52 | def __init__(self, obs_dim: Union[int, Sequence[int]], act_dim: int, capacity: int): 53 | 54 | # Transfer the "int" observation dimension to "list" 55 | if isinstance(obs_dim, int): 56 | self.obs_dim = [obs_dim] 57 | else: 58 | self.obs_dim = list(obs_dim) 59 | self.act_dim = act_dim 60 | self.max_size = capacity 61 | self.ptr = 0 # Point to the current position in the buffer 62 | 63 | # Use numpy.ndarray to initialize the replay buffer 64 | self.obs = np.zeros(shape=[self.max_size] + self.obs_dim, dtype=np.float32) 65 | self.acts = np.zeros((self.max_size, self.act_dim), dtype=np.float32) 66 | self.rews = np.zeros(self.max_size, dtype=np.float32) 67 | self.done = np.zeros(self.max_size, 
dtype=np.float32) 68 | self.log_probs = np.zeros(self.max_size, dtype=np.float32) # the log probability of choosing an action 69 | self.values = np.zeros(self.max_size + 1, dtype=np.float32) # the value of the state; values has length T+1, while the other arrays have length T 70 | self.rets = np.zeros(self.max_size, dtype=np.float32) # the return at time step t, also known as G_t 71 | self.gae_advs = np.zeros(self.max_size, dtype=np.float32) # the GAE advantage 72 | 73 | def add(self, obs, act, rew, done, log_prob, value): 74 | self.obs[self.ptr] = obs 75 | self.acts[self.ptr] = act 76 | self.rews[self.ptr] = rew 77 | self.done[self.ptr] = float(done) 78 | self.log_probs[self.ptr] = log_prob 79 | self.values[self.ptr] = value 80 | self.ptr += 1 81 | 82 | def finish_path(self, last_val=0, gamma=0.99, gae_lambda=0.95, gae_normalize=False): 83 | """ 84 | This method is called at the end of a trajectory 85 | """ 86 | self.values[-1] = last_val 87 | 88 | g = self.values[-1] 89 | gae_adv = 0 90 | for i in reversed(range(len(self.rews))): 91 | # compute G_t 92 | g = self.rews[i] + gamma * g * (1-self.done[i]) 93 | self.rets[i] = g 94 | # compute A_t 95 | delt = self.rews[i] + gamma * self.values[i + 1] * (1 - self.done[i]) - self.values[i] 96 | gae_adv = delt + gamma * gae_lambda * gae_adv * (1 - self.done[i]) 97 | self.gae_advs[i] = gae_adv 98 | 99 | if gae_normalize: 100 | self.gae_advs = (self.gae_advs - np.mean(self.gae_advs)) / np.std(self.gae_advs)  # normalize advantages to zero mean and unit std 101 | 102 | self.ptr = 0 103 | 104 | def sample(self): 105 | return dict(obs=torch.FloatTensor(self.obs), 106 | acts=torch.FloatTensor(self.acts), 107 | rews=torch.FloatTensor(self.rews), 108 | done=torch.FloatTensor(self.done), 109 | log_probs=torch.FloatTensor(self.log_probs), 110 | gae_advs=torch.FloatTensor(self.gae_advs), 111 | rets=torch.FloatTensor(self.rets)) 112 | 113 | 114 | class OfflineBuffer: 115 | """ 116 | Used in the offline setting 117 | """ 118 | def __init__(self, data: dict, batch_size: int): 119 | self.obs = data["obs"] 120 | self.acts = data["acts"] 121 | self.rews = data["rews"] 122 | self.next_obs = data["next_obs"] 123 | self.done = data["done"] 124 | 125 | self.data_num = self.acts.shape[0] 126 | self.batch_size = batch_size 127 | 128 | def sample(self) -> dict: 129 | ind = np.random.choice(self.data_num, size=self.batch_size, replace=True) # replace=False would make sampling slow 130 | return dict(obs=torch.FloatTensor(self.obs[ind]), 131 | acts=torch.FloatTensor(self.acts[ind]), 132 | rews=torch.FloatTensor(self.rews[ind]), # 1D 133 | next_obs=torch.FloatTensor(self.next_obs[ind]), 134 | done=torch.FloatTensor(self.done[ind])) # 1D 135 | 136 | 137 | class OfflineBufferAtari: 138 | """ 139 | Used in the offline setting 140 | """ 141 | def __init__(self, data: dict, batch_size: int): 142 | self.obs = data["obs"] # list 143 | self.acts = data["acts"] # ndarray 144 | self.rews = data["rews"] # ndarray 145 | self.done = data["done"] # ndarray 146 | 147 | self.data_num = self.acts.shape[0] 148 | self.batch_size = batch_size 149 | 150 | def sample(self) -> dict: 151 | ind = np.random.choice(self.data_num-1, size=self.batch_size, replace=True) # replace=False would make sampling slow 152 | obs = [self.obs[i] for i in ind] 153 | next_obs = [self.obs[i+1] for i in ind] 154 | return dict(obs=torch.FloatTensor(obs), 155 | acts=torch.FloatTensor(self.acts[ind]).reshape(-1, 1), 156 | rews=torch.FloatTensor(self.rews[ind]), # 1D 157 | next_obs=torch.FloatTensor(next_obs), 158 | done=torch.FloatTensor(self.done[ind])) # 1D 159 | 160 | 161 | class 
OfflineToOnlineBuffer: 162 | """ 163 | Used in offline to Online setting 164 | """ 165 | def __init__(self, data: dict, batch_size: int): 166 | self.obs = data["obs"] 167 | self.acts = data["acts"] 168 | self.rews = data["rews"] 169 | self.next_obs = data["next_obs"] 170 | self.done = data["done"] 171 | 172 | self.data_num = self.acts.shape[0] 173 | self.batch_size = batch_size 174 | self.max_size = self.data_num 175 | 176 | self.ptr = 0 # Point to the current position in the buffer 177 | self.crt_size = 0 # The current size of the buffer 178 | 179 | def add(self, obs, act, rew, next_obs, done): 180 | self.obs[self.ptr] = obs 181 | self.acts[self.ptr] = act 182 | self.rews[self.ptr] = rew 183 | self.next_obs[self.ptr] = next_obs 184 | self.done[self.ptr] = float(done) 185 | 186 | self.ptr = (self.ptr + 1) % self.max_size 187 | self.crt_size = min(self.crt_size + 1, self.max_size) 188 | 189 | def sample(self) -> dict: 190 | ind = np.random.choice(self.data_num, size=self.batch_size, replace=True) # replace=False will make sample be slow 191 | return dict(obs=torch.FloatTensor(self.obs[ind]), 192 | acts=torch.FloatTensor(self.acts[ind]), 193 | rews=torch.FloatTensor(self.rews[ind]), # 1D 194 | next_obs=torch.FloatTensor(self.next_obs[ind]), 195 | done=torch.FloatTensor(self.done[ind])) # 1D 196 | -------------------------------------------------------------------------------- /run/bcq_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.bcq import BCQ_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, CVAE, BCQ_Perturbation 13 | from utils import train_tools, data_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser(description='BCQ algorithm in mujoco environment') 18 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 19 | help='the name of environment') 20 | parser.add_argument('--batch_size', type=int, default=100, 21 | help='the size of batch that sampled from buffer') 22 | 23 | parser.add_argument('--max_train_step', type=int, default=1000000, 24 | help='the max train step') 25 | parser.add_argument('--log_interval', type=int, default=1000, 26 | help='The number of steps taken to record the model and the tensorboard') 27 | parser.add_argument('--train_id', type=str, default='bcq_mujoco_test', 28 | help='Path to save model and log tensorboard') 29 | parser.add_argument('--resume', action='store_true', default=False, 30 | help='whether load the last saved model to train') 31 | parser.add_argument('--device', type=str, default='cpu', 32 | help='Choose cpu or cuda') 33 | parser.add_argument('--show', action='store_true', default=False, 34 | help='show the trained model visually') 35 | parser.add_argument('--eval_freq', type=int, default=5000, 36 | help='how often (time steps) we evaluate') 37 | parser.add_argument('--seed', type=int, default=10, 38 | help='the random seed') 39 | 40 | args = parser.parse_args() 41 | 42 | torch.manual_seed(args.seed) 43 | np.random.seed(args.seed) 44 | 45 | # create environment 46 | env = gym.make(args.env) 47 | env.seed(args.seed) 48 | env.action_space.seed(args.seed) 49 | train_tools.EVAL_SEED = args.seed 50 | 51 | obs_dim = env.observation_space.shape[0] 52 | act_dim = 
env.action_space.shape[0] 53 | act_bound = env.action_space.high[0] 54 | 55 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 56 | hidden_activation=nn.ReLU) 57 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 58 | hidden_activation=nn.ReLU) 59 | 60 | perturbation_net = BCQ_Perturbation(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 61 | hidden_size=[400, 300], hidden_activation=nn.ReLU, 62 | phi=0.05) 63 | 64 | cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim, 65 | latent_dim=2 * act_dim, act_bound=act_bound) 66 | 67 | # create buffer 68 | if args.show: 69 | data_buffer = None 70 | else: 71 | data = data_tools.get_d4rl_dataset(env) 72 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 73 | 74 | agent = BCQ_Agent( 75 | # parameters of PolicyBase 76 | env=env, 77 | gamma=0.99, 78 | eval_freq=args.eval_freq, 79 | max_train_step=args.max_train_step, 80 | train_id=args.train_id, 81 | log_interval=args.log_interval, 82 | resume=args.resume, 83 | device=args.device, 84 | 85 | # Parameters of OfflineBase 86 | data_buffer=data_buffer, 87 | 88 | # Parameters of BCQ_Agent 89 | critic_net1=critic_net1, 90 | critic_net2=critic_net2, 91 | perturbation_net=perturbation_net, 92 | cvae_net=cvae_net, # generation model 93 | critic_lr=1e-3, 94 | per_lr=1e-3, 95 | cvae_lr=1e-3, 96 | tau=0.005, 97 | lmbda=0.75, # used for double clipped double q-learning 98 | ) 99 | 100 | if args.show: 101 | train_tools.evaluate(agent, 10, show=True) 102 | else: 103 | agent.learn() 104 | -------------------------------------------------------------------------------- /run/bear_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.bear import BEAR_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy, CVAE 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='CQL algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | 22 | # BEAR 23 | parser.add_argument('--mmd_sigma', type=float, default=20.0, 24 | help='the sigma used in mmd kernel') 25 | parser.add_argument('--kernel_type', type=str, default='gaussian', 26 | help='the type of mmd kernel(gaussian or laplacian)') 27 | parser.add_argument('--lagrange_thresh', type=float, default=0.05, 28 | help='the hyper-parameter used in automatic tuning alpha in cql loss') 29 | 30 | parser.add_argument('--max_train_step', type=int, default=1000000, 31 | help='the max train step') 32 | parser.add_argument('--log_interval', type=int, default=1000, 33 | help='The number of steps taken to record the model and the tensorboard') 34 | parser.add_argument('--train_id', type=str, default='bear_hopper-mujoco_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--resume', action='store_true', default=False, 37 | help='whether load the last saved model to train') 38 | parser.add_argument('--device', type=str, default='cpu', 39 | help='Choose cpu 
or cuda') 40 | parser.add_argument('--show', action='store_true', default=False, 41 | help='show the trained model visually') 42 | parser.add_argument('--eval_freq', type=int, default=5000, 43 | help='how often (time steps) we evaluate') 44 | parser.add_argument('--seed', type=int, default=10, 45 | help='the random seed') 46 | 47 | args = parser.parse_args() 48 | 49 | torch.manual_seed(args.seed) 50 | np.random.seed(args.seed) 51 | 52 | # create environment 53 | env = gym.make(args.env) 54 | env.seed(args.seed) 55 | env.action_space.seed(args.seed) 56 | train_tools.EVAL_SEED = args.seed 57 | 58 | obs_dim = env.observation_space.shape[0] 59 | act_dim = env.action_space.shape[0] 60 | act_bound = env.action_space.high[0] 61 | 62 | # create nets 63 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 64 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 65 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 66 | hidden_activation=nn.ReLU) 67 | 68 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 69 | hidden_activation=nn.ReLU) 70 | 71 | cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim, 72 | latent_dim=2 * act_dim, act_bound=act_bound) 73 | 74 | # create buffer 75 | if args.show: 76 | data_buffer = None 77 | else: 78 | data = data_tools.get_d4rl_dataset(env) 79 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 80 | 81 | agent = BEAR_Agent( 82 | # parameters of PolicyBase 83 | env=env, 84 | gamma=0.99, 85 | eval_freq=args.eval_freq, 86 | max_train_step=args.max_train_step, 87 | train_id=args.train_id, 88 | log_interval=args.log_interval, 89 | resume=args.resume, 90 | device=args.device, 91 | 92 | # Parameters of OfflineBase 93 | data_buffer=data_buffer, 94 | 95 | # Parameters of BEAR_Agent 96 | policy_net=policy_net, 97 | q_net1=q_net1, 98 | q_net2=q_net2, 99 | cvae_net=cvae_net, 100 | policy_lr=1e-4, 101 | qf_lr=3e-4, 102 | cvae_lr=3e-4, 103 | tau=0.05, 104 | lmbda=0.75, 105 | mmd_sigma=args.mmd_sigma, 106 | kernel_type=args.kernel_type, 107 | lagrange_thresh=args.lagrange_thresh, 108 | n_action_samples=100, 109 | n_target_samples=10, 110 | n_mmd_action_samples=4, 111 | warmup_step=40000, 112 | ) 113 | if args.show: 114 | train_tools.evaluate(agent, 10, show=True) 115 | else: 116 | agent.learn() 117 | -------------------------------------------------------------------------------- /run/cql_atari.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import numpy as np 9 | from algos.offline.cql import DiscreteCQL_Agent 10 | from common.buffers import OfflineBufferAtari 11 | from common.networks import ConvAtariQsNet 12 | from utils import train_tools 13 | from utils import data_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | parser = argparse.ArgumentParser(description='DiscreteCQL algorithm in atari environment') 19 | parser.add_argument('--env', type=str, default='pong-mixed-v0', 20 | help='the name of environment') 21 | parser.add_argument('--batch_size', type=int, default=32, 22 | help='the size of batch that sampled from buffer') 23 | 24 | parser.add_argument('--min_q_weight', type=float, default=5.0, 25 | help='the value of alpha, set to 5.0 or 10.0 if not using lagrange') 26 | 27 | parser.add_argument('--max_train_step', type=int, default=2000000, 28 | 
help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='cql_atari_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--eval_freq', type=int, default=5000, 40 | help='how often (time steps) we evaluate') 41 | parser.add_argument('--seed', type=int, default=10, 42 | help='the random seed') 43 | 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = gym.make(args.env, stack=True) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape 55 | act_dim = env.action_space.n 56 | 57 | Q_net = ConvAtariQsNet(num_frames_stack=4, act_dim=act_dim) 58 | 59 | # create buffer 60 | if args.show: 61 | data_buffer = None 62 | else: 63 | data = data_tools.get_d4rl_dataset_atari(env) 64 | data_buffer = OfflineBufferAtari(data=data, batch_size=args.batch_size) 65 | 66 | agent = DiscreteCQL_Agent( 67 | # parameters of PolicyBase 68 | env=env, 69 | gamma=0.99, 70 | eval_freq=args.eval_freq, 71 | max_train_step=args.max_train_step, 72 | train_id=args.train_id, 73 | log_interval=args.log_interval, 74 | resume=args.resume, 75 | device=args.device, 76 | 77 | # Parameters of OfflineBase 78 | data_buffer=data_buffer, 79 | 80 | # Parameters of CQL_Agent 81 | Q_net=Q_net, 82 | qf_lr=1e-4, 83 | eval_eps=0.001, 84 | target_update_freq=8000, 85 | min_q_weight=args.min_q_weight, # the value of alpha in CQL loss, set to 5.0 or 10.0 if not using lagrange 86 | ) 87 | 88 | if args.show: 89 | train_tools.evaluate(agent, 10, show=True) 90 | else: 91 | agent.learn() 92 | -------------------------------------------------------------------------------- /run/cql_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.cql import CQL_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='CQL algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 22 | help='whether automatic tune alpha') 23 | 24 | # CQL 25 | parser.add_argument('--min_q_weight', type=float, default=5.0, 26 | help='the value of alpha, set to 5.0 or 10.0 if not using lagrange') 27 | parser.add_argument('--entropy_backup', action='store_true', default=False, 28 | help='whether use sac style target Q with entropy') 29 | 
parser.add_argument('--max_q_backup', action='store_true', default=False, 30 | help='whether use max q backup') 31 | parser.add_argument('--with_lagrange', action='store_true', default=False, 32 | help='whether auto tune alpha in Conservative Q Loss(different from the alpha in sac)') 33 | parser.add_argument('--lagrange_thresh', type=float, default=5.0, 34 | help='the hyper-parameter used in automatic tuning alpha in cql loss') 35 | 36 | parser.add_argument('--max_train_step', type=int, default=1000000, 37 | help='the max train step') 38 | parser.add_argument('--log_interval', type=int, default=1000, 39 | help='The number of steps taken to record the model and the tensorboard') 40 | parser.add_argument('--train_id', type=str, default='cql_hopper-mujoco_test', 41 | help='Path to save model and log tensorboard') 42 | parser.add_argument('--resume', action='store_true', default=False, 43 | help='whether load the last saved model to train') 44 | parser.add_argument('--device', type=str, default='cpu', 45 | help='Choose cpu or cuda') 46 | parser.add_argument('--show', action='store_true', default=False, 47 | help='show the trained model visually') 48 | parser.add_argument('--eval_freq', type=int, default=5000, 49 | help='how often (time steps) we evaluate') 50 | parser.add_argument('--seed', type=int, default=10, 51 | help='the random seed') 52 | 53 | args = parser.parse_args() 54 | 55 | torch.manual_seed(args.seed) 56 | np.random.seed(args.seed) 57 | 58 | # create environment 59 | env = gym.make(args.env) 60 | env.seed(args.seed) 61 | env.action_space.seed(args.seed) 62 | train_tools.EVAL_SEED = args.seed 63 | 64 | obs_dim = env.observation_space.shape[0] 65 | act_dim = env.action_space.shape[0] 66 | act_bound = env.action_space.high[0] 67 | 68 | # create nets 69 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 70 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 71 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 72 | hidden_activation=nn.ReLU) 73 | 74 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 75 | hidden_activation=nn.ReLU) 76 | 77 | # create buffer 78 | if args.show: 79 | data_buffer = None 80 | else: 81 | data = data_tools.get_d4rl_dataset(env) 82 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 83 | 84 | agent = CQL_Agent( 85 | # parameters of PolicyBase 86 | env=env, 87 | gamma=0.99, 88 | eval_freq=args.eval_freq, 89 | max_train_step=args.max_train_step, 90 | train_id=args.train_id, 91 | log_interval=args.log_interval, 92 | resume=args.resume, 93 | device=args.device, 94 | 95 | # Parameters of OfflineBase 96 | data_buffer=data_buffer, 97 | 98 | # Parameters of CQL_Agent 99 | policy_net=policy_net, 100 | q_net1=q_net1, 101 | q_net2=q_net2, 102 | policy_lr=1e-4, 103 | qf_lr=3e-4, 104 | tau=0.05, 105 | alpha=0.5, 106 | auto_alpha_tuning=args.auto_alpha_tuning, 107 | min_q_weight=args.min_q_weight, 108 | entropy_backup=args.entropy_backup, 109 | max_q_backup=args.max_q_backup, 110 | with_lagrange=args.with_lagrange, 111 | lagrange_thresh=args.lagrange_thresh, 112 | n_action_samples=10, 113 | ) 114 | 115 | if args.show: 116 | train_tools.evaluate(agent, 10, show=True) 117 | else: 118 | agent.learn() 119 | -------------------------------------------------------------------------------- /run/ddpg_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.ddpg import DDPG_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='DDPG algorithm in gym environment') 17 | parser.add_argument('--env', type=str, default='Pendulum-v0', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=50000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=64, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--explore_step', type=int, default=2000, 24 | help='the steps of exploration before train') 25 | parser.add_argument('--eval_freq', type=int, default=1000, 26 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 27 | parser.add_argument('--max_train_step', type=int, default=100000, 28 | help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='ddpg_gym_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--seed', type=int, default=10, 40 | help='the random seed') 41 | 42 | args = parser.parse_args() 43 | 44 | torch.manual_seed(args.seed) 45 | np.random.seed(args.seed) 46 | 47 | # create environment 48 | env = gym.make(args.env) 49 | env.seed(args.seed) 50 | env.action_space.seed(args.seed) 51 | train_tools.EVAL_SEED = args.seed 52 | 53 | obs_dim = env.observation_space.shape[0] 54 | act_dim = env.action_space.shape[0] 55 | act_bound = env.action_space.high[0] 56 | 57 | # create nets 58 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 59 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 60 | 61 | critic_net = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 62 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 63 | 64 | # create buffer 65 | if args.show: 66 | replay_buffer = None 67 | else: 68 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 69 | act_dim=act_dim, 70 | capacity=args.capacity, 71 | batch_size=args.batch_size) 72 | 73 | agent = DDPG_Agent( 74 | # parameters of PolicyBase 75 | env=env, 76 | gamma=0.99, 77 | eval_freq=args.eval_freq, 78 | max_train_step=args.max_train_step, 79 | train_id=args.train_id, 80 | log_interval=args.log_interval, 81 | resume=args.resume, 82 | device=args.device, 83 | 84 | # Parameters of OffPolicyBase 85 | replay_buffer=replay_buffer, 86 | explore_step=args.explore_step, 87 | 88 | # Parameters of DDPG_Agent 89 | actor_net=actor_net, critic_net=critic_net, 90 | actor_lr=1e-4, critic_lr=1e-3, 91 | tau=0.005, 92 | gaussian_noise_sigma=0.1, 93 | ) 94 | 95 | if args.show: 96 | train_tools.evaluate(agent, 10, show=True) 97 | else: 98 | agent.learn() 99 | 
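The TD3, TD3-BC, SAC, and offline SAC agents above all refresh their target networks via `soft_target_update` imported from `utils/train_tools.py`, whose body is not part of this listing. Judging only from how it is called and from the inline comment `w' = tau*w + (1-tau)*w'`, a minimal Polyak-averaging sketch of such a helper might look like the following (the actual implementation in `utils/train_tools.py` may differ in detail):

```python
import torch


def soft_target_update(net: torch.nn.Module, target_net: torch.nn.Module, tau: float) -> None:
    """Polyak averaging: target_param <- tau * param + (1 - tau) * target_param."""
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * param.data)
```

With a small tau (0.005 in the TD3 agents, 0.05 in the SAC agents above), the target networks trail the online networks slowly, which keeps the bootstrapped Q targets stable.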
-------------------------------------------------------------------------------- /run/ddpg_unity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from algos.ddpg import DDPG_Agent 10 | from common.buffers import ReplayBuffer 11 | from common.networks import MLPQsaNet, DDPGMLPActor 12 | from utils import train_tools 13 | from mlagents_envs.environment import UnityEnvironment 14 | from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel 15 | from gym_unity.envs import UnityToGymWrapper 16 | 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='DDPG algorithm in gym environment') 20 | parser.add_argument('--env', type=str, default=None, 21 | help='the path of unity environment') 22 | parser.add_argument('--capacity', type=int, default=50000, 23 | help='the max size of data buffer') 24 | parser.add_argument('--batch_size', type=int, default=64, 25 | help='the size of batch that sampled from buffer') 26 | parser.add_argument('--explore_step', type=int, default=2000, 27 | help='the steps of exploration before train') 28 | # parser.add_argument('--eval_freq', type=int, default=1000, 29 | # help='how often (time steps) we evaluate') 30 | parser.add_argument('--max_train_step', type=int, default=100000, 31 | help='the max train step') 32 | parser.add_argument('--log_interval', type=int, default=1000, 33 | help='The number of steps taken to record the model and the tensorboard') 34 | parser.add_argument('--train_id', type=str, default='ddpg_unity_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--resume', action='store_true', default=False, 37 | help='whether load the last saved model to train') 38 | parser.add_argument('--device', type=str, default='cpu', 39 | help='Choose cpu or cuda') 40 | parser.add_argument('--show', action='store_true', default=False, 41 | help='show the trained model visually') 42 | parser.add_argument('--seed', type=int, default=10, 43 | help='the random seed') 44 | 45 | args = parser.parse_args() 46 | 47 | torch.manual_seed(args.seed) 48 | np.random.seed(args.seed) 49 | 50 | engine_configuration_channel = EngineConfigurationChannel() 51 | unity_env = UnityEnvironment(side_channels=[engine_configuration_channel], file_name=args.env) 52 | engine_configuration_channel.set_configuration_parameters( 53 | width=200, 54 | height=200, 55 | quality_level=5, 56 | time_scale=1 if args.show else 20, 57 | target_frame_rate=-1, 58 | capture_frame_rate=60) 59 | 60 | env = UnityToGymWrapper(unity_env=unity_env) 61 | env.seed(args.seed) 62 | env.action_space.seed(args.seed) 63 | train_tools.EVAL_SEED = args.seed 64 | 65 | obs_dim = env.observation_space.shape[0] 66 | act_dim = env.action_space.shape[0] 67 | act_bound = env.action_space.high[0] 68 | 69 | # create nets 70 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 71 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 72 | 73 | critic_net = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 74 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 75 | 76 | # create buffer 77 | if args.show: 78 | replay_buffer = None 79 | else: 80 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 81 | act_dim=act_dim, 82 | capacity=args.capacity, 83 | batch_size=args.batch_size) 84 | 85 | 
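    # NOTE: an assumed patch, not part of the original script. The '--eval_freq'
    # argument is commented out in the argparse block above, yet DDPG_Agent below
    # receives eval_freq=args.eval_freq, which would raise AttributeError as written.
    # The gym runner scripts describe a negative eval_freq as "do not evaluate",
    # so fall back to that behaviour when the flag is absent:
    if not hasattr(args, 'eval_freq'):
        args.eval_freq = -1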
agent = DDPG_Agent( 86 | # parameters of PolicyBase 87 | env=env, 88 | gamma=0.99, 89 | eval_freq=args.eval_freq, 90 | max_train_step=args.max_train_step, 91 | train_id=args.train_id, 92 | log_interval=args.log_interval, 93 | resume=args.resume, 94 | device=args.device, 95 | 96 | # Parameters of OffPolicyBase 97 | replay_buffer=replay_buffer, 98 | explore_step=args.explore_step, 99 | 100 | # Parameters of DDPG_Agent 101 | actor_net=actor_net, critic_net=critic_net, 102 | actor_lr=1e-4, critic_lr=1e-3, 103 | tau=0.005, 104 | gaussian_noise_sigma=0.1, 105 | ) 106 | 107 | if args.show: 108 | train_tools.evaluate_unity(agent, 10) 109 | else: 110 | agent.learn() 111 | -------------------------------------------------------------------------------- /run/ddqn_atari.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import numpy as np 8 | from algos.ddqn import DDQN_Agent 9 | from common.buffers import ReplayBuffer 10 | from common.networks import ConvAtariQsNet 11 | from utils import train_tools 12 | from utils.atari_wrappers import make_atari_env 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | parser = argparse.ArgumentParser(description='DDQN algorithm in atari environment') 18 | parser.add_argument('--env', type=str, default='PongNoFrameskip-v4', 19 | help='the name of environment') 20 | parser.add_argument('--capacity', type=int, default=100000, 21 | help='the max size of data buffer') 22 | parser.add_argument('--batch_size', type=int, default=32, 23 | help='the size of batch that sampled from buffer') 24 | parser.add_argument('--explore_step', type=int, default=20000, 25 | help='the steps of exploration before train') 26 | parser.add_argument('--eval_freq', type=int, default=10000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=2000000, 29 | help='the max train step') 30 | parser.add_argument('--log_interval', type=int, default=1000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--resume', action='store_true', default=False, 33 | help='whether load the last saved model to train') 34 | parser.add_argument('--train_id', type=str, default='ddqn_atari_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | parser.add_argument('--scale_obs', action='store_true', default=False, 43 | help='whether scale the obs to 0-1') 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = make_atari_env(args.env, scale_obs=args.scale_obs) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape 55 | act_dim = env.action_space.n 56 | 57 | Q_net = ConvAtariQsNet(num_frames_stack=4, act_dim=act_dim) 58 | 59 | # create buffer 60 | if args.show: 61 | replay_buffer = None 62 | else: 63 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 64 | capacity=args.capacity, batch_size=args.batch_size) 65 | 
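    # Illustrative sketch only -- algos/ddqn.py is not shown in this listing, and this
    # helper is not used by the script. Double DQN builds its bootstrap target by
    # selecting the next action with the online network and evaluating it with the
    # target network:
    #     y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))
    def double_dqn_target(q_net, target_q_net, rews, next_obs, done, gamma=0.99):
        with torch.no_grad():
            next_acts = q_net(next_obs).argmax(dim=1, keepdim=True)          # action selection: online net
            next_q = target_q_net(next_obs).gather(1, next_acts).squeeze(1)  # action evaluation: target net
            return rews + gamma * (1. - done) * next_q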
66 | agent = DDQN_Agent( 67 | # parameters of PolicyBase 68 | env=env, 69 | gamma=0.99, 70 | eval_freq=args.eval_freq, 71 | max_train_step=args.max_train_step, 72 | train_id=args.train_id, 73 | log_interval=args.log_interval, 74 | resume=args.resume, 75 | device=args.device, 76 | 77 | # Parameters of OffPolicyBase 78 | replay_buffer=replay_buffer, 79 | explore_step=args.explore_step, 80 | 81 | # Parameters of DDQN_Agent 82 | Q_net=Q_net, 83 | qf_lr=1e-4, 84 | initial_eps=0.1, 85 | end_eps=0.001, 86 | eps_decay_period=1000000, 87 | eval_eps=0.001, 88 | target_update_freq=1000, 89 | ) 90 | 91 | if args.show: 92 | train_tools.evaluate(agent, 10, show=True) 93 | else: 94 | agent.learn() 95 | 96 | -------------------------------------------------------------------------------- /run/ddqn_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.ddqn import DDQN_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsNet 13 | from utils import train_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | parser = argparse.ArgumentParser(description='DDQN algorithm in gym environment') 19 | parser.add_argument('--env', type=str, default='CartPole-v0', 20 | help='the name of environment') 21 | parser.add_argument('--capacity', type=int, default=5000, 22 | help='the max size of data buffer') 23 | parser.add_argument('--batch_size', type=int, default=128, 24 | help='the size of batch that sampled from buffer') 25 | parser.add_argument('--explore_step', type=int, default=500, 26 | help='the steps of exploration before train') 27 | parser.add_argument('--eval_freq', type=int, default=1000, 28 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 29 | parser.add_argument('--max_train_step', type=int, default=10000, 30 | help='the max train step') 31 | parser.add_argument('--log_interval', type=int, default=500, 32 | help='The number of steps taken to record the model and the tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--train_id', type=str, default='ddqn_gym_test', 36 | help='Path to save model and log tensorboard') 37 | parser.add_argument('--device', type=str, default='cpu', 38 | help='Choose cpu or cuda') 39 | parser.add_argument('--show', action='store_true', default=False, 40 | help='show the trained model visually') 41 | parser.add_argument('--seed', type=int, default=10, 42 | help='the random seed') 43 | 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | act_dim = env.action_space.n 56 | 57 | Q_net = MLPQsNet(obs_dim=obs_dim, act_dim=act_dim, 58 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 59 | 60 | # create buffer 61 | if args.show: 62 | replay_buffer = None 63 | else: 64 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 65 | capacity=args.capacity, batch_size=args.batch_size) 66 | 67 | agent = DDQN_Agent( 68 | # parameters of PolicyBase 69 | env=env, 70 | gamma=0.99, 71 | 
eval_freq=args.eval_freq, 72 | max_train_step=args.max_train_step, 73 | train_id=args.train_id, 74 | log_interval=args.log_interval, 75 | resume=args.resume, 76 | device=args.device, 77 | 78 | # Parameters of OffPolicyBase 79 | replay_buffer=replay_buffer, 80 | explore_step=args.explore_step, 81 | 82 | # Parameters of DDQN_Agent 83 | Q_net=Q_net, 84 | qf_lr=0.001, 85 | initial_eps=0.1, 86 | end_eps=0.001, 87 | eps_decay_period=2000, 88 | eval_eps=0.001, 89 | target_update_freq=10, 90 | ) 91 | 92 | if args.show: 93 | train_tools.evaluate(agent, 10, show=True) 94 | else: 95 | agent.learn() 96 | 97 | -------------------------------------------------------------------------------- /run/dqn_atari.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import numpy as np 8 | from algos.dqn import DQN_Agent 9 | from common.buffers import ReplayBuffer 10 | from common.networks import ConvAtariQsNet 11 | from utils import train_tools 12 | from utils.atari_wrappers import make_atari_env 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | parser = argparse.ArgumentParser(description='DQN algorithm in atari environment') 18 | parser.add_argument('--env', type=str, default='PongNoFrameskip-v4', 19 | help='the name of environment') 20 | parser.add_argument('--capacity', type=int, default=100000, 21 | help='the max size of data buffer') 22 | parser.add_argument('--batch_size', type=int, default=32, 23 | help='the size of batch that sampled from buffer') 24 | parser.add_argument('--explore_step', type=int, default=20000, 25 | help='the steps of exploration before train') 26 | parser.add_argument('--eval_freq', type=int, default=10000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=2000000, 29 | help='the max train step') 30 | parser.add_argument('--log_interval', type=int, default=1000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--resume', action='store_true', default=False, 33 | help='whether load the last saved model to train') 34 | parser.add_argument('--train_id', type=str, default='dqn_atari_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | parser.add_argument('--scale_obs', action='store_true', default=False, 43 | help='whether scale the obs to 0-1') 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = make_atari_env(args.env, scale_obs=args.scale_obs) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape 55 | act_dim = env.action_space.n 56 | 57 | Q_net = ConvAtariQsNet(num_frames_stack=4, act_dim=act_dim) 58 | 59 | # create buffer 60 | if args.show: 61 | replay_buffer = None 62 | else: 63 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 64 | capacity=args.capacity, batch_size=args.batch_size) 65 | 66 | agent = DQN_Agent( 67 | # parameters of PolicyBase 68 | env=env, 
69 | gamma=0.99, 70 | eval_freq=args.eval_freq, 71 | max_train_step=args.max_train_step, 72 | train_id=args.train_id, 73 | log_interval=args.log_interval, 74 | resume=args.resume, 75 | device=args.device, 76 | 77 | # Parameters of OffPolicyBase 78 | replay_buffer=replay_buffer, 79 | explore_step=args.explore_step, 80 | 81 | # Parameters of DQN_Agent 82 | Q_net=Q_net, 83 | qf_lr=1e-4, 84 | initial_eps=0.1, 85 | end_eps=0.001, 86 | eps_decay_period=1000000, 87 | eval_eps=0.001, 88 | target_update_freq=1000, 89 | ) 90 | 91 | if args.show: 92 | train_tools.evaluate(agent, 10, show=True) 93 | else: 94 | agent.learn() 95 | -------------------------------------------------------------------------------- /run/dqn_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.dqn import DQN_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsNet 13 | from utils import train_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | parser = argparse.ArgumentParser(description='DQN algorithm in gym environment') 19 | parser.add_argument('--env', type=str, default='CartPole-v0', 20 | help='the name of environment') 21 | parser.add_argument('--capacity', type=int, default=5000, 22 | help='the max size of data buffer') 23 | parser.add_argument('--batch_size', type=int, default=128, 24 | help='the size of batch that sampled from buffer') 25 | parser.add_argument('--explore_step', type=int, default=500, 26 | help='the steps of exploration before train') 27 | parser.add_argument('--eval_freq', type=int, default=1000, 28 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 29 | parser.add_argument('--max_train_step', type=int, default=10000, 30 | help='the max train step') 31 | parser.add_argument('--log_interval', type=int, default=500, 32 | help='The number of steps taken to record the model and the tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--train_id', type=str, default='dqn_gym_test', 36 | help='Path to save model and log tensorboard') 37 | parser.add_argument('--device', type=str, default='cpu', 38 | help='Choose cpu or cuda') 39 | parser.add_argument('--show', action='store_true', default=False, 40 | help='show the trained model visually') 41 | parser.add_argument('--seed', type=int, default=10, 42 | help='the random seed') 43 | 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | act_dim = env.action_space.n 56 | 57 | Q_net = MLPQsNet(obs_dim=obs_dim, act_dim=act_dim, 58 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 59 | 60 | # create buffer 61 | if args.show: 62 | replay_buffer = None 63 | else: 64 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 65 | capacity=args.capacity, batch_size=args.batch_size) 66 | 67 | agent = DQN_Agent( 68 | # parameters of PolicyBase 69 | env=env, 70 | gamma=0.99, 71 | eval_freq=args.eval_freq, 72 | max_train_step=args.max_train_step, 73 | 
train_id=args.train_id, 74 | log_interval=args.log_interval, 75 | resume=args.resume, 76 | device=args.device, 77 | 78 | # Parameters of OffPolicyBase 79 | replay_buffer=replay_buffer, 80 | explore_step=args.explore_step, 81 | 82 | # Parameters of DQN_Agent 83 | Q_net=Q_net, 84 | qf_lr=0.001, 85 | initial_eps=0.1, 86 | end_eps=0.001, 87 | eps_decay_period=2000, 88 | eval_eps=0.001, 89 | target_update_freq=10, 90 | ) 91 | 92 | if args.show: 93 | train_tools.evaluate(agent, 10, show=True) 94 | else: 95 | agent.learn() 96 | 97 | -------------------------------------------------------------------------------- /run/plas_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.plas import PLAS_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, CVAE, PLAS_Actor, DDPGMLPActor 13 | from utils import train_tools, data_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser(description='PLAS algorithm in mujoco environment') 18 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 19 | help='the name of environment') 20 | parser.add_argument('--batch_size', type=int, default=100, 21 | help='the size of batch that sampled from buffer') 22 | parser.add_argument('--max_train_step', type=int, default=500000, 23 | help='the max train step') 24 | parser.add_argument('--max_cvae_iterations', type=int, default=500000, 25 | help='the num of iterations when training CVAE model') 26 | parser.add_argument('--use_ptb', action='store_true', default=False, 27 | help='whether use perturbation layer') 28 | parser.add_argument('--log_interval', type=int, default=1000, 29 | help='The number of steps taken to record the model and the tensorboard') 30 | parser.add_argument('--train_id', type=str, default='plas_mujoco_test', 31 | help='Path to save model and log tensorboard') 32 | parser.add_argument('--resume', action='store_true', default=False, 33 | help='whether load the last saved model to train') 34 | parser.add_argument('--device', type=str, default='cpu', 35 | help='Choose cpu or cuda') 36 | parser.add_argument('--show', action='store_true', default=False, 37 | help='show the trained model visually') 38 | parser.add_argument('--eval_freq', type=int, default=5000, 39 | help='how often (time steps) we evaluate') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | 43 | args = parser.parse_args() 44 | 45 | torch.manual_seed(args.seed) 46 | np.random.seed(args.seed) 47 | 48 | # create environment 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | act_dim = env.action_space.shape[0] 56 | act_bound = env.action_space.high[0] 57 | 58 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 59 | hidden_activation=nn.ReLU) 60 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 61 | hidden_activation=nn.ReLU) 62 | 63 | cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim, 64 | latent_dim=2 * act_dim, act_bound=act_bound) 65 | 66 | actor_net = PLAS_Actor(obs_dim=obs_dim, act_dim=act_dim, latent_act_dim=2 * act_dim, 67 | act_bound=act_bound, 
latent_act_bound=2, 68 | actor_hidden_size=[400, 300], ptb_hidden_size=[400, 300], hidden_activation=nn.ReLU, 69 | use_ptb=args.use_ptb, phi=0.05) 70 | 71 | # create buffer 72 | if args.show: 73 | data_buffer = None 74 | else: 75 | data = data_tools.get_d4rl_dataset(env) 76 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 77 | 78 | agent = PLAS_Agent( 79 | # parameters of PolicyBase 80 | env=env, 81 | gamma=0.99, 82 | eval_freq=args.eval_freq, 83 | max_train_step=args.max_train_step, 84 | train_id=args.train_id, 85 | log_interval=args.log_interval, 86 | resume=args.resume, 87 | device=args.device, 88 | 89 | # Parameters of OfflineBase 90 | data_buffer=data_buffer, 91 | 92 | # Parameters of PLAS_Agent 93 | critic_net1=critic_net1, 94 | critic_net2=critic_net2, 95 | actor_net=actor_net, 96 | cvae_net=cvae_net, # generation model 97 | critic_lr=1e-3, 98 | actor_lr=1e-4, 99 | cvae_lr=1e-4, 100 | tau=0.005, 101 | lmbda=1, # used for double clipped double q-learning 102 | max_cvae_iterations=args.max_cvae_iterations, 103 | ) 104 | 105 | if args.show: 106 | train_tools.evaluate(agent, 10, show=True) 107 | else: 108 | agent.learn() 109 | -------------------------------------------------------------------------------- /run/ppo_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | # sys.path.append(str(Path(__file__).absolute().parent.parent)) 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | import argparse 8 | import gym 9 | from gym.spaces import Box, Discrete 10 | import torch 11 | import torch.nn as nn 12 | import numpy as np 13 | from algos.ppo import PPO_Agent 14 | from common.buffers import TrajectoryBuffer 15 | from common.networks import MLPVsNet, MLPCategoricalActor, MLPGaussianActor 16 | from utils import train_tools 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='PPO algorithm in gym environment') 20 | parser.add_argument('--env', type=str, default='CartPole-v0', 21 | help='the name of environment') 22 | parser.add_argument('--gae_norm', action='store_true', default=False, 23 | help='whether normalize the GAE') 24 | parser.add_argument('--traj_length', type=int, default=128, 25 | help='the length of trajectory') 26 | parser.add_argument('--eval_freq', type=int, default=1000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=100000, 29 | help='the max time step to train') 30 | parser.add_argument('--log_interval', type=int, default=1000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--train_id', type=str, default='ppo_gym_test', 33 | help='Path to save model and log tensorboard') 34 | parser.add_argument('--resume', action='store_true', default=False, 35 | help='whether load the last saved model to train') 36 | parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | 43 | args = parser.parse_args() 44 | 45 | torch.manual_seed(args.seed) 46 | np.random.seed(args.seed) 47 | 48 | # create environment 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 
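# --- Editor's illustrative sketch (not part of the original ppo_gym.py) ---
# The PPO_Agent created further down is configured with gae_lambda=0.95 and gae_normalize.
# Generalized Advantage Estimation is typically computed by a backward recursion over the
# collected trajectory, roughly as below; the actual computation lives in algos/ppo.py /
# common/buffers.py and may differ in detail.
def gae_sketch(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """Backward recursion: A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}."""
    advantages = [0.0] * len(rewards)
    next_adv, next_value = 0.0, last_value
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * (1.0 - dones[t]) * next_value - values[t]
        next_adv = delta + gamma * lam * (1.0 - dones[t]) * next_adv
        advantages[t] = next_adv
        next_value = values[t]
    return advantages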
52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | 56 | # create nets 57 | if isinstance(env.action_space, Discrete): 58 | act_num = env.action_space.n 59 | buffer_act_dim = 1 60 | actor_net = MLPCategoricalActor(obs_dim=obs_dim, act_num=act_num, 61 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 62 | elif isinstance(env.action_space, Box): 63 | act_dim = env.action_space.shape[0] 64 | buffer_act_dim = act_dim 65 | actor_net = MLPGaussianActor(obs_dim=obs_dim, act_dim=act_dim, 66 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 67 | 68 | critic_net = MLPVsNet(obs_dim=obs_dim, hidden_size=[64, 64], hidden_activation=nn.Tanh) 69 | 70 | 71 | # create buffer 72 | if args.show: 73 | trajectory_buffer = None 74 | else: 75 | trajectory_buffer = TrajectoryBuffer(obs_dim=obs_dim, 76 | act_dim=buffer_act_dim, 77 | capacity=args.traj_length) 78 | 79 | agent = PPO_Agent( 80 | # parameters of PolicyBase 81 | env=env, 82 | gamma=0.99, 83 | eval_freq=args.eval_freq, 84 | max_train_step=args.max_train_step, 85 | train_id=args.train_id, 86 | log_interval=args.log_interval, 87 | resume=args.resume, 88 | device=args.device, 89 | 90 | # Parameters of PPO_Agent 91 | trajectory_buffer=trajectory_buffer, 92 | actor_net=actor_net, 93 | critic_net=critic_net, 94 | actor_lr=3e-4, 95 | critic_lr=1e-3, 96 | gae_lambda=0.95, 97 | gae_normalize=args.gae_norm, 98 | clip_pram=0.2, 99 | trajectory_length=args.traj_length, # the length of a trajectory_ 100 | train_actor_iters=10, 101 | train_critic_iters=10, 102 | ) 103 | 104 | if args.show: 105 | train_tools.evaluate(agent, 10, show=True) 106 | else: 107 | agent.learn() 108 | -------------------------------------------------------------------------------- /run/ppo_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | # sys.path.append(str(Path(__file__).absolute().parent.parent)) 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | import argparse 8 | import gym 9 | from gym.spaces import Box, Discrete 10 | import torch 11 | import torch.nn as nn 12 | import numpy as np 13 | from algos.ppo import PPO_Agent 14 | from common.buffers import TrajectoryBuffer 15 | from common.networks import MLPVsNet, MLPCategoricalActor, MLPGaussianActor 16 | from utils import train_tools 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='PPO algorithm in mujoco environment') 20 | parser.add_argument('--env', type=str, default='Hopper-v2', 21 | help='the name of environment') 22 | parser.add_argument('--gae_norm', action='store_true', default=False, 23 | help='whether normalize the GAE') 24 | parser.add_argument('--traj_length', type=int, default=2048, 25 | help='the length of trajectory') 26 | parser.add_argument('--eval_freq', type=int, default=5000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=1000000, 29 | help='the max time step to train') 30 | parser.add_argument('--log_interval', type=int, default=5000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--train_id', type=str, default='ppo_mujoco_test', 33 | help='Path to save model and log tensorboard') 34 | parser.add_argument('--resume', action='store_true', default=False, 35 | help='whether load the last saved model to train') 36 | 
parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | 43 | args = parser.parse_args() 44 | 45 | torch.manual_seed(args.seed) 46 | np.random.seed(args.seed) 47 | 48 | # create environment 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | 56 | # create nets 57 | if isinstance(env.action_space, Discrete): 58 | act_num = env.action_space.n 59 | buffer_act_dim = 1 60 | actor_net = MLPCategoricalActor(obs_dim=obs_dim, act_num=act_num, 61 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 62 | elif isinstance(env.action_space, Box): 63 | act_dim = env.action_space.shape[0] 64 | buffer_act_dim = act_dim 65 | actor_net = MLPGaussianActor(obs_dim=obs_dim, act_dim=act_dim, 66 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 67 | 68 | critic_net = MLPVsNet(obs_dim=obs_dim, hidden_size=[64, 64], hidden_activation=nn.Tanh) 69 | 70 | 71 | # create buffer 72 | if args.show: 73 | trajectory_buffer = None 74 | else: 75 | trajectory_buffer = TrajectoryBuffer(obs_dim=obs_dim, 76 | act_dim=buffer_act_dim, 77 | capacity=args.traj_length) 78 | 79 | agent = PPO_Agent( 80 | # parameters of PolicyBase 81 | env=env, 82 | gamma=0.99, 83 | eval_freq=args.eval_freq, 84 | max_train_step=args.max_train_step, 85 | train_id=args.train_id, 86 | log_interval=args.log_interval, 87 | resume=args.resume, 88 | device=args.device, 89 | 90 | # Parameters of PPO_Agent 91 | trajectory_buffer=trajectory_buffer, 92 | actor_net=actor_net, 93 | critic_net=critic_net, 94 | actor_lr=3e-4, 95 | critic_lr=1e-3, 96 | gae_lambda=0.95, 97 | gae_normalize=args.gae_norm, 98 | clip_pram=0.2, 99 | trajectory_length=args.traj_length, # the length of a trajectory_ 100 | train_actor_iters=80, 101 | train_critic_iters=80, 102 | ) 103 | 104 | if args.show: 105 | train_tools.evaluate(agent, 10, show=True) 106 | else: 107 | agent.learn() 108 | -------------------------------------------------------------------------------- /run/sac_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | # sys.path.append(str(Path(__file__).absolute().parent.parent)) 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | import argparse 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import numpy as np 12 | from algos.sac import SAC_Agent 13 | from common.buffers import ReplayBuffer 14 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 15 | from utils import train_tools 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser(description='SAC algorithm in gym environment') 19 | parser.add_argument('--env', type=str, default='Pendulum-v0', 20 | help='the name of environment') 21 | parser.add_argument('--capacity', type=int, default=50000, 22 | help='the max size of data buffer') 23 | parser.add_argument('--batch_size', type=int, default=128, 24 | help='the size of batch that sampled from buffer') 25 | parser.add_argument('--alpha', type=float, default=0.5, 26 | help='the coefficient of entropy') 27 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 28 | help='whether automatic tune 
alpha') 29 | parser.add_argument('--explore_step', type=int, default=2000, 30 | help='the steps of exploration before train') 31 | parser.add_argument('--eval_freq', type=int, default=1000, 32 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 33 | parser.add_argument('--max_train_step', type=int, default=100000, 34 | help='the max train step') 35 | parser.add_argument('--log_interval', type=int, default=1000, 36 | help='The number of steps taken to record the model and the tensorboard') 37 | parser.add_argument('--train_id', type=str, default='sac_gym_test', 38 | help='Path to save model and log tensorboard') 39 | parser.add_argument('--resume', action='store_true', default=False, 40 | help='whether load the last saved model to train') 41 | parser.add_argument('--device', type=str, default='cpu', 42 | help='Choose cpu or cuda') 43 | parser.add_argument('--show', action='store_true', default=False, 44 | help='show the trained model visually') 45 | parser.add_argument('--seed', type=int, default=10, 46 | help='the random seed') 47 | 48 | args = parser.parse_args() 49 | 50 | torch.manual_seed(args.seed) 51 | np.random.seed(args.seed) 52 | 53 | # create environment 54 | env = gym.make(args.env) 55 | env.seed(args.seed) 56 | env.action_space.seed(args.seed) 57 | train_tools.EVAL_SEED = args.seed 58 | 59 | obs_dim = env.observation_space.shape[0] 60 | act_dim = env.action_space.shape[0] 61 | act_bound = env.action_space.high[0] 62 | 63 | # create nets 64 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 65 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 66 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 67 | hidden_activation=nn.ReLU) 68 | 69 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 70 | hidden_activation=nn.ReLU) 71 | 72 | # create buffer 73 | if args.show: 74 | replay_buffer = None 75 | else: 76 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 77 | act_dim=act_dim, 78 | capacity=args.capacity, 79 | batch_size=args.batch_size) 80 | 81 | agent = SAC_Agent( 82 | # parameters of PolicyBase 83 | env=env, 84 | gamma=0.99, 85 | eval_freq=args.eval_freq, 86 | max_train_step=args.max_train_step, 87 | train_id=args.train_id, 88 | log_interval=args.log_interval, 89 | resume=args.resume, 90 | device=args.device, 91 | 92 | # Parameters of OffPolicyBase 93 | replay_buffer=replay_buffer, 94 | explore_step=args.explore_step, 95 | 96 | # Parameters of SAC_Agent 97 | policy_net=policy_net, 98 | q_net1=q_net1, # critic 99 | q_net2=q_net2, 100 | policy_lr=4e-3, 101 | qf_lr=4e-3, 102 | tau=0.005, 103 | alpha=args.alpha, 104 | auto_alpha_tuning=args.auto_alpha_tuning, 105 | ) 106 | 107 | if args.show: 108 | train_tools.evaluate(agent, 10, show=True) 109 | else: 110 | agent.learn() 111 | -------------------------------------------------------------------------------- /run/sac_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.sac import SAC_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = 
argparse.ArgumentParser(description='SAC algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='Hopper-v2', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=1000000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=256, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--alpha', type=float, default=0.5, 24 | help='the coefficient of entropy') 25 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 26 | help='whether automatic tune alpha') 27 | parser.add_argument('--explore_step', type=int, default=20000, 28 | help='the steps of exploration before train') 29 | parser.add_argument('--eval_freq', type=int, default=5000, 30 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 31 | parser.add_argument('--max_train_step', type=int, default=3000000, 32 | help='the max train step') 33 | parser.add_argument('--log_interval', type=int, default=1000, 34 | help='The number of steps taken to record the model and the tensorboard') 35 | parser.add_argument('--train_id', type=str, default='sac_mujoco_test', 36 | help='Path to save model and log tensorboard') 37 | parser.add_argument('--resume', action='store_true', default=False, 38 | help='whether load the last saved model to train') 39 | parser.add_argument('--device', type=str, default='cpu', 40 | help='Choose cpu or cuda') 41 | parser.add_argument('--show', action='store_true', default=False, 42 | help='show the trained model visually') 43 | parser.add_argument('--seed', type=int, default=10, 44 | help='the random seed') 45 | 46 | args = parser.parse_args() 47 | 48 | torch.manual_seed(args.seed) 49 | np.random.seed(args.seed) 50 | 51 | # create environment 52 | env = gym.make(args.env) 53 | env.seed(args.seed) 54 | env.action_space.seed(args.seed) 55 | train_tools.EVAL_SEED = args.seed 56 | 57 | obs_dim = env.observation_space.shape[0] 58 | act_dim = env.action_space.shape[0] 59 | act_bound = env.action_space.high[0] 60 | 61 | # create nets 62 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 63 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 64 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 65 | hidden_activation=nn.ReLU) 66 | 67 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 68 | hidden_activation=nn.ReLU) 69 | 70 | # create buffer 71 | if args.show: 72 | replay_buffer = None 73 | else: 74 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 75 | act_dim=act_dim, 76 | capacity=args.capacity, 77 | batch_size=args.batch_size) 78 | 79 | agent = SAC_Agent( 80 | # parameters of PolicyBase 81 | env=env, 82 | gamma=0.99, 83 | eval_freq=args.eval_freq, 84 | max_train_step=args.max_train_step, 85 | train_id=args.train_id, 86 | log_interval=args.log_interval, 87 | resume=args.resume, 88 | device=args.device, 89 | 90 | # Parameters of OffPolicyBase 91 | replay_buffer=replay_buffer, 92 | explore_step=args.explore_step, 93 | 94 | # Parameters of SAC_Agent 95 | policy_net=policy_net, 96 | q_net1=q_net1, # critic 97 | q_net2=q_net2, 98 | policy_lr=3e-4, 99 | qf_lr=3e-4, 100 | tau=0.005, 101 | alpha=args.alpha, 102 | auto_alpha_tuning=args.auto_alpha_tuning, 103 | ) 104 | 105 | if args.show: 106 | train_tools.evaluate(agent, 10, show=True) 107 | else: 108 | agent.learn() 109 | 
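Editor's note: the SAC runners above expose an `--auto_alpha_tuning` flag, but the corresponding update lives in algos/sac.py, which is not shown here. The snippet below is a minimal, self-contained sketch of the standard automatic entropy-temperature update from the SAC paper, assuming the conventional target entropy of -act_dim; the names (`log_alpha`, `alpha_optim`, `alpha_update_step`) are illustrative and not taken from this repository.

```
import torch

# Standard SAC temperature update (illustrative; names are not from algos/sac.py).
act_dim = 3                      # e.g. the Hopper-v2 action dimension
target_entropy = -act_dim        # conventional heuristic target
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)

def alpha_update_step(log_prob):
    """log_prob: log pi(a|s) for a batch of freshly sampled actions."""
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()   # alpha used in the actor and critic losses
```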
-------------------------------------------------------------------------------- /run/sac_offline_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.sac_offline import SAC_Offline_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='Offline SAC in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 22 | help='whether automatic tune alpha') 23 | 24 | parser.add_argument('--max_train_step', type=int, default=1000000, 25 | help='the max train step') 26 | parser.add_argument('--log_interval', type=int, default=1000, 27 | help='The number of steps taken to record the model and the tensorboard') 28 | parser.add_argument('--train_id', type=str, default='sac_offline_mujoco_test', 29 | help='Path to save model and log tensorboard') 30 | parser.add_argument('--resume', action='store_true', default=False, 31 | help='whether load the last saved model to train') 32 | parser.add_argument('--device', type=str, default='cpu', 33 | help='Choose cpu or cuda') 34 | parser.add_argument('--show', action='store_true', default=False, 35 | help='show the trained model visually') 36 | parser.add_argument('--eval_freq', type=int, default=5000, 37 | help='how often (time steps) we evaluate') 38 | parser.add_argument('--seed', type=int, default=10, 39 | help='the random seed') 40 | 41 | args = parser.parse_args() 42 | 43 | torch.manual_seed(args.seed) 44 | np.random.seed(args.seed) 45 | 46 | # create environment 47 | env = gym.make(args.env) 48 | env.seed(args.seed) 49 | env.action_space.seed(args.seed) 50 | train_tools.EVAL_SEED = args.seed 51 | 52 | obs_dim = env.observation_space.shape[0] 53 | act_dim = env.action_space.shape[0] 54 | act_bound = env.action_space.high[0] 55 | 56 | # create nets 57 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 58 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 59 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 60 | hidden_activation=nn.ReLU) 61 | 62 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 63 | hidden_activation=nn.ReLU) 64 | 65 | # create buffer 66 | if args.show: 67 | data_buffer = None 68 | else: 69 | data = data_tools.get_d4rl_dataset(env) 70 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 71 | 72 | agent = SAC_Offline_Agent( 73 | # parameters of PolicyBase 74 | env=env, 75 | gamma=0.99, 76 | eval_freq=args.eval_freq, 77 | max_train_step=args.max_train_step, 78 | train_id=args.train_id, 79 | log_interval=args.log_interval, 80 | resume=args.resume, 81 | device=args.device, 82 | 83 | # Parameters of OfflineBase 84 | data_buffer=data_buffer, 85 | 86 | # Parameters of SAC_Offline_Agent 87 | policy_net=policy_net, 88 | q_net1=q_net1, 89 | q_net2=q_net2, 90 | policy_lr=1e-4, 91 
| qf_lr=3e-4, 92 | tau=0.05, 93 | alpha=0.5, 94 | auto_alpha_tuning=args.auto_alpha_tuning 95 | ) 96 | 97 | if args.show: 98 | train_tools.evaluate(agent, 10, show=True) 99 | else: 100 | agent.learn() 101 | -------------------------------------------------------------------------------- /run/sac_unity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from algos.sac import SAC_Agent 10 | from common.buffers import ReplayBuffer 11 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 12 | from utils import train_tools 13 | from mlagents_envs.environment import UnityEnvironment 14 | from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel 15 | from gym_unity.envs import UnityToGymWrapper 16 | 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='SAC algorithm in unity environment') 20 | parser.add_argument('--env', type=str, default=None, 21 | help='the path of unity environment') 22 | parser.add_argument('--capacity', type=int, default=50000, 23 | help='the max size of data buffer') 24 | parser.add_argument('--batch_size', type=int, default=128, 25 | help='the size of batch that sampled from buffer') 26 | parser.add_argument('--alpha', type=float, default=0.5, 27 | help='the coefficient of entropy') 28 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 29 | help='whether automatic tune alpha') 30 | parser.add_argument('--explore_step', type=int, default=2000, 31 | help='the steps of exploration before train') 32 | # parser.add_argument('--eval_freq', type=int, default=1000, 33 | # help='how often (time steps) we evaluate') 34 | parser.add_argument('--max_train_step', type=int, default=100000, 35 | help='the max train step') 36 | parser.add_argument('--log_interval', type=int, default=1000, 37 | help='The number of steps taken to record the model and the tensorboard') 38 | parser.add_argument('--train_id', type=str, default='sac_unity_test', 39 | help='Path to save model and log tensorboard') 40 | parser.add_argument('--resume', action='store_true', default=False, 41 | help='whether load the last saved model to train') 42 | parser.add_argument('--device', type=str, default='cpu', 43 | help='Choose cpu or cuda') 44 | parser.add_argument('--show', action='store_true', default=False, 45 | help='show the trained model visually') 46 | parser.add_argument('--seed', type=int, default=10, 47 | help='the random seed') 48 | 49 | args = parser.parse_args() 50 | 51 | torch.manual_seed(args.seed) 52 | np.random.seed(args.seed) 53 | 54 | engine_configuration_channel = EngineConfigurationChannel() 55 | unity_env = UnityEnvironment(side_channels=[engine_configuration_channel], file_name=args.env) 56 | engine_configuration_channel.set_configuration_parameters( 57 | width=200, 58 | height=200, 59 | quality_level=5, 60 | time_scale=1 if args.show else 20, 61 | target_frame_rate=-1, 62 | capture_frame_rate=60) 63 | 64 | env = UnityToGymWrapper(unity_env=unity_env) 65 | env.seed(args.seed) 66 | env.action_space.seed(args.seed) 67 | train_tools.EVAL_SEED = args.seed 68 | 69 | obs_dim = env.observation_space.shape[0] 70 | act_dim = env.action_space.shape[0] 71 | act_bound = env.action_space.high[0] 72 | 73 | # create nets 74 | policy_net = 
MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 75 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 76 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 77 | hidden_activation=nn.ReLU) 78 | 79 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 80 | hidden_activation=nn.ReLU) 81 | 82 | # create buffer 83 | if args.show: 84 | replay_buffer = None 85 | else: 86 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 87 | act_dim=act_dim, 88 | capacity=args.capacity, 89 | batch_size=args.batch_size) 90 | 91 | agent = SAC_Agent( 92 | # parameters of PolicyBase 93 | env=env, 94 | gamma=0.99, 95 | eval_freq=-1,  # '--eval_freq' is commented out in the arg parser above; a value < 0 disables evaluation during training 96 | max_train_step=args.max_train_step, 97 | train_id=args.train_id, 98 | log_interval=args.log_interval, 99 | resume=args.resume, 100 | device=args.device, 101 | 102 | # Parameters of OffPolicyBase 103 | replay_buffer=replay_buffer, 104 | explore_step=args.explore_step, 105 | 106 | # Parameters of SAC_Agent 107 | policy_net=policy_net, 108 | q_net1=q_net1, # critic 109 | q_net2=q_net2, 110 | policy_lr=3e-4, 111 | qf_lr=3e-4, 112 | tau=0.005, 113 | alpha=args.alpha, 114 | auto_alpha_tuning=args.auto_alpha_tuning, 115 | ) 116 | 117 | if args.show: 118 | train_tools.evaluate_unity(agent, 10) 119 | else: 120 | agent.learn() 121 | -------------------------------------------------------------------------------- /run/td3_bc_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.td3_bc import TD3_BC_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='TD3_BC algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | 22 | parser.add_argument('--max_train_step', type=int, default=1000000, 23 | help='the max train step') 24 | parser.add_argument('--log_interval', type=int, default=1000, 25 | help='The number of steps taken to record the model and the tensorboard') 26 | parser.add_argument('--train_id', type=str, default='td3bc_mujoco_test', 27 | help='Path to save model and log tensorboard') 28 | parser.add_argument('--resume', action='store_true', default=False, 29 | help='whether load the last saved model to train') 30 | parser.add_argument('--device', type=str, default='cpu', 31 | help='Choose cpu or cuda') 32 | parser.add_argument('--show', action='store_true', default=False, 33 | help='show the trained model visually') 34 | parser.add_argument('--eval_freq', type=int, default=5000, 35 | help='how often (time steps) we evaluate') 36 | parser.add_argument('--seed', type=int, default=10, 37 | help='the random seed') 38 | 39 | args = parser.parse_args() 40 | 41 | torch.manual_seed(args.seed) 42 | np.random.seed(args.seed) 43 | 44 | # create environment 45 | env = gym.make(args.env) 46 | env.seed(args.seed) 47 | env.action_space.seed(args.seed) 48 | train_tools.EVAL_SEED = args.seed 49 | 50 | obs_dim = env.observation_space.shape[0]
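# --- Editor's illustrative sketch (not part of the original td3_bc_mujoco.py) ---
# The TD3_BC_Agent created below is passed alpha=2.5. In TD3+BC this weight trades off the
# Q-value term against a behavior-cloning term in the actor loss, roughly as sketched here;
# the actual update lives in algos/offline/td3_bc.py and may differ in detail.
import torch.nn.functional as F
def td3_bc_actor_loss(q_values, policy_actions, dataset_actions, alpha=2.5):
    """Illustrative TD3+BC actor loss: -lambda * mean(Q) + MSE(pi(s), a), with lambda = alpha / mean|Q|."""
    lmbda = alpha / q_values.abs().mean().detach()
    return -lmbda * q_values.mean() + F.mse_loss(policy_actions, dataset_actions)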
51 | act_dim = env.action_space.shape[0] 52 | act_bound = env.action_space.high[0] 53 | 54 | # create nets 55 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 56 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 57 | 58 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 59 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 60 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 61 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 62 | 63 | # create buffer 64 | if args.show: 65 | data_buffer = None 66 | else: 67 | data = data_tools.get_d4rl_dataset(env) 68 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 69 | 70 | # create agent 71 | agent = TD3_BC_Agent( 72 | # parameters of PolicyBase 73 | env=env, 74 | gamma=0.99, 75 | eval_freq=args.eval_freq, 76 | max_train_step=args.max_train_step, 77 | train_id=args.train_id, 78 | log_interval=args.log_interval, 79 | resume=args.resume, 80 | device=args.device, 81 | 82 | # Parameters of OfflineBase 83 | data_buffer=data_buffer, 84 | 85 | # Parameters of TD3BC_Agent 86 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 87 | actor_lr=3e-4, critic_lr=3e-4, 88 | tau=0.005, 89 | policy_noise=0.2, 90 | noise_clip=0.5, 91 | policy_delay=2, 92 | alpha=2.5, 93 | ) 94 | 95 | if args.show: 96 | train_tools.evaluate(agent, 10, show=True) 97 | else: 98 | agent.learn() 99 | -------------------------------------------------------------------------------- /run/td3_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.td3 import TD3_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='TD3 algorithm in gym environment') 17 | parser.add_argument('--env', type=str, default='Pendulum-v0', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=50000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=100, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--explore_step', type=int, default=2000, 24 | help='the steps of exploration before train') 25 | parser.add_argument('--eval_freq', type=int, default=1000, 26 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 27 | parser.add_argument('--max_train_step', type=int, default=100000, 28 | help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='td3_gym_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--seed', type=int, default=10, 40 | help='the random seed') 41 | 42 | args = 
parser.parse_args() 43 | 44 | torch.manual_seed(args.seed) 45 | np.random.seed(args.seed) 46 | 47 | # create environment 48 | env = gym.make(args.env) 49 | env.seed(args.seed) 50 | env.action_space.seed(args.seed) 51 | train_tools.EVAL_SEED = args.seed 52 | 53 | obs_dim = env.observation_space.shape[0] 54 | act_dim = env.action_space.shape[0] 55 | act_bound = env.action_space.high[0] 56 | 57 | # create nets 58 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 59 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 60 | 61 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 62 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 63 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 64 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 65 | 66 | # create buffer 67 | if args.show: 68 | replay_buffer = None 69 | else: 70 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 71 | act_dim=act_dim, 72 | capacity=args.capacity, 73 | batch_size=args.batch_size) 74 | 75 | agent = TD3_Agent( 76 | # parameters of PolicyBase 77 | env=env, 78 | gamma=0.99, 79 | eval_freq=args.eval_freq, 80 | max_train_step=args.max_train_step, 81 | train_id=args.train_id, 82 | log_interval=args.log_interval, 83 | resume=args.resume, 84 | device=args.device, 85 | 86 | # Parameters of OffPolicyBase 87 | replay_buffer=replay_buffer, 88 | explore_step=args.explore_step, 89 | 90 | # Parameters of TD3_Agent 91 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 92 | actor_lr=1e-3, critic_lr=1e-3, # Or 3e-4 93 | tau=0.005, 94 | act_noise=0.1, 95 | policy_noise=0.2, 96 | noise_clip=0.5, 97 | policy_delay=2, 98 | ) 99 | 100 | if args.show: 101 | train_tools.evaluate(agent, 10, show=True) 102 | else: 103 | agent.learn() 104 | -------------------------------------------------------------------------------- /run/td3_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.td3 import TD3_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='TD3 algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='Hopper-v2', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=1000000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=100, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--explore_step', type=int, default=10000, 24 | help='the steps of exploration before train') 25 | parser.add_argument('--eval_freq', type=int, default=5000, 26 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 27 | parser.add_argument('--max_train_step', type=int, default=1000000, 28 | help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='td3_mujoco_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | 
help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--seed', type=int, default=10, 40 | help='the random seed') 41 | 42 | args = parser.parse_args() 43 | 44 | torch.manual_seed(args.seed) 45 | np.random.seed(args.seed) 46 | 47 | # create environment 48 | env = gym.make(args.env) 49 | env.seed(args.seed) 50 | env.action_space.seed(args.seed) 51 | train_tools.EVAL_SEED = args.seed 52 | 53 | obs_dim = env.observation_space.shape[0] 54 | act_dim = env.action_space.shape[0] 55 | act_bound = env.action_space.high[0] 56 | 57 | # create nets 58 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 59 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 60 | 61 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 62 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 63 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 64 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 65 | 66 | # create buffer 67 | if args.show: 68 | replay_buffer = None 69 | else: 70 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 71 | act_dim=act_dim, 72 | capacity=args.capacity, 73 | batch_size=args.batch_size) 74 | 75 | agent = TD3_Agent( 76 | # parameters of PolicyBase 77 | env=env, 78 | gamma=0.99, 79 | eval_freq=args.eval_freq, 80 | max_train_step=args.max_train_step, 81 | train_id=args.train_id, 82 | log_interval=args.log_interval, 83 | resume=args.resume, 84 | device=args.device, 85 | 86 | # Parameters of OffPolicyBase 87 | replay_buffer=replay_buffer, 88 | explore_step=args.explore_step, 89 | 90 | # Parameters of TD3_Agent 91 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 92 | actor_lr=1e-3, critic_lr=1e-3, # Or 3e-4 93 | tau=0.005, 94 | act_noise=0.1, 95 | policy_noise=0.2, 96 | noise_clip=0.5, 97 | policy_delay=2, 98 | ) 99 | 100 | if args.show: 101 | train_tools.evaluate(agent, 10, show=True) 102 | else: 103 | agent.learn() 104 | -------------------------------------------------------------------------------- /run/td3_unity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from algos.td3 import TD3_Agent 10 | from common.buffers import ReplayBuffer 11 | from common.networks import MLPQsaNet, DDPGMLPActor 12 | from utils import train_tools 13 | from mlagents_envs.environment import UnityEnvironment 14 | from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel 15 | from gym_unity.envs import UnityToGymWrapper 16 | 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='TD3 algorithm in gym environment') 20 | parser.add_argument('--env', type=str, default=None, 21 | help='the path of unity environment') 22 | parser.add_argument('--capacity', type=int, default=50000, 23 | help='the max size of data buffer') 24 | parser.add_argument('--batch_size', type=int, default=100, 25 | help='the size of batch that sampled from buffer') 26 | parser.add_argument('--explore_step', type=int, default=2000, 27 | help='the steps of exploration before train') 28 | # parser.add_argument('--eval_freq', type=int, default=1000, 29 | # 
help='how often (time steps) we evaluate') 30 | parser.add_argument('--max_train_step', type=int, default=100000, 31 | help='the max train step') 32 | parser.add_argument('--log_interval', type=int, default=1000, 33 | help='The number of steps taken to record the model and the tensorboard') 34 | parser.add_argument('--train_id', type=str, default='td3_unity_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--resume', action='store_true', default=False, 37 | help='whether load the last saved model to train') 38 | parser.add_argument('--device', type=str, default='cpu', 39 | help='Choose cpu or cuda') 40 | parser.add_argument('--show', action='store_true', default=False, 41 | help='show the trained model visually') 42 | parser.add_argument('--seed', type=int, default=10, 43 | help='the random seed') 44 | 45 | args = parser.parse_args() 46 | 47 | torch.manual_seed(args.seed) 48 | np.random.seed(args.seed) 49 | 50 | engine_configuration_channel = EngineConfigurationChannel() 51 | unity_env = UnityEnvironment(side_channels=[engine_configuration_channel], file_name=args.env) 52 | engine_configuration_channel.set_configuration_parameters( 53 | width=200, 54 | height=200, 55 | quality_level=5, 56 | time_scale=1 if args.show else 20, 57 | target_frame_rate=-1, 58 | capture_frame_rate=60) 59 | 60 | env = UnityToGymWrapper(unity_env=unity_env) 61 | env.seed(args.seed) 62 | env.action_space.seed(args.seed) 63 | train_tools.EVAL_SEED = args.seed 64 | 65 | obs_dim = env.observation_space.shape[0] 66 | act_dim = env.action_space.shape[0] 67 | act_bound = env.action_space.high[0] 68 | 69 | # create nets 70 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 71 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 72 | 73 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 74 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 75 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 76 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 77 | 78 | # create buffer 79 | if args.show: 80 | replay_buffer = None 81 | else: 82 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 83 | act_dim=act_dim, 84 | capacity=args.capacity, 85 | batch_size=args.batch_size) 86 | 87 | agent = TD3_Agent( 88 | # parameters of PolicyBase 89 | env=env, 90 | gamma=0.99, 91 | eval_freq=-1,  # '--eval_freq' is commented out in the arg parser above; a value < 0 disables evaluation during training 92 | max_train_step=args.max_train_step, 93 | train_id=args.train_id, 94 | log_interval=args.log_interval, 95 | resume=args.resume, 96 | device=args.device, 97 | 98 | # Parameters of OffPolicyBase 99 | replay_buffer=replay_buffer, 100 | explore_step=args.explore_step, 101 | 102 | # Parameters of TD3_Agent 103 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 104 | actor_lr=1e-3, critic_lr=1e-3, # Or 3e-4 105 | tau=0.005, 106 | act_noise=0.1, 107 | policy_noise=0.2, 108 | noise_clip=0.5, 109 | policy_delay=2, 110 | ) 111 | 112 | if args.show: 113 | train_tools.evaluate_unity(agent, 10) 114 | else: 115 | agent.learn() 116 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/utils/__init__.py -------------------------------------------------------------------------------- /utils/atari_preprocess.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.wrappers
import atari_preprocessing 3 | from gym.wrappers import FrameStack 4 | import numpy as np 5 | 6 | 7 | def make_env(env_id, # 环境id 8 | noop_max=30, # 最大的no-op操作步数 9 | frame_skip=4, # 跳帧步数 10 | screen_size=84, # 帧的尺寸 11 | terminal_on_life_loss=True, # 是否在一条命没后结束Episode 12 | grayscale_obs=True, # True的话返回灰度图,否则返回RGB彩色图 13 | grayscale_newaxis=False, # 将输出的灰度图由2维转换为1维 14 | scale_obs=True, # 是否对obs标准化到[0,1] 15 | num_stack=4, # 叠加帧的步数 16 | lz4_compress=False, # 是否使用lz4压缩 17 | obs_LazyFramesToNumpy=True, # 是否将输出的图像由LazyFrames转化为numpy 18 | ): 19 | 20 | assert gym.envs.registry.spec(env_id).entry_point == 'gym.envs.atari:AtariEnv', "env is not Atari" 21 | 22 | env = gym.make(env_id) 23 | env = atari_preprocessing.AtariPreprocessing(env=env, 24 | noop_max=noop_max, 25 | frame_skip=frame_skip, 26 | screen_size=screen_size, 27 | terminal_on_life_loss=terminal_on_life_loss, 28 | grayscale_obs=grayscale_obs, 29 | grayscale_newaxis=grayscale_newaxis, 30 | scale_obs=scale_obs) 31 | env = FrameStack(env, num_stack=num_stack, lz4_compress=lz4_compress) 32 | if obs_LazyFramesToNumpy: 33 | env = ObsLazyFramesToNumpy(env) 34 | return env 35 | 36 | 37 | class ObsLazyFramesToNumpy(gym.Wrapper): 38 | def __init__(self, env): 39 | super(ObsLazyFramesToNumpy, self).__init__(env) 40 | 41 | def reset(self, **kwargs): 42 | obs = self.env.reset() 43 | return np.array(obs) 44 | 45 | def step(self, action): 46 | next_obs, reward, done, info = self.env.step(action) 47 | return np.array(next_obs), reward, done, info 48 | 49 | -------------------------------------------------------------------------------- /utils/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from OpenAI Baselines 3 | https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 4 | """ 5 | import numpy as np 6 | from collections import deque 7 | import gym 8 | from gym import spaces 9 | import cv2 10 | cv2.ocl.setUseOpenCL(False) 11 | 12 | 13 | def make_atari_env(env_id, episodic_life=True, scale_obs=False, clip_rewards=True, stack_frames=True): 14 | env = gym.make(env_id) 15 | assert 'NoFrameskip' in env.spec.id 16 | env = NoopResetEnv(env, noop_max=30) 17 | env = MaxAndSkipEnv(env, skip=4) 18 | if episodic_life: 19 | env = EpisodicLifeEnv(env) 20 | if 'FIRE' in env.unwrapped.get_action_meanings(): 21 | env = FireResetEnv(env) 22 | env = WarpFrame(env) 23 | if clip_rewards: 24 | env = ClipRewardEnv(env) 25 | if stack_frames: 26 | env = FrameStack(env, 4) 27 | env = PyTorchStyleFrames(env) 28 | if scale_obs: 29 | env = ScaledFloatFrame(env) 30 | return env 31 | 32 | 33 | class NoopResetEnv(gym.Wrapper): 34 | def __init__(self, env, noop_max=30): 35 | """Sample initial states by taking random number of no-ops on reset. 36 | No-op is assumed to be action 0. 
37 | """ 38 | gym.Wrapper.__init__(self, env) 39 | self.noop_max = noop_max 40 | self.override_num_noops = None 41 | self.noop_action = 0 42 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 43 | 44 | def reset(self, **kwargs): 45 | """ Do no-op action for a number of steps in [1, noop_max].""" 46 | self.env.reset(**kwargs) 47 | if self.override_num_noops is not None: 48 | noops = self.override_num_noops 49 | else: 50 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 51 | assert noops > 0 52 | obs = None 53 | for _ in range(noops): 54 | obs, _, done, _ = self.env.step(self.noop_action) 55 | if done: 56 | obs = self.env.reset(**kwargs) 57 | return obs 58 | 59 | def step(self, ac): 60 | return self.env.step(ac) 61 | 62 | 63 | class FireResetEnv(gym.Wrapper): 64 | def __init__(self, env): 65 | """Take action on reset for environments that are fixed until firing.""" 66 | gym.Wrapper.__init__(self, env) 67 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 68 | assert len(env.unwrapped.get_action_meanings()) >= 3 69 | 70 | def reset(self, **kwargs): 71 | self.env.reset(**kwargs) 72 | obs, _, done, _ = self.env.step(1) 73 | if done: 74 | self.env.reset(**kwargs) 75 | obs, _, done, _ = self.env.step(2) 76 | if done: 77 | self.env.reset(**kwargs) 78 | return obs 79 | 80 | def step(self, ac): 81 | return self.env.step(ac) 82 | 83 | 84 | class EpisodicLifeEnv(gym.Wrapper): 85 | def __init__(self, env): 86 | """Make end-of-life == end-of-episode, but only reset on true game over. 87 | Done by DeepMind for the DQN and co. since it helps value estimation. 88 | """ 89 | gym.Wrapper.__init__(self, env) 90 | self.lives = 0 91 | self.was_real_done = True 92 | 93 | def step(self, action): 94 | obs, reward, done, info = self.env.step(action) 95 | self.was_real_done = done 96 | # check current lives, make loss of life terminal, 97 | # then update lives to handle bonus lives 98 | lives = self.env.unwrapped.ale.lives() 99 | if lives < self.lives and lives > 0: 100 | # for Qbert sometimes we stay in lives == 0 condition for a few frames 101 | # so it's important to keep lives > 0, so that we only reset once 102 | # the environment advertises done. 103 | done = True 104 | self.lives = lives 105 | return obs, reward, done, info 106 | 107 | def reset(self, **kwargs): 108 | """Reset only when lives are exhausted. 109 | This way all states are still reachable even though lives are episodic, 110 | and the learner need not know about any of this behind-the-scenes. 
111 | """ 112 | if self.was_real_done: 113 | obs = self.env.reset(**kwargs) 114 | else: 115 | # no-op step to advance from terminal/lost life state 116 | obs, _, _, _ = self.env.step(0) 117 | self.lives = self.env.unwrapped.ale.lives() 118 | return obs 119 | 120 | 121 | class MaxAndSkipEnv(gym.Wrapper): 122 | def __init__(self, env, skip=4): 123 | """Return only every `skip`-th frame""" 124 | gym.Wrapper.__init__(self, env) 125 | # most recent raw observations (for max pooling across time steps) 126 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 127 | self._skip = skip 128 | 129 | def step(self, action): 130 | """Repeat action, sum reward, and max over last observations.""" 131 | total_reward = 0.0 132 | done = None 133 | for i in range(self._skip): 134 | obs, reward, done, info = self.env.step(action) 135 | if i == self._skip - 2: self._obs_buffer[0] = obs 136 | if i == self._skip - 1: self._obs_buffer[1] = obs 137 | total_reward += reward 138 | if done: 139 | break 140 | # Note that the observation on the done=True frame 141 | # doesn't matter 142 | max_frame = self._obs_buffer.max(axis=0) 143 | 144 | return max_frame, total_reward, done, info 145 | 146 | def reset(self, **kwargs): 147 | return self.env.reset(**kwargs) 148 | 149 | 150 | class ClipRewardEnv(gym.RewardWrapper): 151 | def __init__(self, env): 152 | gym.RewardWrapper.__init__(self, env) 153 | 154 | def reward(self, reward): 155 | """Bin reward to {+1, 0, -1} by its sign.""" 156 | return np.sign(reward) 157 | 158 | 159 | class WarpFrame(gym.ObservationWrapper): 160 | def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None): 161 | """ 162 | Warp frames to 84x84 as done in the Nature paper and later work. 163 | If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which 164 | observation should be warped. 165 | """ 166 | super().__init__(env) 167 | self._width = width 168 | self._height = height 169 | self._grayscale = grayscale 170 | self._key = dict_space_key 171 | if self._grayscale: 172 | num_colors = 1 173 | else: 174 | num_colors = 3 175 | 176 | new_space = gym.spaces.Box( 177 | low=0, 178 | high=255, 179 | shape=(self._height, self._width, num_colors), 180 | dtype=np.uint8, 181 | ) 182 | if self._key is None: 183 | original_space = self.observation_space 184 | self.observation_space = new_space 185 | else: 186 | original_space = self.observation_space.spaces[self._key] 187 | self.observation_space.spaces[self._key] = new_space 188 | assert original_space.dtype == np.uint8 and len(original_space.shape) == 3 189 | 190 | def observation(self, obs): 191 | if self._key is None: 192 | frame = obs 193 | else: 194 | frame = obs[self._key] 195 | 196 | if self._grayscale: 197 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 198 | frame = cv2.resize( 199 | frame, (self._width, self._height), interpolation=cv2.INTER_AREA 200 | ) 201 | if self._grayscale: 202 | frame = np.expand_dims(frame, -1) 203 | 204 | if self._key is None: 205 | obs = frame 206 | else: 207 | obs = obs.copy() 208 | obs[self._key] = frame 209 | return obs 210 | 211 | 212 | class FrameStack(gym.Wrapper): 213 | def __init__(self, env, k): 214 | """Stack k last frames. 215 | Returns lazy array, which is much more memory efficient. 
216 | See Also 217 | -------- 218 | baselines.common.atari_wrappers.LazyFrames 219 | """ 220 | gym.Wrapper.__init__(self, env) 221 | self.k = k 222 | self.frames = deque([], maxlen=k) 223 | shp = env.observation_space.shape 224 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) 225 | 226 | def reset(self): 227 | ob = self.env.reset() 228 | for _ in range(self.k): 229 | self.frames.append(ob) 230 | return self._get_ob() 231 | 232 | def step(self, action): 233 | ob, reward, done, info = self.env.step(action) 234 | self.frames.append(ob) 235 | return self._get_ob(), reward, done, info 236 | 237 | def _get_ob(self): 238 | assert len(self.frames) == self.k 239 | return LazyFrames(list(self.frames)) 240 | 241 | 242 | class ScaledFloatFrame(gym.ObservationWrapper): 243 | def __init__(self, env): 244 | gym.ObservationWrapper.__init__(self, env) 245 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 246 | 247 | def observation(self, observation): 248 | # careful! This undoes the memory optimization, use 249 | # with smaller replay buffers only. 250 | return np.array(observation).astype(np.float32) / 255.0 251 | 252 | 253 | class LazyFrames(object): 254 | def __init__(self, frames): 255 | """This object ensures that common frames between the observations are only stored once. 256 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 257 | buffers. 258 | This object should only be converted to numpy array before being passed to the model. 259 | You'd not believe how complex the previous solution was.""" 260 | self._frames = frames 261 | self._out = None 262 | 263 | def _force(self): 264 | if self._out is None: 265 | self._out = np.concatenate(self._frames, axis=-1) 266 | self._frames = None 267 | return self._out 268 | 269 | def __array__(self, dtype=None): 270 | out = self._force() 271 | if dtype is not None: 272 | out = out.astype(dtype) 273 | return out 274 | 275 | def __len__(self): 276 | return len(self._force()) 277 | 278 | def __getitem__(self, i): 279 | return self._force()[i] 280 | 281 | def count(self): 282 | frames = self._force() 283 | return frames.shape[frames.ndim - 1] 284 | 285 | def frame(self, i): 286 | return self._force()[..., i] 287 | 288 | 289 | class PyTorchStyleFrames(gym.Wrapper): 290 | """ 291 | 1.Change "LazyFrames" obs to "Numpy" obs 292 | 2.Change Image shape from "height x width x channels" to "channels x height x width" 293 | """ 294 | def __init__(self, env): 295 | super(PyTorchStyleFrames, self).__init__(env) 296 | shp = env.observation_space.shape 297 | self.observation_space = spaces.Box(low=0, high=255, 298 | shape=((shp[-1], ) + shp[:-1]), dtype=env.observation_space.dtype) 299 | 300 | def reset(self, **kwargs): 301 | obs = np.array(self.env.reset()).transpose((2, 0, 1)) 302 | return obs 303 | 304 | def step(self, action): 305 | next_obs, reward, done, info = self.env.step(action) 306 | next_obs = np.array(next_obs).transpose((2, 0, 1)) 307 | return next_obs, reward, done, info 308 | 309 | 310 | if __name__ == '__main__': 311 | env = make_atari_env("PongNoFrameskip-v4") 312 | print(env.observation_space) 313 | env.reset() 314 | obs, _, _, _ = env.step(env.action_space.sample()) 315 | print(obs) 316 | print(np.array(obs).shape) 317 | # from matplotlib import pyplot as plt 318 | # plt.imshow(obs[0], cmap='gray', interpolation='bicubic') 319 | # plt.show() 320 | 
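A minimal usage sketch, not part of the repository files above: with the default arguments, make_atari_env returns uint8 observations of shape (4, 84, 84), which are typically cast to float, given a batch dimension and scaled to [0, 1] before being fed to a convolutional network. The q_net mentioned in the final comment is a hypothetical placeholder, not a class defined in this repo.

from utils.atari_wrappers import make_atari_env
import numpy as np
import torch

env = make_atari_env("PongNoFrameskip-v4")   # yields (4, 84, 84) uint8 observations
obs = env.reset()
# Cast to float, add a batch dimension and scale to [0, 1] before the forward pass.
obs_tensor = torch.as_tensor(np.array(obs), dtype=torch.float32).unsqueeze(0) / 255.0
assert obs_tensor.shape == (1, 4, 84, 84)
# obs_tensor can now be passed to any CNN taking a (N, 4, 84, 84) float input,
# e.g. q_values = q_net(obs_tensor) for a hypothetical Q-network q_net.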
-------------------------------------------------------------------------------- /utils/data_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | try: 3 | import d4rl 4 | except ImportError: 5 | print('No module named "d4rl"; you can install it from https://github.com/rail-berkeley/d4rl') 6 | 7 | try: 8 | import d4rl_atari 9 | except ImportError: 10 | print('No module named "d4rl_atari"; you can install it from https://github.com/takuseno/d4rl-atari') 11 | 12 | 13 | def get_d4rl_dataset(env, get_num=None) -> dict: 14 | """ 15 | d4rl dataset: https://github.com/rail-berkeley/d4rl 16 | install: pip install git+https://github.com/rail-berkeley/d4rl@master#egg=d4rl 17 | :param get_num: how many transitions to sample from the dataset (None means use the whole dataset) 18 | """ 19 | dataset = d4rl.qlearning_dataset(env) 20 | if get_num is None: 21 | data = dict( 22 | obs=dataset['observations'], 23 | acts=dataset['actions'], 24 | rews=dataset['rewards'], 25 | next_obs=dataset['next_observations'], 26 | done=dataset['terminals'] 27 | ) 28 | else: 29 | data_num = dataset['actions'].shape[0] 30 | ind = np.random.choice(data_num, size=get_num, replace=False) 31 | data = dict( 32 | obs=dataset['observations'][ind], 33 | acts=dataset['actions'][ind], 34 | rews=dataset['rewards'][ind], 35 | next_obs=dataset['next_observations'][ind], 36 | done=dataset['terminals'][ind] 37 | ) 38 | 39 | return data 40 | 41 | 42 | def get_d4rl_dataset_atari(env) -> dict: 43 | """ 44 | d4rl atari dataset: https://github.com/takuseno/d4rl-atari 45 | install: pip install git+https://github.com/takuseno/d4rl-atari 46 | """ 47 | dataset = env.get_dataset() 48 | data = dict( 49 | obs=dataset['observations'], 50 | acts=dataset['actions'], 51 | rews=dataset['rewards'], 52 | done=dataset['terminals'] 53 | ) 54 | 55 | return data 56 | -------------------------------------------------------------------------------- /utils/eval_plot.py: -------------------------------------------------------------------------------- 1 | from tensorboard.backend.event_processing import event_accumulator 2 | import pandas as pd 3 | import os 4 | import seaborn as sns 5 | from matplotlib import pyplot as plt 6 | 7 | 8 | def smooth(df, column, weight=0.6):  9 | """ 10 | Smooth a column of data in the DataFrame (exponential moving average) 11 | """ 12 | scalars = df[column].to_numpy() 13 | last = scalars[0] 14 | smoothed_scalars = [] 15 | for scalar in scalars: 16 | smoothed_scalar = last * weight + (1 - weight) * scalar # calculate the smoothed value 17 | smoothed_scalars.append(smoothed_scalar) 18 | last = smoothed_scalar 19 | df[column] = smoothed_scalars 20 | 21 | 22 | def get_pd(tensorboard_path, tag='evaluate_data/eval_episode_reward'): 23 | """ 24 | Get a pandas DataFrame from one tensorboard file 25 | """ 26 | event_data = event_accumulator.EventAccumulator(tensorboard_path) # a python interface for loading Event data 27 | event_data.Reload() 28 | scalars = event_data.scalars.Items(tag) 29 | df = pd.DataFrame(scalars)[['step', 'value']] 30 | return df 31 | 32 | 33 | def get_pd_from_parent_path(parents_path, tag='evaluate_data/eval_episode_reward'): 34 | """ 35 | Get a pandas DataFrame from tensorboard files under a common parent path 36 | """ 37 | child_paths = os.listdir(parents_path) 38 | df = pd.DataFrame(columns=['step', 'value']) 39 | for child_path in child_paths: 40 | tens_path = os.path.join(parents_path, child_path) 41 | if os.path.isdir(tens_path): 42 | event_data = event_accumulator.EventAccumulator(tens_path) # a python interface for loading Event data 43 | event_data.Reload() 44 |
scalars = event_data.scalars.Items(tag) 45 | df = df.append(pd.DataFrame(scalars)[['step', 'value']], ignore_index=True) 46 | return df 47 | 48 | 49 | def is_parent_path(parent_path): 50 | child_paths = os.listdir(parent_path) 51 | for child_path in child_paths: 52 | tens_path = os.path.join(parent_path, child_path) 53 | if os.path.isdir(tens_path): 54 | return True 55 | return False 56 | 57 | 58 | def plot_from_paths(path_list, label_list, tag='evaluate_data/eval_episode_reward', smooth_weight=0.6): 59 | """ 60 | Plot tensorboard file from paths from path_list and with label from label_list on one figure 61 | """ 62 | for i in range(len(path_list)): 63 | if is_parent_path(path_list[i]): 64 | df_temp = get_pd_from_parent_path(path_list[i], tag=tag) 65 | else: 66 | df_temp = get_pd(path_list[i], tag=tag) 67 | if smooth_weight > 0: 68 | smooth(df_temp, "value", weight=smooth_weight) 69 | sns.lineplot(x="step", y="value", data=df_temp, label=label_list[i]) 70 | else: 71 | sns.lineplot(x="step", y="value", data=df_temp, label=label_list[i]) 72 | plt.legend(loc="upper left") 73 | plt.xlabel("time step", fontsize=13) 74 | plt.ylabel("average reward", fontsize=13) 75 | plt.show() 76 | 77 | 78 | if __name__ == '__main__': 79 | path_list = ["E:/PycharmProjects/RL_Algorithms/run/results/bcq/Hopper-v0/medium-expert", 80 | "E:/PycharmProjects/RL_Algorithms/run/results/bear/Hopper-v0/medium-expert", 81 | "E:/PycharmProjects/RL_Algorithms/run/results/cql/Hopper-v0/medium-expert", 82 | ] 83 | 84 | label_list = ["BCQ", 85 | "BEAR", 86 | "CQL", 87 | ] 88 | 89 | plot_from_paths(path_list, label_list, smooth_weight=0.7) 90 | -------------------------------------------------------------------------------- /utils/log_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.tensorboard import SummaryWriter 3 | 4 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | 6 | 7 | def make_dir(path): 8 | if not os.path.exists(path): 9 | os.makedirs(path) 10 | return path 11 | 12 | 13 | def del_all_files_in_dir(path): 14 | ls = os.listdir(path) 15 | for file in ls: 16 | os.remove(os.path.join(path, file)) 17 | 18 | 19 | class TensorboardLogger: 20 | def __init__(self, log_dir): 21 | self.log_dir = log_dir 22 | self.writer = SummaryWriter(log_dir) 23 | 24 | def log_train_data(self, log_datas: dict, step): 25 | for log_data in log_datas.items(): 26 | self.writer.add_scalar("train_data/" + log_data[0], log_data[1], step) 27 | self.writer.flush() 28 | 29 | def log_learn_data(self, log_datas: dict, step): 30 | for log_data in log_datas.items(): 31 | self.writer.add_scalar("learn_data/" + log_data[0], log_data[1], step) 32 | self.writer.flush() 33 | 34 | def log_eval_data(self, log_datas: dict, step): 35 | for log_data in log_datas.items(): 36 | self.writer.add_scalar("evaluate_data/" + log_data[0], log_data[1], step) 37 | self.writer.flush() 38 | -------------------------------------------------------------------------------- /utils/train_tools.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import Env 3 | from tqdm import tqdm 4 | from common.buffers import ReplayBuffer 5 | import numpy as np 6 | import copy 7 | 8 | EVAL_SEED = 10 # used for evaluation env's seed 9 | 10 | 11 | def hard_target_update(main, target): 12 | target.load_state_dict(main.state_dict()) 13 | 14 | 15 | def soft_target_update(main, target, tau=0.005): 16 | for main_param, target_param in 
zip(main.parameters(), target.parameters()): 17 | target_param.data.copy_(tau * main_param.data + (1.0 - tau) * target_param.data) 18 | 19 | 20 | def explore_before_train(env: Env, buffer, explore_step): 21 | obs = env.reset() 22 | done = False 23 | t = tqdm(range(explore_step)) 24 | t.set_description("explore before train") 25 | for _ in t: 26 | action = env.action_space.sample() 27 | next_obs, reward, done, info = env.step(action) 28 | buffer.add(obs, action, reward, next_obs, done) 29 | 30 | if done: 31 | obs = env.reset() 32 | done = False 33 | else: 34 | obs = next_obs 35 | 36 | 37 | def evaluate(agent, episode_num, seed_offset=100, show=False): 38 | if show: 39 | agent.load_agent_checkpoint() 40 | eval_env = copy.deepcopy(agent.env) 41 | eval_env.seed(EVAL_SEED + seed_offset) # reset environment's seed for evaluate(the seed will not be copied by deepcopy) 42 | total_reward = 0 43 | total_length = 0 44 | print("---------------------------------- evaluating at time step {} ----------------------------------".format(agent.train_step)) 45 | for i in range(episode_num): 46 | episode_reward = 0 47 | episode_length = 0 48 | obs, done = eval_env.reset(), False 49 | while not done: 50 | if show: 51 | eval_env.render() 52 | action = agent.choose_action(obs, eval=True) 53 | action = action[0] if isinstance(action, tuple) else action 54 | obs, reward, done, _ = eval_env.step(action) 55 | episode_reward += reward 56 | episode_length += 1 57 | if done: 58 | total_reward += episode_reward 59 | total_length += episode_length 60 | if show: 61 | print("episode:{} \t step length: {} \t reward: {:.2f}".format(i + 1, episode_length, episode_reward)) 62 | 63 | avg_reward = total_reward / episode_num 64 | avg_length = total_length / episode_num 65 | 66 | print("=====> evaluate {} episode <===> average step length: {:.2f} <===> average reward: {:.2f} <=====".format(episode_num, avg_length, avg_reward)) 67 | print("---------------------------------------------------------------------------------------------------") 68 | 69 | evaluate_summaries = {"eval_episode_length": avg_length, "eval_episode_reward": avg_reward} 70 | return evaluate_summaries 71 | 72 | 73 | def evaluate_unity(agent, episode_num): 74 | agent.load_agent_checkpoint() 75 | eval_env = agent.env 76 | total_reward = 0 77 | total_length = 0 78 | print("---------------------------------- evaluating at time step {} ----------------------------------".format(agent.train_step)) 79 | for i in range(episode_num): 80 | episode_reward = 0 81 | episode_length = 0 82 | obs, done = eval_env.reset(), False 83 | while not done: 84 | action = agent.choose_action(obs, eval=True) 85 | action = action[0] if isinstance(action, tuple) else action 86 | obs, reward, done, _ = eval_env.step(action) 87 | episode_reward += reward 88 | episode_length += 1 89 | if done: 90 | total_reward += episode_reward 91 | total_length += episode_length 92 | print("episode:{} \t step length: {} \t reward: {:.2f}".format(i + 1, episode_length, episode_reward)) 93 | 94 | avg_reward = total_reward / episode_num 95 | avg_length = total_length / episode_num 96 | 97 | print("=====> evaluate {} episode <===> average step length: {:.2f} <===> average reward: {:.2f} <=====".format(episode_num, avg_length, avg_reward)) 98 | print("---------------------------------------------------------------------------------------------------") 99 | 100 | return avg_reward, avg_length 101 | 102 | 103 | class OrnsteinUhlenbeckActionNoise: 104 | """ 105 | used in DDPG. 
OU noise 106 | """ 107 | def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2): 108 | self.action_dim = action_dim 109 | self.mu = mu 110 | self.theta = theta 111 | self.sigma = sigma 112 | self.X = np.ones(self.action_dim) * self.mu 113 | 114 | def reset(self): 115 | self.X = np.ones(self.action_dim) * self.mu 116 | 117 | def sample(self): 118 | dx = self.theta * (self.mu - self.X) 119 | dx = dx + self.sigma * np.random.randn(len(self.X)) 120 | self.X = self.X + dx 121 | return self.X 122 | 123 | 124 | if __name__ == '__main__': 125 | a = OrnsteinUhlenbeckActionNoise(action_dim=3) 126 | print(a.sample()) 127 | --------------------------------------------------------------------------------
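A minimal sketch of how the OU noise above is typically used for DDPG-style exploration; it is not part of the repository files, and the act_bound value and the zero "deterministic action" are hypothetical placeholders standing in for the environment's action bound and the actor network's output.

from utils.train_tools import OrnsteinUhlenbeckActionNoise
import numpy as np

act_dim, act_bound = 3, 1.0                      # hypothetical action dimension and bound
ou_noise = OrnsteinUhlenbeckActionNoise(action_dim=act_dim)

ou_noise.reset()                                 # typically called at the start of each episode
deterministic_action = np.zeros(act_dim)         # stand-in for the actor network's output
# Add temporally correlated exploration noise and keep the action within its bound.
noisy_action = np.clip(deterministic_action + ou_noise.sample(), -act_bound, act_bound)
print(noisy_action)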