├── .gitignore ├── LICENSE ├── README.md ├── algos ├── __init__.py ├── base.py ├── ddpg.py ├── ddqn.py ├── dqn.py ├── offline │ ├── __init__.py │ ├── bcq.py │ ├── bear.py │ ├── cql.py │ ├── plas.py │ ├── sac_offline.py │ └── td3_bc.py ├── ppo.py ├── sac.py └── td3.py ├── common ├── __init__.py ├── buffers.py └── networks.py ├── run ├── bcq_mujoco.py ├── bear_mujoco.py ├── cql_atari.py ├── cql_mujoco.py ├── ddpg_gym.py ├── ddpg_unity.py ├── ddqn_atari.py ├── ddqn_gym.py ├── dqn_atari.py ├── dqn_gym.py ├── plas_mujoco.py ├── ppo_gym.py ├── ppo_mujoco.py ├── sac_gym.py ├── sac_mujoco.py ├── sac_offline_mujoco.py ├── sac_unity.py ├── td3_bc_mujoco.py ├── td3_gym.py ├── td3_mujoco.py └── td3_unity.py └── utils ├── __init__.py ├── atari_preprocess.py ├── atari_wrappers.py ├── data_tools.py ├── eval_plot.py ├── log_tools.py └── train_tools.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | 3 | __pycache__/ 4 | *.pyc 5 | 6 | test/ 7 | 8 | /notes.txt 9 | /run/RunNotes.txt 10 | 11 | /run/results/ 12 | 13 | /algos/experiments -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 dragon-wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL_Algorithms 2 | A lightweight reinforcement learning algorithm library implemented by pytorch 3 | ## Supported algorithms 4 | 5 | ### Online RL 6 | 7 | Interact with the environment during training. 
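The shared interaction loop lives in `OffPolicyBase.learn` (`algos/base.py`). A rough, self-contained sketch of that loop, with a random policy standing in for an agent's `choose_action`:

```python
# Rough shape of the online loop in algos/base.py (OffPolicyBase.learn).
# A random policy stands in for agent.choose_action() so the snippet runs on its own.
import gym

env = gym.make("CartPole-v0")
obs = env.reset()
for step in range(1000):
    action = env.action_space.sample()            # agent.choose_action(obs) in the real loop
    next_obs, reward, done, _ = env.step(action)  # the transition is stored in the replay buffer
    obs = env.reset() if done else next_obs       # ...followed by one agent.train() update
env.close()
```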
8 | 9 | | algorithm | discrete control | continuous control | 10 | | ------------------------------------------------------------ | ---------------- | ------------------ | 11 | | [Deep Q-Network (DQN)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) | ✔ | ⛔ | 12 | | [Double DQN (DDQN)](https://arxiv.org/abs/1509.06461) | ✔ | ⛔ | 13 | | [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | ⛔ | ✔ | 14 | | [Proximal Policy Optimization (PPO)](https://arxiv.org/abs/1707.06347) | ✔ | ✔ | 15 | | [Soft Actor-Critic (SAC)](https://arxiv.org/abs/1812.05905) | ⛔ | ✔ | 16 | | [Twin Delayed Deep Deterministic policy gradient(TD3)](https://arxiv.org/abs/1802.09477) | ⛔ | ✔ | 17 | 18 | ### Offline RL 19 | 20 | Use the existing data set for training, and there is no interaction with the environment during training. 21 | 22 | | algorithm | discrete control | continuous control | 23 | | ------------------------------------------------------------ | ---------------- | ------------------ | 24 | | [Batch-Constrained deep Q-learning (BCQ)](https://arxiv.org/abs/1812.02900) | ⛔ | ✔ | 25 | | [Bootstrapping Error Accumulation Reduction (BEAR)](https://arxiv.org/abs/1906.00949) | ⛔ | ✔ | 26 | | [Policy in the Latent Action Space (PLAS)](https://arxiv.org/abs/2011.07213) | ⛔ | ✔ | 27 | | [Conservative Q-Learning (CQL)](https://arxiv.org/abs/2006.04779) | ✔ | ✔ | 28 | | [TD3 with behavior cloning(TD3-BC)](https://arxiv.org/abs/2106.06860) | ⛔ | ✔ | 29 | 30 | ## To do list 31 | 32 | **Online algorithm:** 33 | 34 | + [Trust Region Policy Optimization(TRPO)](https://proceedings.mlr.press/v37/schulman15.html) 35 | 36 | **Offline algorithm:** 37 | 38 | + [Discrete Batch-Constrained deep Q-Learning (BCQ-Discrete)](https://arxiv.org/abs/1910.01708) 39 | + [Behavior Regularized Actor Critic (BRAC)](https://arxiv.org/abs/1911.11361) 40 | + [Fisher-Behavior Regularized Critic(Fisher-BRC)](https://arxiv.org/abs/2103.08050) 41 | 42 | ## Requirements 43 | 44 | ``` 45 | |Python 3.7 | 46 | |Pytorch 1.7.1 | 47 | |tensorboard 2.7.0 | To view the training curve in real time, 48 | |tqdm 4.62.3 | To show progress bar. 49 | |numpy 1.21.3 | 50 | 51 | |gym 0.19.0 | 52 | |box2d-py 2.3.8 | Include Box2d env, e.g,"BipedalWalker-v2" and "LunarLander-v2". 53 | |atari-py 0.2.6 | Include Atari env, e.g, "Pong", "Breakout" and "SpaceInvaders". 54 | |mujoco-py 2.0.2.8 | Include Mujoco env, e.g, "Hopper-v2", "Ant-v2" and "HalfCheetah-v2". 55 | 56 | |d4rl 1.1 | Only used in Offline RL. Include offline dataset of Mujoco, CARLA and so on. 57 | (Can be installed in "https://github.com/rail-berkeley/d4rl") 58 | |d4rl-atari 0.1 | Only used in Offline RL. Include offline dataset of Atari. 59 | (Can be installed in "https://github.com/takuseno/d4rl-atari") 60 | |mlagents 0.27.0 | To train agents in unity's self built environment. 
61 | (Can be installed in "https://github.com/Unity-Technologies/ml-agents") 62 | ``` 63 | 64 | ## Quick start 65 | 66 | ### To train the agents on the environments 67 | 68 | ```shell 69 | git clone https://github.com/dragon-wang/RL_Algorithms.git 70 | cd RL_Algorithms/run 71 | 72 | # train DQN 73 | python dqn_gym.py --env=CartPole-v0 --train_id=dqn_test 74 | 75 | # train DDPG 76 | python ddpg_gym.py --env=Pendulum-v0 --train_id=ddpg_Pendulum-v0 77 | python ddpg_unity.py --train_id=ddpg_unity_test 78 | 79 | # train PPO 80 | python ppo_gym.py --env=CartPole-v0 --train_id=ppo_CartPole-v0 81 | python ppo_mujoco.py --env=Hopper-v2 --train_id=ppo_Hopper-v2 82 | 83 | # train SAC 84 | python sac_gym.py --env=Pendulum-v0 --train_id=sac_Pendulum-v0 85 | python sac_mujoco.py --env=Hopper-v2 --train_id=sac_Hopper-v2 --max_train_step=2000000 --auto 86 | python sac_unity.py --train_id=sac_unity_test --auto 87 | 88 | # train TD3 89 | python td3_gym.py --env=Pendulum-v0 --train_id=td3_Pendulum-v0 90 | python td3_mujoco.py --env=Hopper-v2 --train_id=td3_Hopper-v2 91 | python td3_unity.py --train_id=td3_unity_test 92 | 93 | # train BCQ 94 | python bcq_mujoco.py --train_id=bcq_hopper-mudium-v2 --env=hopper-medium-v2 --device=cuda 95 | 96 | # train PLAS 97 | python plas_mujoco.py --train_id=plas_hopper-mudium-v2 --env=hopper-medium-v2 --device=cuda 98 | 99 | # train CQL 100 | python cql_mujoco.py --train_id=cql_hopper-mudium-v2 --env=hopper-medium-v2 --auto_alpha --entropy_backup --with_lagrange --lagrange_thresh=10.0 --device=cuda 101 | 102 | # train BEAR 103 | python bear_mujoco.py --env=hopper-medium-v2 --train_id=bear_hopper-mudium-v2 --kernel_type=laplacian --seed=10 --device=cuda 104 | ``` 105 | 106 | Some command line common parameters: 107 | 108 | + `--env`: the name of environment.(`--env=xxx`) 109 | + `--capacity`: the max size of replay buffer.(`--capacity=xxx`) 110 | + `--batch_size`: the size of batch that sampled from buffer.(`--batch_size=xxx`) 111 | + `--explore_step`: the steps of exploration before train.(`--explore_step=xxx`) 112 | + `--eval_freq`: how often (time steps) we evaluate during training, and it will not evaluate if `eval_freq < 0`(but in offline algorithms, we must evaluate during training).(`--eval_freq=xxx`) 113 | + `--max_train_step`: the max train step.(`--max_train_step=xxx`) 114 | + `--log_interval`: the number of steps taken to record the model and the tensorboard.(`--log_interval=xxx`) 115 | + `--train_id`: path to save model and log tensorboard.(`--train_id=xxx`) 116 | + `--resume`: whether load the last saved model to train.(`--resume`) 117 | + `--device`: choose device.(`--device=cpu` or `--device=cuda`) 118 | + `--show`: show the trained model visually.(`--show`) 119 | + `--seed`: the random seed of env or neural network(`--seed=xxx`) 120 | 121 | The specific parameters for each algorithm can be viewed in the "xxx.py" files under the "run" folder. Of course I have also provided some default parameters. 122 | 123 | **Note that your trained model and tensorboard files are stored in the "results/your train_id" folder.** 124 | 125 | ### Use tensorboard to view the training curve 126 | 127 | ``` 128 | cd run 129 | 130 | tensorboard --logdir results 131 | ``` 132 | 133 | You can then view the training curve by typing "http://localhost:6006/" into your browser. 
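### Use the agents programmatically (optional)

The scripts in `run/` essentially build the networks and a replay buffer, construct an `*_Agent`, and call `learn()`. The sketch below does the same for DQN directly in Python. The `DQN_Agent` keyword arguments are taken from `algos/dqn.py` and `algos/base.py`; `TinyQNet` and `TinyReplayBuffer` are stand-ins written for this example that only mirror the interface the agents expect (the library's own network and buffer classes live in `common/networks.py` and `common/buffers.py`, whose constructors are not shown here). Run it from the repository root so the `algos` package is importable.

```python
# Minimal programmatic DQN training run (a sketch; the stand-in classes below are
# NOT part of this repository -- they only mimic the interfaces used in algos/dqn.py).
import random
import gym
import numpy as np
import torch
import torch.nn as nn

from algos.dqn import DQN_Agent


class TinyQNet(nn.Module):
    """Stand-in Q-network: maps an observation to one Q-value per discrete action."""
    def __init__(self, obs_dim, act_num):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, act_num))

    def forward(self, obs):
        return self.mlp(obs)


class TinyReplayBuffer:
    """Stand-in buffer exposing the add()/sample() interface the agents use."""
    def __init__(self, capacity=10000, batch_size=64):
        self.capacity, self.batch_size, self.data = capacity, batch_size, []

    def add(self, obs, act, rew, next_obs, done):
        if len(self.data) >= self.capacity:
            self.data.pop(0)
        self.data.append((np.array(obs), act, rew, np.array(next_obs), float(done)))

    def sample(self):
        batch = random.sample(self.data, min(self.batch_size, len(self.data)))
        obs, acts, rews, next_obs, done = zip(*batch)
        return {"obs": torch.FloatTensor(np.array(obs)),
                "acts": torch.LongTensor(acts).unsqueeze(1),   # shape (B, 1) for gather()
                "rews": torch.FloatTensor(rews),
                "next_obs": torch.FloatTensor(np.array(next_obs)),
                "done": torch.FloatTensor(done)}


env = gym.make("CartPole-v0")
agent = DQN_Agent(Q_net=TinyQNet(env.observation_space.shape[0], env.action_space.n),
                  replay_buffer=TinyReplayBuffer(),
                  explore_step=500,
                  env=env, gamma=0.99, eval_freq=1000, max_train_step=20000,
                  train_id="dqn_programmatic_sketch", log_interval=1000,
                  resume=False, device="cpu")
agent.learn()
```

As with the command-line scripts, checkpoints and TensorBoard logs end up in `run/results/<train_id>` (see `PolicyBase` in `algos/base.py`).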
134 | 135 | ## Continue to train from last checkpoint 136 | 137 | You just need to add `--resume` after your command line, such as: 138 | 139 | ```shell 140 | python sac_mujoco.py --env=Hopper-v2 --train_id=sac_Hopper-v2 --max_train_step=2000000 --auto --resume 141 | ``` 142 | 143 | **Note that the "train_id" must be the same as your last training id.** 144 | 145 | ## Show trained agent 146 | 147 | You can view the display of the trained agent via `--show`, such as: 148 | 149 | ```shell 150 | python sac_mujoco.py --env=Hopper-v2 --train_id=sac_Hopper-v2 --show 151 | ``` 152 | 153 | **Note that the "train_id" must be the same as the id of the agent you want to see.** 154 | -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/algos/__init__.py -------------------------------------------------------------------------------- /algos/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC, ABCMeta 2 | import torch 3 | from utils import log_tools 4 | from utils.train_tools import explore_before_train, evaluate 5 | import numpy as np 6 | import os 7 | 8 | class PolicyBase(ABC): 9 | def __init__(self, 10 | env, # RL environment object 11 | gamma, # The decay factor 12 | eval_freq, # How often (time steps) the policy is evaluated. it will not evaluate the agent during train if eval_freq < 0. 13 | max_train_step, # The max train step 14 | train_id, # The name and path to save model and log tensorboard 15 | log_interval, # The number of steps taken to record the model and the tensorboard 16 | resume, # Whether load the last saved model and continue to train 17 | device, # The device. Choose cpu or cuda 18 | ): 19 | self.env = env 20 | self.gamma = gamma 21 | self.eval_freq = eval_freq 22 | self.max_train_step = max_train_step 23 | self.train_id = train_id 24 | self.log_interval = log_interval 25 | self.resume = resume 26 | self.device = torch.device(device) 27 | 28 | self.train_step = 0 29 | 30 | self.result_dir = os.path.join(log_tools.ROOT_DIR, "run/results", self.train_id) 31 | self.checkpoint_path = os.path.join(self.result_dir, "checkpoint.pth") 32 | 33 | @abstractmethod 34 | def choose_action(self, obs, eval=False): 35 | """Select an action according to the observation 36 | 37 | Args: 38 | obs (_type_): The observation 39 | eval (bool): Whether used in evaluation 40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | def train(self): 45 | """The main body of rl algorithm 46 | """ 47 | pass 48 | 49 | @abstractmethod 50 | def learn(self): 51 | """The main loop of training process 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def store_agent_checkpoint(self): 57 | """Save training data. (e.g. neural network parameters, optimizer parameters, training steps, ...) 
58 | """ 59 | pass 60 | 61 | @abstractmethod 62 | def load_agent_checkpoint(self): 63 | """Load training data 64 | """ 65 | pass 66 | 67 | 68 | class OffPolicyBase(PolicyBase): 69 | def __init__(self, 70 | replay_buffer, # The replay buffer 71 | explore_step, # Steps to explore the environment before training 72 | **kwargs # The parameters of the parent class 73 | ): 74 | super().__init__(**kwargs) 75 | 76 | self.replay_buffer = replay_buffer 77 | self.explore_step = explore_step 78 | 79 | self.episode_num = 0 80 | 81 | def choose_action(self, obs, eval=False): 82 | raise NotImplementedError 83 | 84 | def train(self): 85 | raise NotImplementedError 86 | 87 | def learn(self): 88 | # Make the directory to save the training results that consist of checkpoint files and tensorboard files 89 | log_tools.make_dir(self.result_dir) 90 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 91 | 92 | if self.resume: 93 | self.load_agent_checkpoint() 94 | else: 95 | # delete tensorboard log file 96 | log_tools.del_all_files_in_dir(self.result_dir) 97 | 98 | explore_before_train(self.env, self.replay_buffer, self.explore_step) 99 | print("==============================start train===================================") 100 | obs = self.env.reset() 101 | 102 | episode_reward = 0 103 | episode_length = 0 104 | 105 | # The main loop of "choose action -> act action -> add buffer -> train policy" 106 | while self.train_step < self.max_train_step: 107 | action = self.choose_action(np.array(obs), eval=False) 108 | next_obs, reward, done, _ = self.env.step(action) 109 | episode_reward += reward 110 | self.replay_buffer.add(obs, action, reward, next_obs, done) 111 | obs = next_obs 112 | episode_length += 1 113 | 114 | train_summaries = self.train() 115 | 116 | if done: 117 | self.episode_num += 1 118 | obs = self.env.reset() 119 | 120 | print(f"Time Step: {self.train_step} Episode Num: {self.episode_num}" 121 | f"Episode Length: {episode_length} Episode Reward: {episode_reward:.2f}") 122 | tensorboard_writer.log_learn_data({"episode_length": episode_length, 123 | "episode_reward": episode_reward}, self.train_step) 124 | episode_reward = 0 125 | episode_length = 0 126 | 127 | if self.train_step % self.log_interval == 0: 128 | self.store_agent_checkpoint() 129 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 130 | 131 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 132 | evaluate_summaries = evaluate(agent=self, episode_num=10) 133 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 134 | 135 | def store_agent_checkpoint(self): 136 | raise NotImplementedError 137 | 138 | def load_agent_checkpoint(self): 139 | raise NotImplementedError 140 | 141 | 142 | class OfflineBase(PolicyBase): 143 | def __init__(self, data_buffer, **kwargs): 144 | super().__init__(**kwargs) 145 | self.data_buffer = data_buffer 146 | 147 | def choose_action(self, obs, eval=True): 148 | """In offline settings, 149 | since the agent does not interact with the environment during training, 150 | this function is only used during evaluation. 
151 | """ 152 | raise NotImplementedError 153 | 154 | def train(self): 155 | raise NotImplementedError 156 | 157 | def learn(self): 158 | # Make the directory to save the training results that consist of checkpoint files and tensorboard files 159 | log_tools.make_dir(self.result_dir) 160 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 161 | 162 | if self.resume: 163 | self.load_agent_checkpoint() 164 | else: 165 | # delete tensorboard log file 166 | log_tools.del_all_files_in_dir(self.result_dir) 167 | 168 | while self.train_step < self.max_train_step: 169 | train_summaries = self.train() 170 | 171 | if self.train_step % self.log_interval == 0: 172 | self.store_agent_checkpoint() 173 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 174 | 175 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 176 | evaluate_summaries = evaluate(agent=self, episode_num=10) 177 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 178 | 179 | def store_agent_checkpoint(self): 180 | raise NotImplementedError 181 | 182 | def load_agent_checkpoint(self): 183 | raise NotImplementedError -------------------------------------------------------------------------------- /algos/ddpg.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OffPolicyBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class DDPG_Agent(OffPolicyBase): 10 | """ 11 | Implementation of Deep Deterministic Policy Gradient (DDPG) 12 | https://arxiv.org/abs/1509.02971 13 | """ 14 | def __init__(self, 15 | actor_net: torch.nn.Module, 16 | critic_net: torch.nn.Module, 17 | actor_lr=1e-4, 18 | critic_lr=1e-3, 19 | tau=0.005, # used to update target network, w' = tau*w + (1-tau)*w' 20 | gaussian_noise_sigma=0.2, 21 | **kwargs 22 | ): 23 | super().__init__(**kwargs) 24 | 25 | self.action_num = self.env.action_space.shape[0] 26 | self.action_bound = self.env.action_space.high[0] 27 | 28 | # the network and optimizers 29 | self.actor_net = actor_net.to(self.device) 30 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 31 | self.critic_net = critic_net.to(self.device) 32 | self.target_critic_net = copy.deepcopy(self.critic_net).to(self.device) 33 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 34 | self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=critic_lr) 35 | 36 | self.tau = tau 37 | self.gaussian_noise_sigma = gaussian_noise_sigma 38 | 39 | def choose_action(self, obs, eval=False): 40 | """Choose an action by deterministic policy with some gaussian noise""" 41 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 42 | with torch.no_grad(): 43 | action = self.actor_net(obs).cpu().numpy().flatten() 44 | if eval: 45 | return action 46 | else: 47 | noise = np.random.normal(0, self.gaussian_noise_sigma, size=self.action_num) 48 | return (action + noise).clip(-self.action_bound, self.action_bound) 49 | 50 | def train(self): 51 | 52 | # Sample 53 | batch = self.replay_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | # Compute target Q value 61 | with torch.no_grad(): 62 | next_act = self.target_actor_net(next_obs) 63 | next_Q = 
self.target_critic_net(next_obs, next_act).squeeze(1) 64 | target_Q = rews + (1. - done) * self.gamma * next_Q 65 | 66 | # Compute current Q 67 | current_Q = self.critic_net(obs, acts).squeeze(1) 68 | 69 | # Compute critic loss 70 | critic_loss = F.mse_loss(current_Q, target_Q) 71 | 72 | # Compute actor loss 73 | actor_loss = -self.critic_net(obs, self.actor_net(obs)).mean() 74 | 75 | # Optimize actor net 76 | self.actor_optimizer.zero_grad() 77 | actor_loss.backward() 78 | self.actor_optimizer.step() 79 | 80 | # Optimize critic net 81 | self.critic_optimizer.zero_grad() 82 | critic_loss.backward() 83 | self.critic_optimizer.step() 84 | 85 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 86 | soft_target_update(self.critic_net, self.target_critic_net, tau=self.tau) 87 | 88 | self.train_step += 1 89 | 90 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 91 | "critic_loss": critic_loss.cpu().item()} 92 | return train_summaries 93 | 94 | def store_agent_checkpoint(self): 95 | checkpoint = { 96 | "actor_net": self.actor_net.state_dict(), 97 | "critic_net": self.critic_net.state_dict(), 98 | "actor_optimizer": self.actor_optimizer.state_dict(), 99 | "critic_optimizer": self.critic_optimizer.state_dict(), 100 | "train_step": self.train_step, 101 | "episode_num": self.episode_num 102 | } 103 | torch.save(checkpoint, self.checkpoint_path) 104 | 105 | def load_agent_checkpoint(self): 106 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 107 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 108 | self.target_actor_net.load_state_dict(checkpoint["actor_net"]) 109 | self.critic_net.load_state_dict(checkpoint["critic_net"]) 110 | self.target_critic_net.load_state_dict(checkpoint["critic_net"]) 111 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 112 | self.critic_optimizer.load_state_dict(checkpoint["critic_optimizer"]) 113 | self.train_step = checkpoint["train_step"] 114 | self.episode_num = checkpoint["episode_num"] 115 | print("load checkpoint from \"" + self.checkpoint_path + 116 | "\" at " + str(self.train_step) + " time step") 117 | -------------------------------------------------------------------------------- /algos/ddqn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | from algos.base import OffPolicyBase 5 | from utils.train_tools import hard_target_update 6 | 7 | 8 | class DDQN_Agent(OffPolicyBase): 9 | """ 10 | Implementation of Double DQN (DDQN) 11 | https://arxiv.org/abs/1509.06461 12 | """ 13 | def __init__(self, 14 | Q_net: torch.nn.Module, 15 | qf_lr=0.001, 16 | initial_eps=0.1, 17 | end_eps=0.001, 18 | eps_decay_period=2000, 19 | eval_eps=0.001, 20 | target_update_freq =10, 21 | **kwargs 22 | ): 23 | super().__init__(**kwargs) 24 | 25 | self.target_update_freq = target_update_freq 26 | 27 | self.Q_net = Q_net.to(self.device) 28 | self.target_Q_net = copy.deepcopy(self.Q_net).to(self.device) 29 | self.optimizer = torch.optim.Adam(self.Q_net.parameters(), lr=qf_lr) 30 | 31 | # Decay for epsilon 32 | self.initial_eps = initial_eps 33 | self.end_eps = end_eps 34 | self.slope = (self.end_eps - self.initial_eps) / eps_decay_period 35 | self.eval_eps = eval_eps 36 | 37 | def choose_action(self, obs, eval=False): 38 | eps = self.eval_eps if eval else max(self.slope * self.train_step + self.initial_eps, self.end_eps) 39 | 40 | if np.random.uniform(0, 1) > eps: 41 | with 
torch.no_grad(): 42 | obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device) 43 | return int(self.Q_net(obs).argmax(dim=1).cpu()) 44 | else: 45 | return self.env.action_space.sample() 46 | 47 | def train(self): 48 | """ 49 | Sample a batch of data from replay buffer and train 50 | """ 51 | 52 | # Sample 53 | batch = self.replay_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | # Compute target Q value (Double DQN) 61 | with torch.no_grad(): 62 | next_acts = self.Q_net(next_obs).max(dim=1)[1].unsqueeze(1) # use Q net to get next actions, rather than target Q net 63 | target_Q = self.target_Q_net(next_obs).gather(1, next_acts).squeeze(1) 64 | target_Q = rews + (1. - done) * self.gamma * target_Q 65 | 66 | # Compute current Q value 67 | current_Q = self.Q_net(obs).gather(1, acts.long()).squeeze(1) 68 | 69 | # Compute Q loss 70 | q_loss = 0.5 * (target_Q - current_Q).pow(2).mean() 71 | # q_loss = F.mse_loss(current_Q, target_Q) 72 | 73 | # Optimize the Q network 74 | self.optimizer.zero_grad() 75 | q_loss.backward() 76 | self.optimizer.step() 77 | 78 | self.train_step += 1 79 | 80 | # update target Q 81 | if self.train_step % self.target_update_freq == 0: 82 | hard_target_update(self.Q_net, self.target_Q_net) 83 | 84 | train_summaries = {"q_loss": q_loss.cpu().item()} 85 | 86 | return train_summaries 87 | 88 | def store_agent_checkpoint(self): 89 | checkpoint = { 90 | "net": self.Q_net.state_dict(), 91 | "optimizer": self.optimizer.state_dict(), 92 | "train_step": self.train_step, 93 | "episode_num": self.episode_num 94 | } 95 | torch.save(checkpoint, self.checkpoint_path) 96 | 97 | def load_agent_checkpoint(self): 98 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 99 | self.Q_net.load_state_dict(checkpoint["net"]) 100 | self.target_Q_net = copy.deepcopy(self.Q_net) 101 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 102 | self.train_step = checkpoint["train_step"] 103 | self.episode_num = checkpoint["episode_num"] 104 | print("load checkpoint from \"" + self.checkpoint_path + 105 | "\" at " + str(self.train_step) + " time step") 106 | -------------------------------------------------------------------------------- /algos/dqn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | from algos.base import OffPolicyBase 5 | from utils.train_tools import hard_target_update 6 | 7 | 8 | class DQN_Agent(OffPolicyBase): 9 | """ 10 | Implementation of Deep Q-Network (DQN) 11 | https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf 12 | """ 13 | def __init__(self, 14 | Q_net: torch.nn.Module, 15 | qf_lr=0.001, 16 | initial_eps=0.1, 17 | end_eps=0.001, 18 | eps_decay_period=2000, 19 | eval_eps=0.001, 20 | target_update_freq =10, 21 | **kwargs 22 | ): 23 | super().__init__(**kwargs) 24 | 25 | self.target_update_freq = target_update_freq 26 | 27 | self.Q_net = Q_net.to(self.device) 28 | self.target_Q_net = copy.deepcopy(self.Q_net).to(self.device) 29 | self.optimizer = torch.optim.Adam(self.Q_net.parameters(), lr=qf_lr) 30 | 31 | # Decay for epsilon 32 | self.initial_eps = initial_eps 33 | self.end_eps = end_eps 34 | self.slope = (self.end_eps - self.initial_eps) / eps_decay_period 35 | self.eval_eps = eval_eps 36 | 37 | def 
choose_action(self, obs, eval=False): 38 | eps = self.eval_eps if eval else max(self.slope * self.train_step + self.initial_eps, self.end_eps) 39 | 40 | if np.random.uniform(0, 1) > eps: 41 | with torch.no_grad(): 42 | obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device) 43 | return int(self.Q_net(obs).argmax(dim=1).cpu()) 44 | else: 45 | return self.env.action_space.sample() 46 | 47 | def train(self): 48 | """ 49 | Sample a batch of data from replay buffer and train 50 | """ 51 | 52 | # Sample 53 | batch = self.replay_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | # Compute target Q value 61 | with torch.no_grad(): 62 | target_q = rews + (1. - done) * self.gamma * self.target_Q_net(next_obs).max(dim=1)[0] 63 | 64 | # Compute current Q value 65 | current_q = self.Q_net(obs).gather(1, acts.long()).squeeze(1) 66 | 67 | # Compute Q loss 68 | q_loss = 0.5 * (target_q - current_q).pow(2).mean() 69 | # Q_loss = F.mse_loss(current_Q, target_Q) 70 | 71 | # Optimize the Q network 72 | self.optimizer.zero_grad() 73 | q_loss.backward() 74 | self.optimizer.step() 75 | 76 | self.train_step += 1 77 | 78 | # update target Q 79 | if self.train_step % self.target_update_freq == 0: 80 | hard_target_update(self.Q_net, self.target_Q_net) 81 | 82 | train_summaries = {"q_loss": q_loss.cpu().item()} 83 | 84 | return train_summaries 85 | 86 | def store_agent_checkpoint(self): 87 | checkpoint = { 88 | "net": self.Q_net.state_dict(), 89 | "optimizer": self.optimizer.state_dict(), 90 | "train_step": self.train_step, 91 | "episode_num": self.episode_num 92 | } 93 | torch.save(checkpoint, self.checkpoint_path) 94 | 95 | def load_agent_checkpoint(self): 96 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 97 | self.Q_net.load_state_dict(checkpoint["net"]) 98 | self.target_Q_net = copy.deepcopy(self.Q_net) 99 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 100 | self.train_step = checkpoint["train_step"] 101 | self.episode_num = checkpoint["episode_num"] 102 | print("load checkpoint from \"" + self.checkpoint_path + 103 | "\" at " + str(self.train_step) + " time step") -------------------------------------------------------------------------------- /algos/offline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/algos/offline/__init__.py -------------------------------------------------------------------------------- /algos/offline/bcq.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from utils.train_tools import soft_target_update 6 | 7 | 8 | class BCQ_Agent(OfflineBase): 9 | """ 10 | Implementation of Batch-Constrained deep Q-learning(BCQ) in continuous action space 11 | https://arxiv.org/abs/1812.02900 12 | """ 13 | def __init__(self, 14 | critic_net1: torch.nn.Module, 15 | critic_net2: torch.nn.Module, 16 | perturbation_net: torch.nn.Module, 17 | cvae_net: torch.nn.Module, # generation model 18 | critic_lr=1e-3, 19 | per_lr=1e-3, 20 | cvae_lr=1e-3, 21 | tau=0.005, 22 | lmbda=0.75, # used for double clipped double q-learning 23 | **kwargs 24 | ): 25 | 
super().__init__(**kwargs) 26 | 27 | self.critic_net1 = critic_net1.to(self.device) 28 | self.critic_net2 = critic_net2.to(self.device) 29 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 30 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 31 | self.perturbation_net = perturbation_net.to(self.device) 32 | self.target_perturbation_net = copy.deepcopy(self.perturbation_net).to(self.device) 33 | self.cvae_net = cvae_net.to(self.device) 34 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 35 | self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 36 | self.perturbation_optimizer = torch.optim.Adam(self.perturbation_net.parameters(), lr=per_lr) 37 | self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr) 38 | 39 | self.tau = tau 40 | self.lmbda = lmbda 41 | 42 | def choose_action(self, obs, eval=True): 43 | with torch.no_grad(): 44 | obs = torch.FloatTensor(obs).reshape(1, -1).repeat(100, 1).to(self.device) 45 | generated_action = self.cvae_net.decode(obs, z_device=self.device) 46 | perturbed_action = self.perturbation_net(obs, generated_action) 47 | q1 = self.critic_net1(obs, perturbed_action) 48 | ind = q1.argmax(dim=0) 49 | return perturbed_action[ind].cpu().data.numpy().flatten() 50 | 51 | def train(self): 52 | # Sample 53 | batch = self.data_buffer.sample() 54 | obs = batch["obs"].to(self.device) 55 | acts = batch["acts"].to(self.device) 56 | rews = batch["rews"].to(self.device) 57 | next_obs = batch["next_obs"].to(self.device) 58 | done = batch["done"].to(self.device) 59 | 60 | """ 61 | CVAE Loss (the generation model) 62 | """ 63 | recon_action, mu, log_std = self.cvae_net(obs, acts) 64 | cvae_loss = self.cvae_net.loss_function(recon_action, acts, mu, log_std) 65 | 66 | self.cvae_optimizer.zero_grad() 67 | cvae_loss.backward() 68 | self.cvae_optimizer.step() 69 | 70 | """ 71 | Critic Loss 72 | """ 73 | with torch.no_grad(): 74 | # generate 10 actions for every next_obs 75 | next_obs = torch.repeat_interleave(next_obs, repeats=10, dim=0).to(self.device) 76 | generated_action = self.cvae_net.decode(next_obs, z_device=self.device) 77 | # perturb the generated action 78 | perturbed_action = self.target_perturbation_net(next_obs, generated_action) 79 | # compute target Q value of perturbed action 80 | target_q1 = self.target_critic_net1(next_obs, perturbed_action) 81 | target_q2 = self.target_critic_net2(next_obs, perturbed_action) 82 | # soft clipped double q-learning 83 | target_q = self.lmbda * torch.min(target_q1, target_q2) + (1. - self.lmbda) * torch.max(target_q1, target_q2) 84 | # take max over each action sampled from the generation and perturbation model 85 | target_q = target_q.reshape(obs.shape[0], 10, 1).max(1)[0].squeeze(1) 86 | target_q = rews + self.gamma * (1. 
- done) * target_q 87 | 88 | # compute current Q 89 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 90 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 91 | # compute critic loss 92 | critic_loss1 = F.mse_loss(current_q1, target_q) 93 | critic_loss2 = F.mse_loss(current_q2, target_q) 94 | 95 | self.critic_optimizer1.zero_grad() 96 | critic_loss1.backward() 97 | self.critic_optimizer1.step() 98 | 99 | self.critic_optimizer2.zero_grad() 100 | critic_loss2.backward() 101 | self.critic_optimizer2.step() 102 | 103 | """ 104 | Perturbation Loss 105 | """ 106 | generated_action_ = self.cvae_net.decode(obs, z_device=self.device) 107 | perturbed_action_ = self.perturbation_net(obs, generated_action_) 108 | perturbation_loss = -self.critic_net1(obs, perturbed_action_).mean() 109 | 110 | self.perturbation_optimizer.zero_grad() 111 | perturbation_loss.backward() 112 | self.perturbation_optimizer.step() 113 | 114 | """ 115 | Update target networks 116 | """ 117 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 118 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 119 | soft_target_update(self.perturbation_net, self.target_perturbation_net, tau=self.tau) 120 | 121 | self.train_step += 1 122 | 123 | train_summaries = {"cvae_loss": cvae_loss.cpu().item(), 124 | "critic_loss1": critic_loss1.cpu().item(), 125 | "critic_loss2": critic_loss2.cpu().item(), 126 | "perturbation_loss": perturbation_loss.cpu().item()} 127 | 128 | return train_summaries 129 | 130 | def store_agent_checkpoint(self): 131 | checkpoint = { 132 | "critic_net1": self.critic_net1.state_dict(), 133 | "critic_net2": self.critic_net2.state_dict(), 134 | "perturbation_net": self.perturbation_net.state_dict(), 135 | "cvae_net": self.cvae_net.state_dict(), 136 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 137 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 138 | "perturbation_optimizer": self.perturbation_optimizer.state_dict(), 139 | "cvae_optimizer": self.cvae_optimizer.state_dict(), 140 | "train_step": self.train_step, 141 | } 142 | torch.save(checkpoint, self.checkpoint_path) 143 | 144 | def load_agent_checkpoint(self): 145 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 146 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 147 | self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 148 | self.perturbation_net.load_state_dict(checkpoint["perturbation_net"]) 149 | self.cvae_net.load_state_dict(checkpoint["cvae_net"]) 150 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 151 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 152 | self.perturbation_optimizer.load_state_dict(checkpoint["perturbation_optimizer"]) 153 | self.cvae_optimizer.load_state_dict(checkpoint["cvae_optimizer"]) 154 | self.train_step = checkpoint["train_step"] 155 | 156 | print("load checkpoint from \"" + self.checkpoint_path + 157 | "\" at " + str(self.train_step) + " time step") 158 | -------------------------------------------------------------------------------- /algos/offline/bear.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from utils.train_tools import soft_target_update 6 | from common.networks import MLPSquashedReparamGaussianPolicy, CVAE, MLPQsaNet 7 | 8 | 9 | class BEAR_Agent(OfflineBase): 10 | 
""" 11 | Implementation of Bootstrapping Error Accumulation Reduction (BEAR) 12 | https://arxiv.org/abs/1906.00949 13 | BEAR's MMD Loss's weight alpha_prime is tuned automatically by default. 14 | 15 | Actor Loss: alpha_prime * MMD Loss + -minQ(s,a) 16 | Critic Loss: Like BCQ 17 | Alpha_prime Loss: -(alpha_prime * (MMD Loss - threshold)) 18 | """ 19 | def __init__(self, 20 | policy_net: MLPSquashedReparamGaussianPolicy, # actor 21 | q_net1: MLPQsaNet, # critic 22 | q_net2: MLPQsaNet, 23 | cvae_net: CVAE, 24 | policy_lr=1e-4, 25 | qf_lr=3e-4, 26 | cvae_lr=3e-4, 27 | tau=0.05, 28 | 29 | # BEAR 30 | lmbda=0.75, # used for double clipped double q-learning 31 | mmd_sigma=20.0, # the sigma used in mmd kernel 32 | kernel_type='gaussian', # the type of mmd kernel(gaussian or laplacian) 33 | lagrange_thresh=0.05, # the hyper-parameter used in automatic tuning alpha in cql loss 34 | n_action_samples=100, # the number of action samples to compute the best action when choose action 35 | n_target_samples=10, # the number of action samples to compute BCQ-like target value 36 | n_mmd_action_samples=4, # the number of action samples to compute MMD. 37 | warmup_step=40000, # do support matching with a warm start before policy(actor) train 38 | **kwargs 39 | ): 40 | super().__init__(**kwargs) 41 | 42 | # the network and optimizers 43 | self.policy_net = policy_net.to(self.device) 44 | self.q_net1 = q_net1.to(self.device) 45 | self.q_net2 = q_net2.to(self.device) 46 | self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device) 47 | self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device) 48 | self.cvae_net = cvae_net.to(self.device) 49 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr) 50 | self.q_optimizer1 = torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr) 51 | self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr) 52 | self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr) 53 | 54 | self.tau = tau 55 | 56 | self.lmbda = lmbda 57 | self.mmd_sigma = mmd_sigma 58 | self.kernel_type = kernel_type 59 | self.lagrange_thresh = lagrange_thresh 60 | self.n_action_samples = n_action_samples 61 | self.n_target_samples = n_target_samples 62 | self.n_mmd_action_samples = n_mmd_action_samples 63 | self.warmup_step = warmup_step 64 | 65 | # mmd loss's temperature 66 | self.log_alpha_prime = torch.zeros(1, requires_grad=True, device=self.device) 67 | self.alpha_prime_optimizer = torch.optim.Adam([self.log_alpha_prime], lr=1e-3) 68 | 69 | def choose_action(self, obs, eval=True): 70 | with torch.no_grad(): 71 | obs = torch.FloatTensor(obs).reshape(1, -1).repeat(self.n_action_samples, 1).to(self.device) 72 | action, _, _ = self.policy_net(obs) 73 | q1 = self.q_net1(obs, action) 74 | ind = q1.argmax(dim=0) 75 | return action[ind].cpu().numpy().flatten() 76 | 77 | def mmd_loss_laplacian(self, samples1, samples2, sigma=0.2): 78 | """MMD constraint with Laplacian kernel for support matching""" 79 | # sigma is set to 10.0 for hopper, cheetah and 20 for walker/ant 80 | diff_x_x = samples1.unsqueeze(2) - samples1.unsqueeze(1) # B x N x N x d 81 | diff_x_x = torch.mean((-(diff_x_x.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 82 | 83 | diff_x_y = samples1.unsqueeze(2) - samples2.unsqueeze(1) 84 | diff_x_y = torch.mean((-(diff_x_y.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 85 | 86 | diff_y_y = samples2.unsqueeze(2) - samples2.unsqueeze(1) # B x N x N x d 87 | diff_y_y = torch.mean((-(diff_y_y.abs()).sum(-1)/(2.0 * sigma)).exp(), 
dim=(1, 2)) 88 | 89 | overall_loss = (diff_x_x + diff_y_y - 2.0 * diff_x_y + 1e-6).sqrt() 90 | return overall_loss 91 | 92 | def mmd_loss_gaussian(self, samples1, samples2, sigma=0.2): 93 | """MMD constraint with Gaussian Kernel support matching""" 94 | # sigma is set to 10.0 for hopper, cheetah and 20 for walker/ant 95 | diff_x_x = samples1.unsqueeze(2) - samples1.unsqueeze(1) # B x N x N x d 96 | diff_x_x = torch.mean((-(diff_x_x.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 97 | 98 | diff_x_y = samples1.unsqueeze(2) - samples2.unsqueeze(1) 99 | diff_x_y = torch.mean((-(diff_x_y.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 100 | 101 | diff_y_y = samples2.unsqueeze(2) - samples2.unsqueeze(1) # B x N x N x d 102 | diff_y_y = torch.mean((-(diff_y_y.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2)) 103 | 104 | overall_loss = (diff_x_x + diff_y_y - 2.0 * diff_x_y + 1e-6).sqrt() 105 | return overall_loss 106 | 107 | def train(self): 108 | # Sample 109 | batch = self.data_buffer.sample() 110 | obs = batch["obs"].to(self.device) 111 | acts = batch["acts"].to(self.device) 112 | rews = batch["rews"].to(self.device) 113 | next_obs = batch["next_obs"].to(self.device) 114 | done = batch["done"].to(self.device) 115 | 116 | """ 117 | Train the Behaviour cloning policy to be able to take more than 1 sample for MMD. 118 | Conditional VAE is used as Behaviour cloning policy in BEAR. 119 | """ 120 | recon_action, mu, log_std = self.cvae_net(obs, acts) 121 | cvae_loss = self.cvae_net.loss_function(recon_action, acts, mu, log_std) 122 | 123 | self.cvae_optimizer.zero_grad() 124 | cvae_loss.backward() 125 | self.cvae_optimizer.step() 126 | 127 | """ 128 | Critic Training 129 | """ 130 | with torch.no_grad(): 131 | # generate 10 actions for every next_obs(Same as BCQ) 132 | next_obs = torch.repeat_interleave(next_obs, repeats=self.n_target_samples, dim=0).to(self.device) 133 | # compute target Q value of generated action 134 | target_q1 = self.target_q_net1(next_obs, self.policy_net(next_obs)[0]) 135 | target_q2 = self.target_q_net2(next_obs, self.policy_net(next_obs)[0]) 136 | # soft clipped double q-learning 137 | target_q = self.lmbda * torch.min(target_q1, target_q2) + (1. - self.lmbda) * torch.max(target_q1, target_q2) 138 | # take max over each action sampled from the generation and perturbation model 139 | target_q = target_q.reshape(obs.shape[0], self.n_target_samples, 1).max(1)[0].squeeze(1) 140 | target_q = rews + self.gamma * (1. 
- done) * target_q 141 | 142 | # compute current Q 143 | current_q1 = self.q_net1(obs, acts).squeeze(1) 144 | current_q2 = self.q_net2(obs, acts).squeeze(1) 145 | # compute critic loss 146 | critic_loss1 = F.mse_loss(current_q1, target_q) 147 | critic_loss2 = F.mse_loss(current_q2, target_q) 148 | 149 | self.q_optimizer1.zero_grad() 150 | critic_loss1.backward() 151 | self.q_optimizer1.step() 152 | 153 | self.q_optimizer2.zero_grad() 154 | critic_loss2.backward() 155 | self.q_optimizer2.step() 156 | 157 | # MMD Loss 158 | # sample actions from dataset and current policy(B x N x D) 159 | raw_sampled_actions = self.cvae_net.decode_multiple_without_squash(obs, decode_num=self.n_mmd_action_samples, 160 | z_device=self.device) 161 | raw_actor_actions = self.policy_net.sample_multiple_without_squash(obs, sample_num=self.n_mmd_action_samples) 162 | if self.kernel_type == 'gaussian': 163 | mmd_loss = self.mmd_loss_gaussian(raw_sampled_actions, raw_actor_actions, sigma=self.mmd_sigma) 164 | else: 165 | mmd_loss = self.mmd_loss_laplacian(raw_sampled_actions, raw_actor_actions, sigma=self.mmd_sigma) 166 | 167 | """ 168 | Alpha prime training(lagrangian parameter update for MMD loss weight) 169 | """ 170 | alpha_prime_loss = -(self.log_alpha_prime.exp() * (mmd_loss - self.lagrange_thresh)).mean() 171 | self.alpha_prime_optimizer.zero_grad() 172 | alpha_prime_loss.backward(retain_graph=True) 173 | self.alpha_prime_optimizer.step() 174 | 175 | self.log_alpha_prime.data.clamp_(min=-5.0, max=10.0) # clip for stability 176 | 177 | """ 178 | Actor Training 179 | Actor Loss = alpha_prime * MMD Loss + -minQ(s,a) 180 | """ 181 | a, log_prob, _ = self.policy_net(obs) 182 | min_q = torch.min(self.q_net1(obs, a), self.q_net2(obs, a)).squeeze(1) 183 | # policy_loss = (self.alpha * log_prob - min_q).mean() # SAC Type 184 | policy_loss = - (min_q.mean()) 185 | 186 | # BEAR Actor Loss 187 | actor_loss = (self.log_alpha_prime.exp() * mmd_loss).mean() 188 | if self.train_step > self.warmup_step: 189 | actor_loss = policy_loss + actor_loss 190 | self.policy_optimizer.zero_grad() 191 | actor_loss.backward() # the mmd_loss will backward again in alpha_prime_loss. 
192 | self.policy_optimizer.step() 193 | 194 | soft_target_update(self.q_net1, self.target_q_net1, tau=self.tau) 195 | soft_target_update(self.q_net2, self.target_q_net2, tau=self.tau) 196 | 197 | self.train_step += 1 198 | 199 | train_summaries = {"actor_loss": policy_loss.cpu().item(), 200 | "critic_loss1": critic_loss1.cpu().item(), 201 | "critic_loss2": critic_loss2.cpu().item(), 202 | "alpha_prime_loss": alpha_prime_loss.cpu().item()} 203 | 204 | return train_summaries 205 | 206 | def store_agent_checkpoint(self): 207 | checkpoint = { 208 | "q_net1": self.q_net1.state_dict(), 209 | "q_net2": self.q_net2.state_dict(), 210 | "policy_net": self.policy_net.state_dict(), 211 | "q_optimizer1": self.q_optimizer1.state_dict(), 212 | "q_optimizer2": self.q_optimizer2.state_dict(), 213 | "policy_optimizer": self.policy_optimizer.state_dict(), 214 | "log_alpha_prime": self.log_alpha_prime, 215 | "alpha_prime_optimizer": self.alpha_prime_optimizer.state_dict(), 216 | "train_step": self.train_step, 217 | } 218 | 219 | torch.save(checkpoint, self.checkpoint_path) 220 | 221 | def load_agent_checkpoint(self): 222 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 223 | self.q_net1.load_state_dict(checkpoint["q_net1"]) 224 | self.q_net2.load_state_dict(checkpoint["q_net2"]) 225 | self.policy_net.load_state_dict(checkpoint["policy_net"]) 226 | self.q_optimizer1.load_state_dict(checkpoint["q_optimizer1"]) 227 | self.q_optimizer2.load_state_dict(checkpoint["q_optimizer2"]) 228 | self.policy_optimizer.load_state_dict(checkpoint["policy_optimizer"]) 229 | self.log_alpha_prime = checkpoint["log_alpha_prime"] 230 | self.alpha_prime_optimizer.load_state_dict(checkpoint["alpha_prime_optimizer"]) 231 | self.train_step = checkpoint["train_step"] 232 | 233 | print("load checkpoint from \"" + self.checkpoint_path + 234 | "\" at " + str(self.train_step) + " time step") 235 | -------------------------------------------------------------------------------- /algos/offline/plas.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from common.networks import MLPQsaNet, CVAE, PLAS_Actor 6 | from utils.train_tools import soft_target_update, evaluate 7 | from utils import log_tools 8 | 9 | 10 | class PLAS_Agent(OfflineBase): 11 | """ 12 | Implementation of Policy in the Latent Action Space(PLAS) in continuous action space 13 | https://arxiv.org/abs/2011.07213 14 | """ 15 | def __init__(self, 16 | critic_net1: MLPQsaNet, 17 | critic_net2: MLPQsaNet, 18 | actor_net: PLAS_Actor, 19 | cvae_net: CVAE, # generation model 20 | critic_lr=1e-3, 21 | actor_lr=1e-4, 22 | cvae_lr=1e-4, 23 | tau=0.005, 24 | lmbda=0.75, # used for double clipped double q-learning 25 | max_cvae_iterations=500000, # the num of iterations when training CVAE model 26 | **kwargs 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | self.critic_net1 = critic_net1.to(self.device) 31 | self.critic_net2 = critic_net2.to(self.device) 32 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 33 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 34 | self.actor_net = actor_net.to(self.device) 35 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 36 | self.cvae_net = cvae_net.to(self.device) 37 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 38 | 
self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 39 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 40 | self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr) 41 | 42 | self.tau = tau 43 | self.lmbda = lmbda 44 | self.max_cvae_iterations = max_cvae_iterations 45 | self.cvae_iterations= 0 46 | 47 | def choose_action(self, obs, eval=True): 48 | with torch.no_grad(): 49 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 50 | action = self.actor_net(obs, self.cvae_net.decode) 51 | return action.cpu().data.numpy().flatten() 52 | 53 | def train_cvae(self): 54 | """ 55 | Train CVAE one step 56 | """ 57 | # Sample 58 | batch = self.data_buffer.sample() 59 | obs = batch["obs"].to(self.device) 60 | acts = batch["acts"].to(self.device) 61 | 62 | recon_action, mu, log_std = self.cvae_net(obs, acts) 63 | cvae_loss = self.cvae_net.loss_function(recon_action, acts, mu, log_std) 64 | 65 | self.cvae_optimizer.zero_grad() 66 | cvae_loss.backward() 67 | self.cvae_optimizer.step() 68 | 69 | self.cvae_iterations += 1 70 | 71 | train_summaries = {"cvae_loss": cvae_loss.cpu().item()} 72 | 73 | return train_summaries 74 | 75 | def train(self): 76 | # Sample 77 | batch = self.data_buffer.sample() 78 | obs = batch["obs"].to(self.device) 79 | acts = batch["acts"].to(self.device) 80 | rews = batch["rews"].to(self.device) 81 | next_obs = batch["next_obs"].to(self.device) 82 | done = batch["done"].to(self.device) 83 | 84 | """ 85 | Train Critic 86 | """ 87 | with torch.no_grad(): 88 | decode_action_next = self.target_actor_net(next_obs, self.cvae_net.decode) 89 | 90 | target_q1 = self.target_critic_net1(next_obs, decode_action_next) 91 | target_q2 = self.target_critic_net2(next_obs, decode_action_next) 92 | 93 | target_q = (self.lmbda * torch.min(target_q1, target_q2) + (1. - self.lmbda) * torch.max(target_q1, target_q2)).squeeze(1) 94 | target_q = rews + self.gamma * (1. 
- done) * target_q 95 | 96 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 97 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 98 | 99 | critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) 100 | 101 | self.critic_optimizer1.zero_grad() 102 | self.critic_optimizer2.zero_grad() 103 | critic_loss.backward() 104 | self.critic_optimizer1.step() 105 | self.critic_optimizer2.step() 106 | 107 | """ 108 | Train Actor 109 | """ 110 | decode_action = self.actor_net(obs, self.cvae_net.decode) 111 | actor_loss = -self.critic_net1(obs, decode_action).mean() 112 | 113 | self.actor_optimizer.zero_grad() 114 | actor_loss.backward() 115 | self.actor_optimizer.step() 116 | 117 | """ 118 | Update target networks 119 | """ 120 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 121 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 122 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 123 | 124 | self.train_step += 1 125 | 126 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 127 | "critic_loss": critic_loss.cpu().item()} 128 | 129 | return train_summaries 130 | 131 | def learn(self): 132 | """Train PLAS without interacting with the environment (offline)""" 133 | 134 | log_tools.make_dir(self.result_dir) 135 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 136 | 137 | if self.resume: 138 | self.load_agent_checkpoint() 139 | else: 140 | # delete tensorboard log file 141 | log_tools.del_all_files_in_dir(self.result_dir) 142 | 143 | # Train CVAE before train agent 144 | print("==============================Start to train CVAE==============================") 145 | 146 | while self.cvae_iterations < self.max_cvae_iterations: 147 | train_summaries_cvae = self.train_cvae() 148 | if self.cvae_iterations % 1000 == 0: 149 | print("CVAE iteration:", self.cvae_iterations, "\t", "CVAE Loss:", train_summaries_cvae["cvae_loss"]) 150 | tensorboard_writer.log_train_data(train_summaries_cvae, self.cvae_iterations) 151 | 152 | # Train Agent 153 | print("==============================Start to train Agent==============================") 154 | while self.train_step < self.max_train_step: 155 | train_summaries = self.train() 156 | 157 | if self.train_step % self.log_interval == 0: 158 | self.store_agent_checkpoint() 159 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 160 | 161 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 162 | evaluate_summaries = evaluate(agent=self, episode_num=10) 163 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 164 | 165 | def store_agent_checkpoint(self): 166 | checkpoint = { 167 | "critic_net1": self.critic_net1.state_dict(), 168 | "critic_net2": self.critic_net2.state_dict(), 169 | "actor_net": self.actor_net.state_dict(), 170 | "cvae_net": self.cvae_net.state_dict(), 171 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 172 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 173 | "actor_optimizer": self.actor_optimizer.state_dict(), 174 | "cvae_optimizer": self.cvae_optimizer.state_dict(), 175 | "train_step": self.train_step, 176 | "cvae_iterations": self.cvae_iterations, 177 | } 178 | torch.save(checkpoint, self.checkpoint_path) 179 | 180 | def load_agent_checkpoint(self): 181 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 182 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 183 | 
self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 184 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 185 | self.cvae_net.load_state_dict(checkpoint["cvae_net"]) 186 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 187 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 188 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 189 | self.cvae_optimizer.load_state_dict(checkpoint["cvae_optimizer"]) 190 | self.train_step = checkpoint["train_step"] 191 | self.cvae_iterations = checkpoint["cvae_iterations"] 192 | 193 | print("load checkpoint from \"" + self.checkpoint_path + 194 | "\" at " + str(self.train_step) + " time step") 195 | -------------------------------------------------------------------------------- /algos/offline/sac_offline.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OfflineBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class SAC_Offline_Agent(OfflineBase): 10 | """ 11 | The SAC 12 | """ 13 | def __init__(self, 14 | policy_net: torch.nn.Module, # actor 15 | q_net1: torch.nn.Module, # critic 16 | q_net2: torch.nn.Module, 17 | policy_lr=3e-4, 18 | qf_lr=3e-4, 19 | tau=0.05, 20 | alpha=0.5, 21 | auto_alpha_tuning=False, 22 | **kwargs 23 | ): 24 | super().__init__(**kwargs) 25 | 26 | # the network and optimizers 27 | self.policy_net = policy_net.to(self.device) 28 | self.q_net1 = q_net1.to(self.device) 29 | self.q_net2 = q_net2.to(self.device) 30 | self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device) 31 | self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device) 32 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr) 33 | self.q_optimizer1 = torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr) 34 | self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr) 35 | 36 | self.tau = tau 37 | self.alpha = alpha 38 | self.auto_alpha_tuning = auto_alpha_tuning 39 | 40 | if self.auto_alpha_tuning: 41 | self.target_entropy = -np.prod(self.env.action_space.shape).item() 42 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 43 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=policy_lr) 44 | self.alpha = torch.exp(self.log_alpha) 45 | 46 | def choose_action(self, obs, eval=True): 47 | with torch.no_grad(): 48 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 49 | _, _, mu_action = self.policy_net(obs) 50 | 51 | return mu_action.cpu().numpy().flatten() 52 | 53 | def train(self): 54 | 55 | # Sample 56 | batch = self.data_buffer.sample() 57 | obs = batch["obs"].to(self.device) 58 | acts = batch["acts"].to(self.device) 59 | rews = batch["rews"].to(self.device) 60 | next_obs = batch["next_obs"].to(self.device) 61 | done = batch["done"].to(self.device) 62 | 63 | # compute policy Loss 64 | a, log_prob, _ = self.policy_net(obs) 65 | min_q = torch.min(self.q_net1(obs, a), self.q_net2(obs, a)).squeeze(1) 66 | policy_loss = (self.alpha * log_prob - min_q).mean() 67 | 68 | # compute Q Loss 69 | q1 = self.q_net1(obs, acts).squeeze(1) 70 | q2 = self.q_net2(obs, acts).squeeze(1) 71 | with torch.no_grad(): 72 | next_a, next_log_prob, _ = self.policy_net(next_obs) 73 | min_target_next_q = torch.min(self.target_q_net1(next_obs, next_a), self.target_q_net2(next_obs, next_a)).squeeze(1) 74 | y = rews + self.gamma * (1. 
- done) * (min_target_next_q - self.alpha * next_log_prob) 75 | 76 | q_loss1 = F.mse_loss(q1, y) 77 | q_loss2 = F.mse_loss(q2, y) 78 | 79 | # Update policy network parameter 80 | # policy network's update should be done before updating q network, or there will make some errors 81 | self.policy_optimizer.zero_grad() 82 | policy_loss.backward() 83 | self.policy_optimizer.step() 84 | 85 | # Update q network1 parameter 86 | self.q_optimizer1.zero_grad() 87 | q_loss1.backward() 88 | self.q_optimizer1.step() 89 | 90 | # Update q network2 parameter 91 | self.q_optimizer2.zero_grad() 92 | q_loss2.backward() 93 | self.q_optimizer2.step() 94 | 95 | if self.auto_alpha_tuning: 96 | alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 97 | self.alpha_optimizer.zero_grad() 98 | alpha_loss.backward() 99 | self.alpha_optimizer.step() 100 | 101 | self.alpha = self.log_alpha.exp() 102 | else: 103 | alpha_loss = torch.tensor(0) 104 | 105 | self.train_step += 1 106 | 107 | soft_target_update(self.q_net1, self.target_q_net1, tau=self.tau) 108 | soft_target_update(self.q_net2, self.target_q_net2, tau=self.tau) 109 | 110 | train_summaries = {"actor_loss": policy_loss.cpu().item(), 111 | "critic_loss1": q_loss1.cpu().item(), 112 | "critic_loss2": q_loss2.cpu().item(), 113 | "alpha_loss": alpha_loss.cpu().item()} 114 | 115 | return train_summaries 116 | 117 | def store_agent_checkpoint(self): 118 | checkpoint = { 119 | "q_net1": self.q_net1.state_dict(), 120 | "q_net2": self.q_net2.state_dict(), 121 | "policy_net": self.policy_net.state_dict(), 122 | "q_optimizer1": self.q_optimizer1.state_dict(), 123 | "q_optimizer2": self.q_optimizer2.state_dict(), 124 | "policy_optimizer": self.policy_optimizer.state_dict(), 125 | "train_step": self.train_step, 126 | } 127 | if self.auto_alpha_tuning: 128 | checkpoint["log_alpha"] = self.log_alpha 129 | checkpoint["alpha_optimizer"] = self.alpha_optimizer.state_dict() 130 | torch.save(checkpoint, self.checkpoint_path) 131 | 132 | def load_agent_checkpoint(self): 133 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 134 | self.q_net1.load_state_dict(checkpoint["q_net1"]) 135 | self.q_net2.load_state_dict(checkpoint["q_net2"]) 136 | self.policy_net.load_state_dict(checkpoint["policy_net"]) 137 | self.q_optimizer1.load_state_dict(checkpoint["q_optimizer1"]) 138 | self.q_optimizer2.load_state_dict(checkpoint["q_optimizer2"]) 139 | self.policy_optimizer.load_state_dict(checkpoint["policy_optimizer"]) 140 | self.train_step = checkpoint["train_step"] 141 | if self.auto_alpha_tuning: 142 | self.log_alpha = checkpoint["log_alpha"] 143 | self.alpha_optimizer.load_state_dict(checkpoint["alpha_optimizer"]) 144 | 145 | print("load checkpoint from \"" + self.checkpoint_path + 146 | "\" at " + str(self.train_step) + " time step") -------------------------------------------------------------------------------- /algos/offline/td3_bc.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | from algos.base import OfflineBase 5 | from utils.train_tools import soft_target_update 6 | 7 | 8 | class TD3_BC_Agent(OfflineBase): 9 | """ 10 | Implementation of TD3 with behavior cloning (TD3_BC) 11 | https://arxiv.org/abs/2106.06860 12 | """ 13 | def __init__(self, 14 | actor_net: torch.nn.Module, 15 | critic_net1: torch.nn.Module, 16 | critic_net2: torch.nn.Module, 17 | actor_lr=3e-4, 18 | critic_lr=3e-4, 19 | 
tau=0.005, # used to update target network, w' = tau*w + (1-tau)*w' 20 | policy_noise=0.2, # Noise added to target policy during critic update 21 | noise_clip=0.5, # Range to clip target policy noise 22 | policy_delay=2, # Frequency of delayed policy updates 23 | alpha=2.5, # The alpha to compute lambda 24 | **kwargs 25 | ): 26 | super().__init__(**kwargs) 27 | 28 | self.action_num = self.env.action_space.shape[0] 29 | self.action_bound = self.env.action_space.high[0] 30 | 31 | # the network and optimizers 32 | self.actor_net = actor_net.to(self.device) 33 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 34 | self.critic_net1 = critic_net1.to(self.device) 35 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 36 | self.critic_net2 = critic_net2.to(self.device) 37 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 38 | 39 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 40 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 41 | self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 42 | 43 | self.tau = tau 44 | self.policy_noise = policy_noise 45 | self.noise_clip = noise_clip 46 | self.policy_delay = policy_delay 47 | self.alpha = alpha 48 | 49 | self.actor_loss = 0 50 | 51 | def choose_action(self, obs, eval=True): 52 | """Choose an action by deterministic policy with some gaussian noise""" 53 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 54 | with torch.no_grad(): 55 | action = self.actor_net(obs).cpu().numpy().flatten() 56 | return action 57 | 58 | def train(self): 59 | 60 | # Sample 61 | batch = self.data_buffer.sample() 62 | obs = batch["obs"].to(self.device) 63 | acts = batch["acts"].to(self.device) 64 | rews = batch["rews"].to(self.device) 65 | next_obs = batch["next_obs"].to(self.device) 66 | done = batch["done"].to(self.device) 67 | 68 | # Target Policy Smoothing. Add clipped noise to next actions when computing target Q. 69 | with torch.no_grad(): 70 | noise = torch.normal(mean=0, std=self.policy_noise, size=acts.size()).to(self.device) 71 | noise = noise.clamp(-self.noise_clip, self.noise_clip) 72 | next_act = self.target_actor_net(next_obs) + noise 73 | next_act = next_act.clamp(-self.action_bound, self.action_bound) 74 | 75 | # Clipped Double Q-Learning. Compute the min of target Q1 and target Q2 76 | min_target_q = torch.min(self.target_critic_net1(next_obs, next_act), 77 | self.target_critic_net2(next_obs, next_act)).squeeze(1) 78 | y = rews + self.gamma * (1. 
- done) * min_target_q 79 | 80 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 81 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 82 | 83 | # TD3 Loss 84 | critic_loss1 = F.mse_loss(current_q1, y) 85 | critic_loss2 = F.mse_loss(current_q2, y) 86 | 87 | # Optimize critic net 88 | self.critic_optimizer1.zero_grad() 89 | critic_loss1.backward() 90 | self.critic_optimizer1.step() 91 | 92 | self.critic_optimizer2.zero_grad() 93 | critic_loss2.backward() 94 | self.critic_optimizer2.step() 95 | 96 | if (self.train_step+1) % self.policy_delay == 0: 97 | # Compute actor loss 98 | pi = self.actor_net(obs) 99 | Q = self.critic_net1(obs, pi) 100 | lmbda = self.alpha / Q.abs().mean().detach() 101 | actor_loss = -lmbda * Q.mean() + F.mse_loss(pi, acts) 102 | 103 | # Optimize actor net 104 | self.actor_optimizer.zero_grad() 105 | actor_loss.backward() 106 | self.actor_optimizer.step() 107 | 108 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 109 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 110 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 111 | else: 112 | actor_loss = torch.tensor(0) 113 | 114 | self.train_step += 1 115 | 116 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 117 | "critic_loss1": critic_loss1.cpu().item(), 118 | "critic_loss2": critic_loss2.cpu().item()} 119 | 120 | return train_summaries 121 | 122 | def store_agent_checkpoint(self): 123 | checkpoint = { 124 | "actor_net": self.actor_net.state_dict(), 125 | "critic_net1": self.critic_net1.state_dict(), 126 | "critic_net2": self.critic_net2.state_dict(), 127 | "actor_optimizer": self.actor_optimizer.state_dict(), 128 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 129 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 130 | "train_step": self.train_step, 131 | } 132 | torch.save(checkpoint, self.checkpoint_path) 133 | 134 | def load_agent_checkpoint(self): 135 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 136 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 137 | self.target_actor_net.load_state_dict(checkpoint["actor_net"]) 138 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 139 | self.target_critic_net1.load_state_dict(checkpoint["critic_net1"]) 140 | self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 141 | self.target_critic_net2.load_state_dict(checkpoint["critic_net2"]) 142 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 143 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 144 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 145 | self.train_step = checkpoint["train_step"] 146 | print("load checkpoint from \"" + self.checkpoint_path + 147 | "\" at " + str(self.train_step) + " time step") 148 | -------------------------------------------------------------------------------- /algos/ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from algos.base import PolicyBase 4 | from common.buffers import TrajectoryBuffer 5 | from utils.train_tools import evaluate 6 | from utils import log_tools 7 | 8 | 9 | class PPO_Agent(PolicyBase): 10 | """ 11 | Implementation of Proximal Policy Optimization (PPO) 12 | This is the version of "PPO-Clip" 13 | https://arxiv.org/abs/1707.06347 14 | """ 15 | def __init__(self, 16 | trajectory_buffer: TrajectoryBuffer, 17 | actor_net: 
torch.nn.Module, 18 | critic_net: torch.nn.Module, 19 | actor_lr=1e-4, 20 | critic_lr=1e-3, 21 | gae_lambda=0.95, 22 | gae_normalize=False, 23 | clip_pram=0.2, 24 | trajectory_length=128, # the length of a trajectory_ 25 | train_actor_iters=10, 26 | train_critic_iters=10, 27 | **kwargs 28 | ): 29 | super().__init__(**kwargs) 30 | 31 | self.trajectory_buffer = trajectory_buffer 32 | 33 | # the network and optimizers 34 | self.actor_net = actor_net.to(self.device) 35 | self.critic_net = critic_net.to(self.device) 36 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 37 | self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=critic_lr) 38 | 39 | self.gae_lambda = gae_lambda 40 | self.gae_normalize = gae_normalize 41 | self.trajectory_length = trajectory_length 42 | self.train_actor_iters = train_actor_iters 43 | self.train_critic_iters = train_critic_iters 44 | self.clip_pram = clip_pram 45 | 46 | self.episode_num = 0 47 | 48 | def choose_action(self, obs, eval=False): 49 | with torch.no_grad(): 50 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 51 | action, log_prob, eval_action = self.actor_net(obs) 52 | if eval: 53 | action = eval_action 54 | return action.cpu().numpy().squeeze(0), log_prob.cpu().numpy()[0] 55 | 56 | def train(self): 57 | batch = self.trajectory_buffer.sample() 58 | obs = batch["obs"].to(self.device) 59 | acts = batch["acts"].to(self.device) 60 | log_probs = batch["log_probs"].to(self.device) 61 | gae_advs = batch["gae_advs"].to(self.device) 62 | rets = batch["rets"].to(self.device) 63 | 64 | # Train policy with multiple steps of gradient descent 65 | for i in range(self.train_actor_iters): 66 | _, new_log_probs, _ = self.actor_net(obs, acts.squeeze()) 67 | ratios = torch.exp(new_log_probs - log_probs) 68 | 69 | surrogate = ratios * gae_advs 70 | clipped_surrogate = torch.clamp(ratios, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * gae_advs 71 | actor_loss = -(torch.min(surrogate, clipped_surrogate)).mean() 72 | 73 | self.actor_optimizer.zero_grad() 74 | actor_loss.backward() 75 | self.actor_optimizer.step() 76 | 77 | # Train value function with multiple steps of gradient descent 78 | for i in range(self.train_critic_iters): 79 | values = self.critic_net(obs).squeeze() 80 | critic_loss = 0.5 * ((rets - values) ** 2).mean() 81 | self.critic_optimizer.zero_grad() 82 | critic_loss.backward() 83 | self.critic_optimizer.step() 84 | 85 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 86 | "critic_loss": critic_loss.cpu().item()} 87 | 88 | return train_summaries 89 | 90 | def learn(self): 91 | log_tools.make_dir(self.result_dir) 92 | tensorboard_writer = log_tools.TensorboardLogger(self.result_dir) 93 | 94 | if self.resume: 95 | self.load_agent_checkpoint() 96 | else: 97 | # delete tensorboard log file 98 | log_tools.del_all_files_in_dir(self.result_dir) 99 | 100 | print("==============================start train===================================") 101 | obs = self.env.reset() 102 | done = False 103 | 104 | episode_reward = 0 105 | episode_length = 0 106 | trajectory_length = 0 107 | 108 | while self.train_step < self.max_train_step: 109 | action, log_prob = self.choose_action(np.array(obs)) 110 | next_obs, reward, done, info = self.env.step(action) 111 | value = self.critic_net(torch.tensor([obs], dtype=torch.float32)).item() 112 | episode_reward += reward 113 | self.trajectory_buffer.add(obs, action, reward, done, log_prob, value) 114 | obs = next_obs 115 | episode_length += 1 116 | trajectory_length 
+= 1 117 | self.train_step += 1 118 | 119 | if done: 120 | obs = self.env.reset() 121 | self.episode_num += 1 122 | 123 | print(f"Time Step: {self.train_step} Episode Num: {self.episode_num} " 124 | f"Episode Length: {episode_length} Episode Reward: {episode_reward:.2f}") 125 | tensorboard_writer.log_learn_data({"episode_length": episode_length, 126 | "episode_reward": episode_reward}, self.train_step) 127 | episode_reward = 0 128 | episode_length = 0 129 | 130 | if trajectory_length == self.trajectory_length: 131 | last_val = self.critic_net(torch.tensor([obs], dtype=torch.float32)).item() if done else 0 132 | self.trajectory_buffer.finish_path(last_val=last_val, gamma=self.gamma, 133 | gae_lambda=self.gae_lambda, gae_normalize=self.gae_normalize) 134 | train_summaries = self.train() 135 | trajectory_length = 0 136 | 137 | if self.train_step % self.log_interval == 0: 138 | self.store_agent_checkpoint() 139 | tensorboard_writer.log_train_data(train_summaries, self.train_step) 140 | 141 | if self.eval_freq > 0 and self.train_step % self.eval_freq == 0: 142 | evaluate_summaries = evaluate(agent=self, episode_num=10) 143 | tensorboard_writer.log_eval_data(evaluate_summaries, self.train_step) 144 | 145 | def store_agent_checkpoint(self): 146 | checkpoint = { 147 | "actor_net": self.actor_net.state_dict(), 148 | "critic_net": self.critic_net.state_dict(), 149 | "actor_optimizer": self.actor_optimizer.state_dict(), 150 | "critic_optimizer": self.critic_optimizer.state_dict(), 151 | "train_step": self.train_step, 152 | "episode_num": self.episode_num 153 | } 154 | torch.save(checkpoint, self.checkpoint_path) 155 | 156 | def load_agent_checkpoint(self): 157 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 158 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 159 | self.critic_net.load_state_dict(checkpoint["critic_net"]) 160 | self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 161 | self.critic_optimizer.load_state_dict(checkpoint["critic_optimizer"]) 162 | self.train_step = checkpoint["train_step"] 163 | self.episode_num = checkpoint["episode_num"] 164 | print("load checkpoint from \"" + self.checkpoint_path + 165 | "\" at " + str(self.train_step) + " time step") 166 | -------------------------------------------------------------------------------- /algos/sac.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OffPolicyBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class SAC_Agent(OffPolicyBase): 10 | """ 11 | Implementation of Soft Actor-Critic (SAC) 12 | https://arxiv.org/abs/1812.05905(SAC 2019) 13 | """ 14 | def __init__(self, 15 | policy_net: torch.nn.Module, # actor 16 | q_net1: torch.nn.Module, # critic 17 | q_net2: torch.nn.Module, 18 | policy_lr=4e-3, 19 | qf_lr=4e-3, 20 | tau=0.05, 21 | alpha=0.5, 22 | auto_alpha_tuning=False, 23 | **kwargs 24 | ): 25 | super().__init__(**kwargs) 26 | 27 | # the network and optimizers 28 | self.policy_net = policy_net.to(self.device) 29 | self.q_net1 = q_net1.to(self.device) 30 | self.q_net2 = q_net2.to(self.device) 31 | self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device) 32 | self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device) 33 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr) 34 | self.q_optimizer1 = 
torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr) 35 | self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr) 36 | 37 | self.tau = tau 38 | self.alpha = alpha 39 | self.auto_alpha_tuning = auto_alpha_tuning 40 | 41 | if self.auto_alpha_tuning: 42 | self.target_entropy = -np.prod(self.env.action_space.shape).item() 43 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 44 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=policy_lr) 45 | self.alpha = torch.exp(self.log_alpha) 46 | 47 | def choose_action(self, obs, eval=False): 48 | with torch.no_grad(): 49 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 50 | action, log_prob, mu_action = self.policy_net(obs) 51 | 52 | if eval: 53 | action = mu_action # if eval, use mu as the action 54 | 55 | return action.cpu().numpy().flatten() 56 | 57 | def train(self): 58 | 59 | # Sample 60 | batch = self.replay_buffer.sample() 61 | obs = batch["obs"].to(self.device) 62 | acts = batch["acts"].to(self.device) 63 | rews = batch["rews"].to(self.device) 64 | next_obs = batch["next_obs"].to(self.device) 65 | done = batch["done"].to(self.device) 66 | 67 | # compute policy Loss 68 | a, log_prob, _ = self.policy_net(obs) 69 | min_q = torch.min(self.q_net1(obs, a), self.q_net2(obs, a)).squeeze(1) 70 | policy_loss = (self.alpha * log_prob - min_q).mean() 71 | 72 | # compute Q Loss 73 | q1 = self.q_net1(obs, acts).squeeze(1) 74 | q2 = self.q_net2(obs, acts).squeeze(1) 75 | with torch.no_grad(): 76 | next_a, next_log_prob, _ = self.policy_net(next_obs) 77 | min_target_next_q = torch.min(self.target_q_net1(next_obs, next_a), self.target_q_net2(next_obs, next_a)).squeeze(1) 78 | y = rews + self.gamma * (1. - done) * (min_target_next_q - self.alpha * next_log_prob) 79 | 80 | q_loss1 = F.mse_loss(q1, y) 81 | q_loss2 = F.mse_loss(q2, y) 82 | 83 | # Update policy network parameter 84 | # policy network's update should be done before updating q network, or there will make some errors 85 | self.policy_optimizer.zero_grad() 86 | policy_loss.backward() 87 | self.policy_optimizer.step() 88 | 89 | # Update q network1 parameter 90 | self.q_optimizer1.zero_grad() 91 | q_loss1.backward() 92 | self.q_optimizer1.step() 93 | 94 | # Update q network2 parameter 95 | self.q_optimizer2.zero_grad() 96 | q_loss2.backward() 97 | self.q_optimizer2.step() 98 | 99 | if self.auto_alpha_tuning: 100 | alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 101 | self.alpha_optimizer.zero_grad() 102 | alpha_loss.backward() 103 | self.alpha_optimizer.step() 104 | 105 | self.alpha = self.log_alpha.exp() 106 | else: 107 | alpha_loss = torch.tensor(0) 108 | 109 | self.train_step += 1 110 | 111 | soft_target_update(self.q_net1, self.target_q_net1, tau=self.tau) 112 | soft_target_update(self.q_net2, self.target_q_net2, tau=self.tau) 113 | 114 | train_summaries = {"actor_loss": policy_loss.cpu().item(), 115 | "critic_loss1": q_loss1.cpu().item(), 116 | "critic_loss2": q_loss2.cpu().item(), 117 | "alpha_loss": alpha_loss.cpu().item()} 118 | 119 | return train_summaries 120 | 121 | def store_agent_checkpoint(self): 122 | checkpoint = { 123 | "q_net1": self.q_net1.state_dict(), 124 | "q_net2": self.q_net2.state_dict(), 125 | "policy_net": self.policy_net.state_dict(), 126 | "q_optimizer1": self.q_optimizer1.state_dict(), 127 | "q_optimizer2": self.q_optimizer2.state_dict(), 128 | "policy_optimizer": self.policy_optimizer.state_dict(), 129 | "train_step": self.train_step, 130 | "episode_num": self.episode_num 131 | 
} 132 | if self.auto_alpha_tuning: 133 | checkpoint["log_alpha"] = self.log_alpha 134 | checkpoint["alpha_optimizer"] = self.alpha_optimizer.state_dict() 135 | torch.save(checkpoint, self.checkpoint_path) 136 | 137 | def load_agent_checkpoint(self): 138 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 139 | self.q_net1.load_state_dict(checkpoint["q_net1"]) 140 | self.q_net2.load_state_dict(checkpoint["q_net2"]) 141 | self.policy_net.load_state_dict(checkpoint["policy_net"]) 142 | self.q_optimizer1.load_state_dict(checkpoint["q_optimizer1"]) 143 | self.q_optimizer2.load_state_dict(checkpoint["q_optimizer2"]) 144 | self.policy_optimizer.load_state_dict(checkpoint["policy_optimizer"]) 145 | self.train_step = checkpoint["train_step"] 146 | self.episode_num = checkpoint["episode_num"] 147 | if self.auto_alpha_tuning: 148 | self.log_alpha = checkpoint["log_alpha"] 149 | self.alpha_optimizer.load_state_dict(checkpoint["alpha_optimizer"]) 150 | print("load checkpoint from \"" + self.checkpoint_path + 151 | "\" at " + str(self.train_step) + " time step") 152 | -------------------------------------------------------------------------------- /algos/td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from algos.base import OffPolicyBase 6 | from utils.train_tools import soft_target_update 7 | 8 | 9 | class TD3_Agent(OffPolicyBase): 10 | """ 11 | Implementation of Twin Delayed Deep Deterministic policy gradient (TD3) 12 | https://arxiv.org/abs/1802.09477 13 | """ 14 | def __init__(self, 15 | actor_net: torch.nn.Module, 16 | critic_net1: torch.nn.Module, 17 | critic_net2: torch.nn.Module, 18 | actor_lr=1e-3, 19 | critic_lr=1e-3, 20 | tau=0.005, # used to update target network, w' = tau*w + (1-tau)*w' 21 | act_noise=0.1, # Std of Gaussian exploration noise 22 | policy_noise=0.2, # Noise added to target policy during critic update 23 | noise_clip=0.5, # Range to clip target policy noise 24 | policy_delay=2, # Frequency of delayed policy updates 25 | **kwargs 26 | ): 27 | super().__init__(**kwargs) 28 | 29 | self.action_num = self.env.action_space.shape[0] 30 | self.action_bound = self.env.action_space.high[0] 31 | 32 | # the network and optimizers 33 | self.actor_net = actor_net.to(self.device) 34 | self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device) 35 | self.critic_net1 = critic_net1.to(self.device) 36 | self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device) 37 | self.critic_net2 = critic_net2.to(self.device) 38 | self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device) 39 | 40 | self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr) 41 | self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr) 42 | self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr) 43 | 44 | self.tau = tau 45 | self.act_noise = act_noise 46 | self.policy_noise = policy_noise 47 | self.noise_clip = noise_clip 48 | self.policy_delay = policy_delay 49 | self.actor_loss = 0 50 | 51 | def choose_action(self, obs, eval=False): 52 | """Choose an action by deterministic policy with some gaussian noise""" 53 | obs = torch.FloatTensor(obs).reshape(1, -1).to(self.device) 54 | with torch.no_grad(): 55 | action = self.actor_net(obs).cpu().numpy().flatten() 56 | if eval: 57 | return action 58 | else: 59 | noise = 
np.random.normal(0, self.act_noise, size=self.action_num) 60 | return (action + noise).clip(-self.action_bound, self.action_bound) 61 | 62 | def train(self): 63 | 64 | # Sample 65 | batch = self.replay_buffer.sample() 66 | obs = batch["obs"].to(self.device) 67 | acts = batch["acts"].to(self.device) 68 | rews = batch["rews"].to(self.device) 69 | next_obs = batch["next_obs"].to(self.device) 70 | done = batch["done"].to(self.device) 71 | 72 | # Target Policy Smoothing. Add clipped noise to next actions when computing target Q. 73 | with torch.no_grad(): 74 | noise = torch.normal(mean=0, std=self.policy_noise, size=acts.size()).to(self.device) 75 | noise = noise.clamp(-self.noise_clip, self.noise_clip) 76 | next_act = self.target_actor_net(next_obs) + noise 77 | next_act = next_act.clamp(-self.action_bound, self.action_bound) 78 | 79 | # Clipped Double Q-Learning. Compute the min of target Q1 and target Q2 80 | min_target_q = torch.min(self.target_critic_net1(next_obs, next_act), 81 | self.target_critic_net2(next_obs, next_act)).squeeze(1) 82 | y = rews + self.gamma * (1. - done) * min_target_q 83 | 84 | current_q1 = self.critic_net1(obs, acts).squeeze(1) 85 | current_q2 = self.critic_net2(obs, acts).squeeze(1) 86 | 87 | # TD3 Loss 88 | critic_loss1 = F.mse_loss(current_q1, y) 89 | critic_loss2 = F.mse_loss(current_q2, y) 90 | 91 | # Optimize critic net 92 | self.critic_optimizer1.zero_grad() 93 | critic_loss1.backward() 94 | self.critic_optimizer1.step() 95 | 96 | self.critic_optimizer2.zero_grad() 97 | critic_loss2.backward() 98 | self.critic_optimizer2.step() 99 | 100 | if (self.train_step+1) % self.policy_delay == 0: 101 | # Compute actor loss 102 | actor_loss = -self.critic_net1(obs, self.actor_net(obs)).mean() 103 | # Optimize actor net 104 | self.actor_optimizer.zero_grad() 105 | actor_loss.backward() 106 | self.actor_optimizer.step() 107 | 108 | soft_target_update(self.actor_net, self.target_actor_net, tau=self.tau) 109 | soft_target_update(self.critic_net1, self.target_critic_net1, tau=self.tau) 110 | soft_target_update(self.critic_net2, self.target_critic_net2, tau=self.tau) 111 | else: 112 | actor_loss = torch.tensor(0) 113 | 114 | self.train_step += 1 115 | 116 | train_summaries = {"actor_loss": actor_loss.cpu().item(), 117 | "critic_loss1": critic_loss1.cpu().item(), 118 | "critic_loss2": critic_loss2.cpu().item()} 119 | 120 | return train_summaries 121 | 122 | def store_agent_checkpoint(self): 123 | checkpoint = { 124 | "actor_net": self.actor_net.state_dict(), 125 | "critic_net1": self.critic_net1.state_dict(), 126 | "critic_net2": self.critic_net2.state_dict(), 127 | "actor_optimizer": self.actor_optimizer.state_dict(), 128 | "critic_optimizer1": self.critic_optimizer1.state_dict(), 129 | "critic_optimizer2": self.critic_optimizer2.state_dict(), 130 | "train_step": self.train_step, 131 | "episode_num": self.episode_num 132 | } 133 | torch.save(checkpoint, self.checkpoint_path) 134 | 135 | def load_agent_checkpoint(self): 136 | checkpoint = torch.load(self.checkpoint_path, map_location=self.device) # can load gpu's data on cpu machine 137 | self.actor_net.load_state_dict(checkpoint["actor_net"]) 138 | self.target_actor_net.load_state_dict(checkpoint["actor_net"]) 139 | self.critic_net1.load_state_dict(checkpoint["critic_net1"]) 140 | self.target_critic_net1.load_state_dict(checkpoint["critic_net1"]) 141 | self.critic_net2.load_state_dict(checkpoint["critic_net2"]) 142 | self.target_critic_net2.load_state_dict(checkpoint["critic_net2"]) 143 | 
self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"]) 144 | self.critic_optimizer1.load_state_dict(checkpoint["critic_optimizer1"]) 145 | self.critic_optimizer2.load_state_dict(checkpoint["critic_optimizer2"]) 146 | self.train_step = checkpoint["train_step"] 147 | self.episode_num = checkpoint["episode_num"] 148 | print("load checkpoint from \"" + self.checkpoint_path + 149 | "\" at " + str(self.train_step) + " time step") -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/common/__init__.py -------------------------------------------------------------------------------- /common/buffers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | from typing import Sequence, Type, Optional, List, Union 5 | 6 | 7 | class ReplayBuffer: 8 | def __init__(self, obs_dim: Union[int, Sequence[int]], act_dim: int, capacity: int, batch_size: int): 9 | 10 | # Transfer the "int" observation dimension to "list" 11 | if isinstance(obs_dim, int): 12 | self.obs_dim = [obs_dim] 13 | else: 14 | self.obs_dim = list(obs_dim) 15 | 16 | self.act_dim = act_dim 17 | self.max_size = capacity 18 | self.batch_size = batch_size 19 | self.ptr = 0 # Point to the current position in the buffer 20 | self.crt_size = 0 # The current size of the buffer 21 | 22 | # Use numpy.ndarray to initialize the replay buffer 23 | self.obs = np.zeros(shape=[self.max_size] + self.obs_dim, dtype=np.float32) 24 | self.acts = np.zeros((self.max_size, self.act_dim), dtype=np.float32) 25 | self.rews = np.zeros(self.max_size, dtype=np.float32) 26 | self.next_obs = np.zeros(shape=[self.max_size] + self.obs_dim, dtype=np.float32) 27 | self.done = np.zeros(self.max_size, dtype=np.float32) 28 | 29 | def add(self, obs, act, rew, next_obs, done): 30 | self.obs[self.ptr] = obs 31 | self.acts[self.ptr] = act 32 | self.rews[self.ptr] = rew 33 | self.next_obs[self.ptr] = next_obs 34 | self.done[self.ptr] = float(done) 35 | 36 | self.ptr = (self.ptr + 1) % self.max_size 37 | self.crt_size = min(self.crt_size + 1, self.max_size) 38 | 39 | def sample(self): 40 | ind = np.random.choice(self.crt_size, size=self.batch_size, replace=True) # replace=False will make sample be slow 41 | return dict(obs=torch.FloatTensor(self.obs[ind]), 42 | acts=torch.FloatTensor(self.acts[ind]), 43 | rews=torch.FloatTensor(self.rews[ind]), # 1D 44 | next_obs=torch.FloatTensor(self.next_obs[ind]), 45 | done=torch.FloatTensor(self.done[ind])) # 1D 46 | 47 | 48 | class TrajectoryBuffer: 49 | """ 50 | Used to store experiences for a trajectory (e.g., in PPO) 51 | """ 52 | def __init__(self, obs_dim: Union[int, Sequence[int]], act_dim: int, capacity: int): 53 | 54 | # Transfer the "int" observation dimension to "list" 55 | if isinstance(obs_dim, int): 56 | self.obs_dim = [obs_dim] 57 | else: 58 | self.obs_dim = list(obs_dim) 59 | self.act_dim = act_dim 60 | self.max_size = capacity 61 | self.ptr = 0 # Point to the current position in the buffer 62 | 63 | # Use numpy.ndarray to initialize the replay buffer 64 | self.obs = np.zeros(shape=[self.max_size] + self.obs_dim, dtype=np.float32) 65 | self.acts = np.zeros((self.max_size, self.act_dim), dtype=np.float32) 66 | self.rews = np.zeros(self.max_size, dtype=np.float32) 67 | self.done = np.zeros(self.max_size, 
dtype=np.float32) 68 | self.log_probs = np.zeros(self.max_size, dtype=np.float32) # the log probability of choosing an action 69 | self.values = np.zeros(self.max_size + 1, dtype=np.float32) # the value of the state; values has length T+1, while the other arrays have length T 70 | self.rets = np.zeros(self.max_size, dtype=np.float32) # the return at time step t, also known as G_t 71 | self.gae_advs = np.zeros(self.max_size, dtype=np.float32) # the GAE advantage 72 | 73 | def add(self, obs, act, rew, done, log_prob, value): 74 | self.obs[self.ptr] = obs 75 | self.acts[self.ptr] = act 76 | self.rews[self.ptr] = rew 77 | self.done[self.ptr] = float(done) 78 | self.log_probs[self.ptr] = log_prob 79 | self.values[self.ptr] = value 80 | self.ptr += 1 81 | 82 | def finish_path(self, last_val=0, gamma=0.99, gae_lambda=0.95, gae_normalize=False): 83 | """ 84 | This method is called at the end of a trajectory 85 | """ 86 | self.values[-1] = last_val 87 | 88 | g = self.values[-1] 89 | gae_adv = 0 90 | for i in reversed(range(len(self.rews))): 91 | # compute G_t 92 | g = self.rews[i] + gamma * g * (1-self.done[i]) 93 | self.rets[i] = g 94 | # compute A_t 95 | delt = self.rews[i] + gamma * self.values[i + 1] * (1 - self.done[i]) - self.values[i] 96 | gae_adv = delt + gamma * gae_lambda * gae_adv * (1 - self.done[i]) 97 | self.gae_advs[i] = gae_adv 98 | 99 | if gae_normalize: 100 | self.gae_advs = (self.gae_advs - np.mean(self.gae_advs)) / np.std(self.gae_advs)  # normalize advantages to zero mean and unit std 101 | 102 | self.ptr = 0 103 | 104 | def sample(self): 105 | return dict(obs=torch.FloatTensor(self.obs), 106 | acts=torch.FloatTensor(self.acts), 107 | rews=torch.FloatTensor(self.rews), 108 | done=torch.FloatTensor(self.done), 109 | log_probs=torch.FloatTensor(self.log_probs), 110 | gae_advs=torch.FloatTensor(self.gae_advs), 111 | rets=torch.FloatTensor(self.rets)) 112 | 113 | 114 | class OfflineBuffer: 115 | """ 116 | Used in the offline setting 117 | """ 118 | def __init__(self, data: dict, batch_size: int): 119 | self.obs = data["obs"] 120 | self.acts = data["acts"] 121 | self.rews = data["rews"] 122 | self.next_obs = data["next_obs"] 123 | self.done = data["done"] 124 | 125 | self.data_num = self.acts.shape[0] 126 | self.batch_size = batch_size 127 | 128 | def sample(self) -> dict: 129 | ind = np.random.choice(self.data_num, size=self.batch_size, replace=True) # replace=False would make sampling slow 130 | return dict(obs=torch.FloatTensor(self.obs[ind]), 131 | acts=torch.FloatTensor(self.acts[ind]), 132 | rews=torch.FloatTensor(self.rews[ind]), # 1D 133 | next_obs=torch.FloatTensor(self.next_obs[ind]), 134 | done=torch.FloatTensor(self.done[ind])) # 1D 135 | 136 | 137 | class OfflineBufferAtari: 138 | """ 139 | Used in the offline setting 140 | """ 141 | def __init__(self, data: dict, batch_size: int): 142 | self.obs = data["obs"] # list 143 | self.acts = data["acts"] # ndarray 144 | self.rews = data["rews"] # ndarray 145 | self.done = data["done"] # ndarray 146 | 147 | self.data_num = self.acts.shape[0] 148 | self.batch_size = batch_size 149 | 150 | def sample(self) -> dict: 151 | ind = np.random.choice(self.data_num-1, size=self.batch_size, replace=True) # replace=False would make sampling slow 152 | obs = [self.obs[i] for i in ind] 153 | next_obs = [self.obs[i+1] for i in ind] 154 | return dict(obs=torch.FloatTensor(obs), 155 | acts=torch.FloatTensor(self.acts[ind]).reshape(-1, 1), 156 | rews=torch.FloatTensor(self.rews[ind]), # 1D 157 | next_obs=torch.FloatTensor(next_obs), 158 | done=torch.FloatTensor(self.done[ind])) # 1D 159 | 160 | 161 | class 
OfflineToOnlineBuffer: 162 | """ 163 | Used in offline to Online setting 164 | """ 165 | def __init__(self, data: dict, batch_size: int): 166 | self.obs = data["obs"] 167 | self.acts = data["acts"] 168 | self.rews = data["rews"] 169 | self.next_obs = data["next_obs"] 170 | self.done = data["done"] 171 | 172 | self.data_num = self.acts.shape[0] 173 | self.batch_size = batch_size 174 | self.max_size = self.data_num 175 | 176 | self.ptr = 0 # Point to the current position in the buffer 177 | self.crt_size = 0 # The current size of the buffer 178 | 179 | def add(self, obs, act, rew, next_obs, done): 180 | self.obs[self.ptr] = obs 181 | self.acts[self.ptr] = act 182 | self.rews[self.ptr] = rew 183 | self.next_obs[self.ptr] = next_obs 184 | self.done[self.ptr] = float(done) 185 | 186 | self.ptr = (self.ptr + 1) % self.max_size 187 | self.crt_size = min(self.crt_size + 1, self.max_size) 188 | 189 | def sample(self) -> dict: 190 | ind = np.random.choice(self.data_num, size=self.batch_size, replace=True) # replace=False will make sample be slow 191 | return dict(obs=torch.FloatTensor(self.obs[ind]), 192 | acts=torch.FloatTensor(self.acts[ind]), 193 | rews=torch.FloatTensor(self.rews[ind]), # 1D 194 | next_obs=torch.FloatTensor(self.next_obs[ind]), 195 | done=torch.FloatTensor(self.done[ind])) # 1D 196 | -------------------------------------------------------------------------------- /run/bcq_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.bcq import BCQ_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, CVAE, BCQ_Perturbation 13 | from utils import train_tools, data_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser(description='BCQ algorithm in mujoco environment') 18 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 19 | help='the name of environment') 20 | parser.add_argument('--batch_size', type=int, default=100, 21 | help='the size of batch that sampled from buffer') 22 | 23 | parser.add_argument('--max_train_step', type=int, default=1000000, 24 | help='the max train step') 25 | parser.add_argument('--log_interval', type=int, default=1000, 26 | help='The number of steps taken to record the model and the tensorboard') 27 | parser.add_argument('--train_id', type=str, default='bcq_mujoco_test', 28 | help='Path to save model and log tensorboard') 29 | parser.add_argument('--resume', action='store_true', default=False, 30 | help='whether load the last saved model to train') 31 | parser.add_argument('--device', type=str, default='cpu', 32 | help='Choose cpu or cuda') 33 | parser.add_argument('--show', action='store_true', default=False, 34 | help='show the trained model visually') 35 | parser.add_argument('--eval_freq', type=int, default=5000, 36 | help='how often (time steps) we evaluate') 37 | parser.add_argument('--seed', type=int, default=10, 38 | help='the random seed') 39 | 40 | args = parser.parse_args() 41 | 42 | torch.manual_seed(args.seed) 43 | np.random.seed(args.seed) 44 | 45 | # create environment 46 | env = gym.make(args.env) 47 | env.seed(args.seed) 48 | env.action_space.seed(args.seed) 49 | train_tools.EVAL_SEED = args.seed 50 | 51 | obs_dim = env.observation_space.shape[0] 52 | act_dim = 
env.action_space.shape[0] 53 | act_bound = env.action_space.high[0] 54 | 55 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 56 | hidden_activation=nn.ReLU) 57 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 58 | hidden_activation=nn.ReLU) 59 | 60 | perturbation_net = BCQ_Perturbation(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 61 | hidden_size=[400, 300], hidden_activation=nn.ReLU, 62 | phi=0.05) 63 | 64 | cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim, 65 | latent_dim=2 * act_dim, act_bound=act_bound) 66 | 67 | # create buffer 68 | if args.show: 69 | data_buffer = None 70 | else: 71 | data = data_tools.get_d4rl_dataset(env) 72 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 73 | 74 | agent = BCQ_Agent( 75 | # parameters of PolicyBase 76 | env=env, 77 | gamma=0.99, 78 | eval_freq=args.eval_freq, 79 | max_train_step=args.max_train_step, 80 | train_id=args.train_id, 81 | log_interval=args.log_interval, 82 | resume=args.resume, 83 | device=args.device, 84 | 85 | # Parameters of OfflineBase 86 | data_buffer=data_buffer, 87 | 88 | # Parameters of BCQ_Agent 89 | critic_net1=critic_net1, 90 | critic_net2=critic_net2, 91 | perturbation_net=perturbation_net, 92 | cvae_net=cvae_net, # generation model 93 | critic_lr=1e-3, 94 | per_lr=1e-3, 95 | cvae_lr=1e-3, 96 | tau=0.005, 97 | lmbda=0.75, # used for double clipped double q-learning 98 | ) 99 | 100 | if args.show: 101 | train_tools.evaluate(agent, 10, show=True) 102 | else: 103 | agent.learn() 104 | -------------------------------------------------------------------------------- /run/bear_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.bear import BEAR_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy, CVAE 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='CQL algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | 22 | # BEAR 23 | parser.add_argument('--mmd_sigma', type=float, default=20.0, 24 | help='the sigma used in mmd kernel') 25 | parser.add_argument('--kernel_type', type=str, default='gaussian', 26 | help='the type of mmd kernel(gaussian or laplacian)') 27 | parser.add_argument('--lagrange_thresh', type=float, default=0.05, 28 | help='the hyper-parameter used in automatic tuning alpha in cql loss') 29 | 30 | parser.add_argument('--max_train_step', type=int, default=1000000, 31 | help='the max train step') 32 | parser.add_argument('--log_interval', type=int, default=1000, 33 | help='The number of steps taken to record the model and the tensorboard') 34 | parser.add_argument('--train_id', type=str, default='bear_hopper-mujoco_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--resume', action='store_true', default=False, 37 | help='whether load the last saved model to train') 38 | parser.add_argument('--device', type=str, default='cpu', 39 | help='Choose cpu 
or cuda') 40 | parser.add_argument('--show', action='store_true', default=False, 41 | help='show the trained model visually') 42 | parser.add_argument('--eval_freq', type=int, default=5000, 43 | help='how often (time steps) we evaluate') 44 | parser.add_argument('--seed', type=int, default=10, 45 | help='the random seed') 46 | 47 | args = parser.parse_args() 48 | 49 | torch.manual_seed(args.seed) 50 | np.random.seed(args.seed) 51 | 52 | # create environment 53 | env = gym.make(args.env) 54 | env.seed(args.seed) 55 | env.action_space.seed(args.seed) 56 | train_tools.EVAL_SEED = args.seed 57 | 58 | obs_dim = env.observation_space.shape[0] 59 | act_dim = env.action_space.shape[0] 60 | act_bound = env.action_space.high[0] 61 | 62 | # create nets 63 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 64 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 65 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 66 | hidden_activation=nn.ReLU) 67 | 68 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 69 | hidden_activation=nn.ReLU) 70 | 71 | cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim, 72 | latent_dim=2 * act_dim, act_bound=act_bound) 73 | 74 | # create buffer 75 | if args.show: 76 | data_buffer = None 77 | else: 78 | data = data_tools.get_d4rl_dataset(env) 79 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 80 | 81 | agent = BEAR_Agent( 82 | # parameters of PolicyBase 83 | env=env, 84 | gamma=0.99, 85 | eval_freq=args.eval_freq, 86 | max_train_step=args.max_train_step, 87 | train_id=args.train_id, 88 | log_interval=args.log_interval, 89 | resume=args.resume, 90 | device=args.device, 91 | 92 | # Parameters of OfflineBase 93 | data_buffer=data_buffer, 94 | 95 | # Parameters of BEAR_Agent 96 | policy_net=policy_net, 97 | q_net1=q_net1, 98 | q_net2=q_net2, 99 | cvae_net=cvae_net, 100 | policy_lr=1e-4, 101 | qf_lr=3e-4, 102 | cvae_lr=3e-4, 103 | tau=0.05, 104 | lmbda=0.75, 105 | mmd_sigma=args.mmd_sigma, 106 | kernel_type=args.kernel_type, 107 | lagrange_thresh=args.lagrange_thresh, 108 | n_action_samples=100, 109 | n_target_samples=10, 110 | n_mmd_action_samples=4, 111 | warmup_step=40000, 112 | ) 113 | if args.show: 114 | train_tools.evaluate(agent, 10, show=True) 115 | else: 116 | agent.learn() 117 | -------------------------------------------------------------------------------- /run/cql_atari.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import numpy as np 9 | from algos.offline.cql import DiscreteCQL_Agent 10 | from common.buffers import OfflineBufferAtari 11 | from common.networks import ConvAtariQsNet 12 | from utils import train_tools 13 | from utils import data_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | parser = argparse.ArgumentParser(description='DiscreteCQL algorithm in atari environment') 19 | parser.add_argument('--env', type=str, default='pong-mixed-v0', 20 | help='the name of environment') 21 | parser.add_argument('--batch_size', type=int, default=32, 22 | help='the size of batch that sampled from buffer') 23 | 24 | parser.add_argument('--min_q_weight', type=float, default=5.0, 25 | help='the value of alpha, set to 5.0 or 10.0 if not using lagrange') 26 | 27 | parser.add_argument('--max_train_step', type=int, default=2000000, 28 | 
help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='cql_atari_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--eval_freq', type=int, default=5000, 40 | help='how often (time steps) we evaluate') 41 | parser.add_argument('--seed', type=int, default=10, 42 | help='the random seed') 43 | 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = gym.make(args.env, stack=True) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape 55 | act_dim = env.action_space.n 56 | 57 | Q_net = ConvAtariQsNet(num_frames_stack=4, act_dim=act_dim) 58 | 59 | # create buffer 60 | if args.show: 61 | data_buffer = None 62 | else: 63 | data = data_tools.get_d4rl_dataset_atari(env) 64 | data_buffer = OfflineBufferAtari(data=data, batch_size=args.batch_size) 65 | 66 | agent = DiscreteCQL_Agent( 67 | # parameters of PolicyBase 68 | env=env, 69 | gamma=0.99, 70 | eval_freq=args.eval_freq, 71 | max_train_step=args.max_train_step, 72 | train_id=args.train_id, 73 | log_interval=args.log_interval, 74 | resume=args.resume, 75 | device=args.device, 76 | 77 | # Parameters of OfflineBase 78 | data_buffer=data_buffer, 79 | 80 | # Parameters of CQL_Agent 81 | Q_net=Q_net, 82 | qf_lr=1e-4, 83 | eval_eps=0.001, 84 | target_update_freq=8000, 85 | min_q_weight=args.min_q_weight, # the value of alpha in CQL loss, set to 5.0 or 10.0 if not using lagrange 86 | ) 87 | 88 | if args.show: 89 | train_tools.evaluate(agent, 10, show=True) 90 | else: 91 | agent.learn() 92 | -------------------------------------------------------------------------------- /run/cql_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.cql import CQL_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='CQL algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 22 | help='whether automatic tune alpha') 23 | 24 | # CQL 25 | parser.add_argument('--min_q_weight', type=float, default=5.0, 26 | help='the value of alpha, set to 5.0 or 10.0 if not using lagrange') 27 | parser.add_argument('--entropy_backup', action='store_true', default=False, 28 | help='whether use sac style target Q with entropy') 29 | 
parser.add_argument('--max_q_backup', action='store_true', default=False, 30 | help='whether use max q backup') 31 | parser.add_argument('--with_lagrange', action='store_true', default=False, 32 | help='whether auto tune alpha in Conservative Q Loss(different from the alpha in sac)') 33 | parser.add_argument('--lagrange_thresh', type=float, default=5.0, 34 | help='the hyper-parameter used in automatic tuning alpha in cql loss') 35 | 36 | parser.add_argument('--max_train_step', type=int, default=1000000, 37 | help='the max train step') 38 | parser.add_argument('--log_interval', type=int, default=1000, 39 | help='The number of steps taken to record the model and the tensorboard') 40 | parser.add_argument('--train_id', type=str, default='cql_hopper-mujoco_test', 41 | help='Path to save model and log tensorboard') 42 | parser.add_argument('--resume', action='store_true', default=False, 43 | help='whether load the last saved model to train') 44 | parser.add_argument('--device', type=str, default='cpu', 45 | help='Choose cpu or cuda') 46 | parser.add_argument('--show', action='store_true', default=False, 47 | help='show the trained model visually') 48 | parser.add_argument('--eval_freq', type=int, default=5000, 49 | help='how often (time steps) we evaluate') 50 | parser.add_argument('--seed', type=int, default=10, 51 | help='the random seed') 52 | 53 | args = parser.parse_args() 54 | 55 | torch.manual_seed(args.seed) 56 | np.random.seed(args.seed) 57 | 58 | # create environment 59 | env = gym.make(args.env) 60 | env.seed(args.seed) 61 | env.action_space.seed(args.seed) 62 | train_tools.EVAL_SEED = args.seed 63 | 64 | obs_dim = env.observation_space.shape[0] 65 | act_dim = env.action_space.shape[0] 66 | act_bound = env.action_space.high[0] 67 | 68 | # create nets 69 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 70 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 71 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 72 | hidden_activation=nn.ReLU) 73 | 74 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 75 | hidden_activation=nn.ReLU) 76 | 77 | # create buffer 78 | if args.show: 79 | data_buffer = None 80 | else: 81 | data = data_tools.get_d4rl_dataset(env) 82 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 83 | 84 | agent = CQL_Agent( 85 | # parameters of PolicyBase 86 | env=env, 87 | gamma=0.99, 88 | eval_freq=args.eval_freq, 89 | max_train_step=args.max_train_step, 90 | train_id=args.train_id, 91 | log_interval=args.log_interval, 92 | resume=args.resume, 93 | device=args.device, 94 | 95 | # Parameters of OfflineBase 96 | data_buffer=data_buffer, 97 | 98 | # Parameters of CQL_Agent 99 | policy_net=policy_net, 100 | q_net1=q_net1, 101 | q_net2=q_net2, 102 | policy_lr=1e-4, 103 | qf_lr=3e-4, 104 | tau=0.05, 105 | alpha=0.5, 106 | auto_alpha_tuning=args.auto_alpha_tuning, 107 | min_q_weight=args.min_q_weight, 108 | entropy_backup=args.entropy_backup, 109 | max_q_backup=args.max_q_backup, 110 | with_lagrange=args.with_lagrange, 111 | lagrange_thresh=args.lagrange_thresh, 112 | n_action_samples=10, 113 | ) 114 | 115 | if args.show: 116 | train_tools.evaluate(agent, 10, show=True) 117 | else: 118 | agent.learn() 119 | -------------------------------------------------------------------------------- /run/ddpg_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.ddpg import DDPG_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='DDPG algorithm in gym environment') 17 | parser.add_argument('--env', type=str, default='Pendulum-v0', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=50000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=64, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--explore_step', type=int, default=2000, 24 | help='the steps of exploration before train') 25 | parser.add_argument('--eval_freq', type=int, default=1000, 26 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 27 | parser.add_argument('--max_train_step', type=int, default=100000, 28 | help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='ddpg_gym_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--seed', type=int, default=10, 40 | help='the random seed') 41 | 42 | args = parser.parse_args() 43 | 44 | torch.manual_seed(args.seed) 45 | np.random.seed(args.seed) 46 | 47 | # create environment 48 | env = gym.make(args.env) 49 | env.seed(args.seed) 50 | env.action_space.seed(args.seed) 51 | train_tools.EVAL_SEED = args.seed 52 | 53 | obs_dim = env.observation_space.shape[0] 54 | act_dim = env.action_space.shape[0] 55 | act_bound = env.action_space.high[0] 56 | 57 | # create nets 58 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 59 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 60 | 61 | critic_net = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 62 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 63 | 64 | # create buffer 65 | if args.show: 66 | replay_buffer = None 67 | else: 68 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 69 | act_dim=act_dim, 70 | capacity=args.capacity, 71 | batch_size=args.batch_size) 72 | 73 | agent = DDPG_Agent( 74 | # parameters of PolicyBase 75 | env=env, 76 | gamma=0.99, 77 | eval_freq=args.eval_freq, 78 | max_train_step=args.max_train_step, 79 | train_id=args.train_id, 80 | log_interval=args.log_interval, 81 | resume=args.resume, 82 | device=args.device, 83 | 84 | # Parameters of OffPolicyBase 85 | replay_buffer=replay_buffer, 86 | explore_step=args.explore_step, 87 | 88 | # Parameters of DDPG_Agent 89 | actor_net=actor_net, critic_net=critic_net, 90 | actor_lr=1e-4, critic_lr=1e-3, 91 | tau=0.005, 92 | gaussian_noise_sigma=0.1, 93 | ) 94 | 95 | if args.show: 96 | train_tools.evaluate(agent, 10, show=True) 97 | else: 98 | agent.learn() 99 | 
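The TD3, TD3-BC, SAC, and offline SAC agents above all refresh their target networks via `soft_target_update` imported from `utils/train_tools.py`, whose body is not part of this listing. Judging only from how it is called and from the inline comment `w' = tau*w + (1-tau)*w'`, a minimal Polyak-averaging sketch of such a helper might look like the following (the actual implementation in `utils/train_tools.py` may differ in detail):

```python
import torch


def soft_target_update(net: torch.nn.Module, target_net: torch.nn.Module, tau: float) -> None:
    """Polyak averaging: target_param <- tau * param + (1 - tau) * target_param."""
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * param.data)
```

With a small tau (0.005 in the TD3 agents, 0.05 in the SAC agents above), the target networks trail the online networks slowly, which keeps the bootstrapped Q targets stable.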
-------------------------------------------------------------------------------- /run/ddpg_unity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from algos.ddpg import DDPG_Agent 10 | from common.buffers import ReplayBuffer 11 | from common.networks import MLPQsaNet, DDPGMLPActor 12 | from utils import train_tools 13 | from mlagents_envs.environment import UnityEnvironment 14 | from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel 15 | from gym_unity.envs import UnityToGymWrapper 16 | 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='DDPG algorithm in gym environment') 20 | parser.add_argument('--env', type=str, default=None, 21 | help='the path of unity environment') 22 | parser.add_argument('--capacity', type=int, default=50000, 23 | help='the max size of data buffer') 24 | parser.add_argument('--batch_size', type=int, default=64, 25 | help='the size of batch that sampled from buffer') 26 | parser.add_argument('--explore_step', type=int, default=2000, 27 | help='the steps of exploration before train') 28 | # parser.add_argument('--eval_freq', type=int, default=1000, 29 | # help='how often (time steps) we evaluate') 30 | parser.add_argument('--max_train_step', type=int, default=100000, 31 | help='the max train step') 32 | parser.add_argument('--log_interval', type=int, default=1000, 33 | help='The number of steps taken to record the model and the tensorboard') 34 | parser.add_argument('--train_id', type=str, default='ddpg_unity_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--resume', action='store_true', default=False, 37 | help='whether load the last saved model to train') 38 | parser.add_argument('--device', type=str, default='cpu', 39 | help='Choose cpu or cuda') 40 | parser.add_argument('--show', action='store_true', default=False, 41 | help='show the trained model visually') 42 | parser.add_argument('--seed', type=int, default=10, 43 | help='the random seed') 44 | 45 | args = parser.parse_args() 46 | 47 | torch.manual_seed(args.seed) 48 | np.random.seed(args.seed) 49 | 50 | engine_configuration_channel = EngineConfigurationChannel() 51 | unity_env = UnityEnvironment(side_channels=[engine_configuration_channel], file_name=args.env) 52 | engine_configuration_channel.set_configuration_parameters( 53 | width=200, 54 | height=200, 55 | quality_level=5, 56 | time_scale=1 if args.show else 20, 57 | target_frame_rate=-1, 58 | capture_frame_rate=60) 59 | 60 | env = UnityToGymWrapper(unity_env=unity_env) 61 | env.seed(args.seed) 62 | env.action_space.seed(args.seed) 63 | train_tools.EVAL_SEED = args.seed 64 | 65 | obs_dim = env.observation_space.shape[0] 66 | act_dim = env.action_space.shape[0] 67 | act_bound = env.action_space.high[0] 68 | 69 | # create nets 70 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 71 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 72 | 73 | critic_net = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 74 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 75 | 76 | # create buffer 77 | if args.show: 78 | replay_buffer = None 79 | else: 80 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 81 | act_dim=act_dim, 82 | capacity=args.capacity, 83 | batch_size=args.batch_size) 84 | 85 | 
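    # NOTE: an assumed patch, not part of the original script. The '--eval_freq'
    # argument is commented out in the argparse block above, yet DDPG_Agent below
    # receives eval_freq=args.eval_freq, which would raise AttributeError as written.
    # The gym runner scripts describe a negative eval_freq as "do not evaluate",
    # so fall back to that behaviour when the flag is absent:
    if not hasattr(args, 'eval_freq'):
        args.eval_freq = -1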
agent = DDPG_Agent( 86 | # parameters of PolicyBase 87 | env=env, 88 | gamma=0.99, 89 | eval_freq=args.eval_freq, 90 | max_train_step=args.max_train_step, 91 | train_id=args.train_id, 92 | log_interval=args.log_interval, 93 | resume=args.resume, 94 | device=args.device, 95 | 96 | # Parameters of OffPolicyBase 97 | replay_buffer=replay_buffer, 98 | explore_step=args.explore_step, 99 | 100 | # Parameters of DDPG_Agent 101 | actor_net=actor_net, critic_net=critic_net, 102 | actor_lr=1e-4, critic_lr=1e-3, 103 | tau=0.005, 104 | gaussian_noise_sigma=0.1, 105 | ) 106 | 107 | if args.show: 108 | train_tools.evaluate_unity(agent, 10) 109 | else: 110 | agent.learn() 111 | -------------------------------------------------------------------------------- /run/ddqn_atari.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import numpy as np 8 | from algos.ddqn import DDQN_Agent 9 | from common.buffers import ReplayBuffer 10 | from common.networks import ConvAtariQsNet 11 | from utils import train_tools 12 | from utils.atari_wrappers import make_atari_env 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | parser = argparse.ArgumentParser(description='DDQN algorithm in atari environment') 18 | parser.add_argument('--env', type=str, default='PongNoFrameskip-v4', 19 | help='the name of environment') 20 | parser.add_argument('--capacity', type=int, default=100000, 21 | help='the max size of data buffer') 22 | parser.add_argument('--batch_size', type=int, default=32, 23 | help='the size of batch that sampled from buffer') 24 | parser.add_argument('--explore_step', type=int, default=20000, 25 | help='the steps of exploration before train') 26 | parser.add_argument('--eval_freq', type=int, default=10000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=2000000, 29 | help='the max train step') 30 | parser.add_argument('--log_interval', type=int, default=1000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--resume', action='store_true', default=False, 33 | help='whether load the last saved model to train') 34 | parser.add_argument('--train_id', type=str, default='ddqn_atari_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | parser.add_argument('--scale_obs', action='store_true', default=False, 43 | help='whether scale the obs to 0-1') 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = make_atari_env(args.env, scale_obs=args.scale_obs) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape 55 | act_dim = env.action_space.n 56 | 57 | Q_net = ConvAtariQsNet(num_frames_stack=4, act_dim=act_dim) 58 | 59 | # create buffer 60 | if args.show: 61 | replay_buffer = None 62 | else: 63 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 64 | capacity=args.capacity, batch_size=args.batch_size) 65 | 
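    # Illustrative sketch only -- algos/ddqn.py is not shown in this listing, and this
    # helper is not used by the script. Double DQN builds its bootstrap target by
    # selecting the next action with the online network and evaluating it with the
    # target network:
    #     y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))
    def double_dqn_target(q_net, target_q_net, rews, next_obs, done, gamma=0.99):
        with torch.no_grad():
            next_acts = q_net(next_obs).argmax(dim=1, keepdim=True)          # action selection: online net
            next_q = target_q_net(next_obs).gather(1, next_acts).squeeze(1)  # action evaluation: target net
            return rews + gamma * (1. - done) * next_q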
66 | agent = DDQN_Agent( 67 | # parameters of PolicyBase 68 | env=env, 69 | gamma=0.99, 70 | eval_freq=args.eval_freq, 71 | max_train_step=args.max_train_step, 72 | train_id=args.train_id, 73 | log_interval=args.log_interval, 74 | resume=args.resume, 75 | device=args.device, 76 | 77 | # Parameters of OffPolicyBase 78 | replay_buffer=replay_buffer, 79 | explore_step=args.explore_step, 80 | 81 | # Parameters of DDQN_Agent 82 | Q_net=Q_net, 83 | qf_lr=1e-4, 84 | initial_eps=0.1, 85 | end_eps=0.001, 86 | eps_decay_period=1000000, 87 | eval_eps=0.001, 88 | target_update_freq=1000, 89 | ) 90 | 91 | if args.show: 92 | train_tools.evaluate(agent, 10, show=True) 93 | else: 94 | agent.learn() 95 | 96 | -------------------------------------------------------------------------------- /run/ddqn_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.ddqn import DDQN_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsNet 13 | from utils import train_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | parser = argparse.ArgumentParser(description='DDQN algorithm in gym environment') 19 | parser.add_argument('--env', type=str, default='CartPole-v0', 20 | help='the name of environment') 21 | parser.add_argument('--capacity', type=int, default=5000, 22 | help='the max size of data buffer') 23 | parser.add_argument('--batch_size', type=int, default=128, 24 | help='the size of batch that sampled from buffer') 25 | parser.add_argument('--explore_step', type=int, default=500, 26 | help='the steps of exploration before train') 27 | parser.add_argument('--eval_freq', type=int, default=1000, 28 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 29 | parser.add_argument('--max_train_step', type=int, default=10000, 30 | help='the max train step') 31 | parser.add_argument('--log_interval', type=int, default=500, 32 | help='The number of steps taken to record the model and the tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--train_id', type=str, default='ddqn_gym_test', 36 | help='Path to save model and log tensorboard') 37 | parser.add_argument('--device', type=str, default='cpu', 38 | help='Choose cpu or cuda') 39 | parser.add_argument('--show', action='store_true', default=False, 40 | help='show the trained model visually') 41 | parser.add_argument('--seed', type=int, default=10, 42 | help='the random seed') 43 | 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | act_dim = env.action_space.n 56 | 57 | Q_net = MLPQsNet(obs_dim=obs_dim, act_dim=act_dim, 58 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 59 | 60 | # create buffer 61 | if args.show: 62 | replay_buffer = None 63 | else: 64 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 65 | capacity=args.capacity, batch_size=args.batch_size) 66 | 67 | agent = DDQN_Agent( 68 | # parameters of PolicyBase 69 | env=env, 70 | gamma=0.99, 71 | 
eval_freq=args.eval_freq, 72 | max_train_step=args.max_train_step, 73 | train_id=args.train_id, 74 | log_interval=args.log_interval, 75 | resume=args.resume, 76 | device=args.device, 77 | 78 | # Parameters of OffPolicyBase 79 | replay_buffer=replay_buffer, 80 | explore_step=args.explore_step, 81 | 82 | # Parameters of DDQN_Agent 83 | Q_net=Q_net, 84 | qf_lr=0.001, 85 | initial_eps=0.1, 86 | end_eps=0.001, 87 | eps_decay_period=2000, 88 | eval_eps=0.001, 89 | target_update_freq=10, 90 | ) 91 | 92 | if args.show: 93 | train_tools.evaluate(agent, 10, show=True) 94 | else: 95 | agent.learn() 96 | 97 | -------------------------------------------------------------------------------- /run/dqn_atari.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import numpy as np 8 | from algos.dqn import DQN_Agent 9 | from common.buffers import ReplayBuffer 10 | from common.networks import ConvAtariQsNet 11 | from utils import train_tools 12 | from utils.atari_wrappers import make_atari_env 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | parser = argparse.ArgumentParser(description='DQN algorithm in atari environment') 18 | parser.add_argument('--env', type=str, default='PongNoFrameskip-v4', 19 | help='the name of environment') 20 | parser.add_argument('--capacity', type=int, default=100000, 21 | help='the max size of data buffer') 22 | parser.add_argument('--batch_size', type=int, default=32, 23 | help='the size of batch that sampled from buffer') 24 | parser.add_argument('--explore_step', type=int, default=20000, 25 | help='the steps of exploration before train') 26 | parser.add_argument('--eval_freq', type=int, default=10000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=2000000, 29 | help='the max train step') 30 | parser.add_argument('--log_interval', type=int, default=1000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--resume', action='store_true', default=False, 33 | help='whether load the last saved model to train') 34 | parser.add_argument('--train_id', type=str, default='dqn_atari_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | parser.add_argument('--scale_obs', action='store_true', default=False, 43 | help='whether scale the obs to 0-1') 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = make_atari_env(args.env, scale_obs=args.scale_obs) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape 55 | act_dim = env.action_space.n 56 | 57 | Q_net = ConvAtariQsNet(num_frames_stack=4, act_dim=act_dim) 58 | 59 | # create buffer 60 | if args.show: 61 | replay_buffer = None 62 | else: 63 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 64 | capacity=args.capacity, batch_size=args.batch_size) 65 | 66 | agent = DQN_Agent( 67 | # parameters of PolicyBase 68 | env=env, 
69 | gamma=0.99, 70 | eval_freq=args.eval_freq, 71 | max_train_step=args.max_train_step, 72 | train_id=args.train_id, 73 | log_interval=args.log_interval, 74 | resume=args.resume, 75 | device=args.device, 76 | 77 | # Parameters of OffPolicyBase 78 | replay_buffer=replay_buffer, 79 | explore_step=args.explore_step, 80 | 81 | # Parameters of DQN_Agent 82 | Q_net=Q_net, 83 | qf_lr=1e-4, 84 | initial_eps=0.1, 85 | end_eps=0.001, 86 | eps_decay_period=1000000, 87 | eval_eps=0.001, 88 | target_update_freq=1000, 89 | ) 90 | 91 | if args.show: 92 | train_tools.evaluate(agent, 10, show=True) 93 | else: 94 | agent.learn() 95 | -------------------------------------------------------------------------------- /run/dqn_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.dqn import DQN_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsNet 13 | from utils import train_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | parser = argparse.ArgumentParser(description='DQN algorithm in gym environment') 19 | parser.add_argument('--env', type=str, default='CartPole-v0', 20 | help='the name of environment') 21 | parser.add_argument('--capacity', type=int, default=5000, 22 | help='the max size of data buffer') 23 | parser.add_argument('--batch_size', type=int, default=128, 24 | help='the size of batch that sampled from buffer') 25 | parser.add_argument('--explore_step', type=int, default=500, 26 | help='the steps of exploration before train') 27 | parser.add_argument('--eval_freq', type=int, default=1000, 28 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 29 | parser.add_argument('--max_train_step', type=int, default=10000, 30 | help='the max train step') 31 | parser.add_argument('--log_interval', type=int, default=500, 32 | help='The number of steps taken to record the model and the tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--train_id', type=str, default='dqn_gym_test', 36 | help='Path to save model and log tensorboard') 37 | parser.add_argument('--device', type=str, default='cpu', 38 | help='Choose cpu or cuda') 39 | parser.add_argument('--show', action='store_true', default=False, 40 | help='show the trained model visually') 41 | parser.add_argument('--seed', type=int, default=10, 42 | help='the random seed') 43 | 44 | args = parser.parse_args() 45 | 46 | torch.manual_seed(args.seed) 47 | np.random.seed(args.seed) 48 | 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | act_dim = env.action_space.n 56 | 57 | Q_net = MLPQsNet(obs_dim=obs_dim, act_dim=act_dim, 58 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 59 | 60 | # create buffer 61 | if args.show: 62 | replay_buffer = None 63 | else: 64 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=1, 65 | capacity=args.capacity, batch_size=args.batch_size) 66 | 67 | agent = DQN_Agent( 68 | # parameters of PolicyBase 69 | env=env, 70 | gamma=0.99, 71 | eval_freq=args.eval_freq, 72 | max_train_step=args.max_train_step, 73 | 
train_id=args.train_id, 74 | log_interval=args.log_interval, 75 | resume=args.resume, 76 | device=args.device, 77 | 78 | # Parameters of OffPolicyBase 79 | replay_buffer=replay_buffer, 80 | explore_step=args.explore_step, 81 | 82 | # Parameters of DQN_Agent 83 | Q_net=Q_net, 84 | qf_lr=0.001, 85 | initial_eps=0.1, 86 | end_eps=0.001, 87 | eps_decay_period=2000, 88 | eval_eps=0.001, 89 | target_update_freq=10, 90 | ) 91 | 92 | if args.show: 93 | train_tools.evaluate(agent, 10, show=True) 94 | else: 95 | agent.learn() 96 | 97 | -------------------------------------------------------------------------------- /run/plas_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.plas import PLAS_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, CVAE, PLAS_Actor, DDPGMLPActor 13 | from utils import train_tools, data_tools 14 | 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser(description='PLAS algorithm in mujoco environment') 18 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 19 | help='the name of environment') 20 | parser.add_argument('--batch_size', type=int, default=100, 21 | help='the size of batch that sampled from buffer') 22 | parser.add_argument('--max_train_step', type=int, default=500000, 23 | help='the max train step') 24 | parser.add_argument('--max_cvae_iterations', type=int, default=500000, 25 | help='the num of iterations when training CVAE model') 26 | parser.add_argument('--use_ptb', action='store_true', default=False, 27 | help='whether use perturbation layer') 28 | parser.add_argument('--log_interval', type=int, default=1000, 29 | help='The number of steps taken to record the model and the tensorboard') 30 | parser.add_argument('--train_id', type=str, default='plas_mujoco_test', 31 | help='Path to save model and log tensorboard') 32 | parser.add_argument('--resume', action='store_true', default=False, 33 | help='whether load the last saved model to train') 34 | parser.add_argument('--device', type=str, default='cpu', 35 | help='Choose cpu or cuda') 36 | parser.add_argument('--show', action='store_true', default=False, 37 | help='show the trained model visually') 38 | parser.add_argument('--eval_freq', type=int, default=5000, 39 | help='how often (time steps) we evaluate') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | 43 | args = parser.parse_args() 44 | 45 | torch.manual_seed(args.seed) 46 | np.random.seed(args.seed) 47 | 48 | # create environment 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | act_dim = env.action_space.shape[0] 56 | act_bound = env.action_space.high[0] 57 | 58 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 59 | hidden_activation=nn.ReLU) 60 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[400, 300], 61 | hidden_activation=nn.ReLU) 62 | 63 | cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim, 64 | latent_dim=2 * act_dim, act_bound=act_bound) 65 | 66 | actor_net = PLAS_Actor(obs_dim=obs_dim, act_dim=act_dim, latent_act_dim=2 * act_dim, 67 | act_bound=act_bound, 
latent_act_bound=2, 68 | actor_hidden_size=[400, 300], ptb_hidden_size=[400, 300], hidden_activation=nn.ReLU, 69 | use_ptb=args.use_ptb, phi=0.05) 70 | 71 | # create buffer 72 | if args.show: 73 | data_buffer = None 74 | else: 75 | data = data_tools.get_d4rl_dataset(env) 76 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 77 | 78 | agent = PLAS_Agent( 79 | # parameters of PolicyBase 80 | env=env, 81 | gamma=0.99, 82 | eval_freq=args.eval_freq, 83 | max_train_step=args.max_train_step, 84 | train_id=args.train_id, 85 | log_interval=args.log_interval, 86 | resume=args.resume, 87 | device=args.device, 88 | 89 | # Parameters of OfflineBase 90 | data_buffer=data_buffer, 91 | 92 | # Parameters of PLAS_Agent 93 | critic_net1=critic_net1, 94 | critic_net2=critic_net2, 95 | actor_net=actor_net, 96 | cvae_net=cvae_net, # generation model 97 | critic_lr=1e-3, 98 | actor_lr=1e-4, 99 | cvae_lr=1e-4, 100 | tau=0.005, 101 | lmbda=1, # used for double clipped double q-learning 102 | max_cvae_iterations=args.max_cvae_iterations, 103 | ) 104 | 105 | if args.show: 106 | train_tools.evaluate(agent, 10, show=True) 107 | else: 108 | agent.learn() 109 | -------------------------------------------------------------------------------- /run/ppo_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | # sys.path.append(str(Path(__file__).absolute().parent.parent)) 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | import argparse 8 | import gym 9 | from gym.spaces import Box, Discrete 10 | import torch 11 | import torch.nn as nn 12 | import numpy as np 13 | from algos.ppo import PPO_Agent 14 | from common.buffers import TrajectoryBuffer 15 | from common.networks import MLPVsNet, MLPCategoricalActor, MLPGaussianActor 16 | from utils import train_tools 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='PPO algorithm in gym environment') 20 | parser.add_argument('--env', type=str, default='CartPole-v0', 21 | help='the name of environment') 22 | parser.add_argument('--gae_norm', action='store_true', default=False, 23 | help='whether normalize the GAE') 24 | parser.add_argument('--traj_length', type=int, default=128, 25 | help='the length of trajectory') 26 | parser.add_argument('--eval_freq', type=int, default=1000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=100000, 29 | help='the max time step to train') 30 | parser.add_argument('--log_interval', type=int, default=1000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--train_id', type=str, default='ppo_gym_test', 33 | help='Path to save model and log tensorboard') 34 | parser.add_argument('--resume', action='store_true', default=False, 35 | help='whether load the last saved model to train') 36 | parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | 43 | args = parser.parse_args() 44 | 45 | torch.manual_seed(args.seed) 46 | np.random.seed(args.seed) 47 | 48 | # create environment 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 
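# --- Editor's illustrative sketch (not part of the original ppo_gym.py) ---
# The PPO_Agent created further down is configured with gae_lambda=0.95 and gae_normalize.
# Generalized Advantage Estimation is typically computed by a backward recursion over the
# collected trajectory, roughly as below; the actual computation lives in algos/ppo.py /
# common/buffers.py and may differ in detail.
def gae_sketch(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """Backward recursion: A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}."""
    advantages = [0.0] * len(rewards)
    next_adv, next_value = 0.0, last_value
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * (1.0 - dones[t]) * next_value - values[t]
        next_adv = delta + gamma * lam * (1.0 - dones[t]) * next_adv
        advantages[t] = next_adv
        next_value = values[t]
    return advantages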
52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | 56 | # create nets 57 | if isinstance(env.action_space, Discrete): 58 | act_num = env.action_space.n 59 | buffer_act_dim = 1 60 | actor_net = MLPCategoricalActor(obs_dim=obs_dim, act_num=act_num, 61 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 62 | elif isinstance(env.action_space, Box): 63 | act_dim = env.action_space.shape[0] 64 | buffer_act_dim = act_dim 65 | actor_net = MLPGaussianActor(obs_dim=obs_dim, act_dim=act_dim, 66 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 67 | 68 | critic_net = MLPVsNet(obs_dim=obs_dim, hidden_size=[64, 64], hidden_activation=nn.Tanh) 69 | 70 | 71 | # create buffer 72 | if args.show: 73 | trajectory_buffer = None 74 | else: 75 | trajectory_buffer = TrajectoryBuffer(obs_dim=obs_dim, 76 | act_dim=buffer_act_dim, 77 | capacity=args.traj_length) 78 | 79 | agent = PPO_Agent( 80 | # parameters of PolicyBase 81 | env=env, 82 | gamma=0.99, 83 | eval_freq=args.eval_freq, 84 | max_train_step=args.max_train_step, 85 | train_id=args.train_id, 86 | log_interval=args.log_interval, 87 | resume=args.resume, 88 | device=args.device, 89 | 90 | # Parameters of PPO_Agent 91 | trajectory_buffer=trajectory_buffer, 92 | actor_net=actor_net, 93 | critic_net=critic_net, 94 | actor_lr=3e-4, 95 | critic_lr=1e-3, 96 | gae_lambda=0.95, 97 | gae_normalize=args.gae_norm, 98 | clip_pram=0.2, 99 | trajectory_length=args.traj_length, # the length of a trajectory_ 100 | train_actor_iters=10, 101 | train_critic_iters=10, 102 | ) 103 | 104 | if args.show: 105 | train_tools.evaluate(agent, 10, show=True) 106 | else: 107 | agent.learn() 108 | -------------------------------------------------------------------------------- /run/ppo_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | # sys.path.append(str(Path(__file__).absolute().parent.parent)) 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | import argparse 8 | import gym 9 | from gym.spaces import Box, Discrete 10 | import torch 11 | import torch.nn as nn 12 | import numpy as np 13 | from algos.ppo import PPO_Agent 14 | from common.buffers import TrajectoryBuffer 15 | from common.networks import MLPVsNet, MLPCategoricalActor, MLPGaussianActor 16 | from utils import train_tools 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='PPO algorithm in mujoco environment') 20 | parser.add_argument('--env', type=str, default='Hopper-v2', 21 | help='the name of environment') 22 | parser.add_argument('--gae_norm', action='store_true', default=False, 23 | help='whether normalize the GAE') 24 | parser.add_argument('--traj_length', type=int, default=2048, 25 | help='the length of trajectory') 26 | parser.add_argument('--eval_freq', type=int, default=5000, 27 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 28 | parser.add_argument('--max_train_step', type=int, default=1000000, 29 | help='the max time step to train') 30 | parser.add_argument('--log_interval', type=int, default=5000, 31 | help='The number of steps taken to record the model and the tensorboard') 32 | parser.add_argument('--train_id', type=str, default='ppo_mujoco_test', 33 | help='Path to save model and log tensorboard') 34 | parser.add_argument('--resume', action='store_true', default=False, 35 | help='whether load the last saved model to train') 36 | 
parser.add_argument('--device', type=str, default='cpu', 37 | help='Choose cpu or cuda') 38 | parser.add_argument('--show', action='store_true', default=False, 39 | help='show the trained model visually') 40 | parser.add_argument('--seed', type=int, default=10, 41 | help='the random seed') 42 | 43 | args = parser.parse_args() 44 | 45 | torch.manual_seed(args.seed) 46 | np.random.seed(args.seed) 47 | 48 | # create environment 49 | env = gym.make(args.env) 50 | env.seed(args.seed) 51 | env.action_space.seed(args.seed) 52 | train_tools.EVAL_SEED = args.seed 53 | 54 | obs_dim = env.observation_space.shape[0] 55 | 56 | # create nets 57 | if isinstance(env.action_space, Discrete): 58 | act_num = env.action_space.n 59 | buffer_act_dim = 1 60 | actor_net = MLPCategoricalActor(obs_dim=obs_dim, act_num=act_num, 61 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 62 | elif isinstance(env.action_space, Box): 63 | act_dim = env.action_space.shape[0] 64 | buffer_act_dim = act_dim 65 | actor_net = MLPGaussianActor(obs_dim=obs_dim, act_dim=act_dim, 66 | hidden_size=[64, 64], hidden_activation=nn.Tanh) 67 | 68 | critic_net = MLPVsNet(obs_dim=obs_dim, hidden_size=[64, 64], hidden_activation=nn.Tanh) 69 | 70 | 71 | # create buffer 72 | if args.show: 73 | trajectory_buffer = None 74 | else: 75 | trajectory_buffer = TrajectoryBuffer(obs_dim=obs_dim, 76 | act_dim=buffer_act_dim, 77 | capacity=args.traj_length) 78 | 79 | agent = PPO_Agent( 80 | # parameters of PolicyBase 81 | env=env, 82 | gamma=0.99, 83 | eval_freq=args.eval_freq, 84 | max_train_step=args.max_train_step, 85 | train_id=args.train_id, 86 | log_interval=args.log_interval, 87 | resume=args.resume, 88 | device=args.device, 89 | 90 | # Parameters of PPO_Agent 91 | trajectory_buffer=trajectory_buffer, 92 | actor_net=actor_net, 93 | critic_net=critic_net, 94 | actor_lr=3e-4, 95 | critic_lr=1e-3, 96 | gae_lambda=0.95, 97 | gae_normalize=args.gae_norm, 98 | clip_pram=0.2, 99 | trajectory_length=args.traj_length, # the length of a trajectory_ 100 | train_actor_iters=80, 101 | train_critic_iters=80, 102 | ) 103 | 104 | if args.show: 105 | train_tools.evaluate(agent, 10, show=True) 106 | else: 107 | agent.learn() 108 | -------------------------------------------------------------------------------- /run/sac_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | # sys.path.append(str(Path(__file__).absolute().parent.parent)) 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | import argparse 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import numpy as np 12 | from algos.sac import SAC_Agent 13 | from common.buffers import ReplayBuffer 14 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 15 | from utils import train_tools 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser(description='SAC algorithm in gym environment') 19 | parser.add_argument('--env', type=str, default='Pendulum-v0', 20 | help='the name of environment') 21 | parser.add_argument('--capacity', type=int, default=50000, 22 | help='the max size of data buffer') 23 | parser.add_argument('--batch_size', type=int, default=128, 24 | help='the size of batch that sampled from buffer') 25 | parser.add_argument('--alpha', type=float, default=0.5, 26 | help='the coefficient of entropy') 27 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 28 | help='whether automatic tune 
alpha') 29 | parser.add_argument('--explore_step', type=int, default=2000, 30 | help='the steps of exploration before train') 31 | parser.add_argument('--eval_freq', type=int, default=1000, 32 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 33 | parser.add_argument('--max_train_step', type=int, default=100000, 34 | help='the max train step') 35 | parser.add_argument('--log_interval', type=int, default=1000, 36 | help='The number of steps taken to record the model and the tensorboard') 37 | parser.add_argument('--train_id', type=str, default='sac_gym_test', 38 | help='Path to save model and log tensorboard') 39 | parser.add_argument('--resume', action='store_true', default=False, 40 | help='whether load the last saved model to train') 41 | parser.add_argument('--device', type=str, default='cpu', 42 | help='Choose cpu or cuda') 43 | parser.add_argument('--show', action='store_true', default=False, 44 | help='show the trained model visually') 45 | parser.add_argument('--seed', type=int, default=10, 46 | help='the random seed') 47 | 48 | args = parser.parse_args() 49 | 50 | torch.manual_seed(args.seed) 51 | np.random.seed(args.seed) 52 | 53 | # create environment 54 | env = gym.make(args.env) 55 | env.seed(args.seed) 56 | env.action_space.seed(args.seed) 57 | train_tools.EVAL_SEED = args.seed 58 | 59 | obs_dim = env.observation_space.shape[0] 60 | act_dim = env.action_space.shape[0] 61 | act_bound = env.action_space.high[0] 62 | 63 | # create nets 64 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 65 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 66 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 67 | hidden_activation=nn.ReLU) 68 | 69 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 70 | hidden_activation=nn.ReLU) 71 | 72 | # create buffer 73 | if args.show: 74 | replay_buffer = None 75 | else: 76 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 77 | act_dim=act_dim, 78 | capacity=args.capacity, 79 | batch_size=args.batch_size) 80 | 81 | agent = SAC_Agent( 82 | # parameters of PolicyBase 83 | env=env, 84 | gamma=0.99, 85 | eval_freq=args.eval_freq, 86 | max_train_step=args.max_train_step, 87 | train_id=args.train_id, 88 | log_interval=args.log_interval, 89 | resume=args.resume, 90 | device=args.device, 91 | 92 | # Parameters of OffPolicyBase 93 | replay_buffer=replay_buffer, 94 | explore_step=args.explore_step, 95 | 96 | # Parameters of SAC_Agent 97 | policy_net=policy_net, 98 | q_net1=q_net1, # critic 99 | q_net2=q_net2, 100 | policy_lr=4e-3, 101 | qf_lr=4e-3, 102 | tau=0.005, 103 | alpha=args.alpha, 104 | auto_alpha_tuning=args.auto_alpha_tuning, 105 | ) 106 | 107 | if args.show: 108 | train_tools.evaluate(agent, 10, show=True) 109 | else: 110 | agent.learn() 111 | -------------------------------------------------------------------------------- /run/sac_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.sac import SAC_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = 
argparse.ArgumentParser(description='SAC algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='Hopper-v2', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=1000000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=256, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--alpha', type=float, default=0.5, 24 | help='the coefficient of entropy') 25 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 26 | help='whether automatic tune alpha') 27 | parser.add_argument('--explore_step', type=int, default=20000, 28 | help='the steps of exploration before train') 29 | parser.add_argument('--eval_freq', type=int, default=5000, 30 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 31 | parser.add_argument('--max_train_step', type=int, default=3000000, 32 | help='the max train step') 33 | parser.add_argument('--log_interval', type=int, default=1000, 34 | help='The number of steps taken to record the model and the tensorboard') 35 | parser.add_argument('--train_id', type=str, default='sac_mujoco_test', 36 | help='Path to save model and log tensorboard') 37 | parser.add_argument('--resume', action='store_true', default=False, 38 | help='whether load the last saved model to train') 39 | parser.add_argument('--device', type=str, default='cpu', 40 | help='Choose cpu or cuda') 41 | parser.add_argument('--show', action='store_true', default=False, 42 | help='show the trained model visually') 43 | parser.add_argument('--seed', type=int, default=10, 44 | help='the random seed') 45 | 46 | args = parser.parse_args() 47 | 48 | torch.manual_seed(args.seed) 49 | np.random.seed(args.seed) 50 | 51 | # create environment 52 | env = gym.make(args.env) 53 | env.seed(args.seed) 54 | env.action_space.seed(args.seed) 55 | train_tools.EVAL_SEED = args.seed 56 | 57 | obs_dim = env.observation_space.shape[0] 58 | act_dim = env.action_space.shape[0] 59 | act_bound = env.action_space.high[0] 60 | 61 | # create nets 62 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 63 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 64 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 65 | hidden_activation=nn.ReLU) 66 | 67 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 68 | hidden_activation=nn.ReLU) 69 | 70 | # create buffer 71 | if args.show: 72 | replay_buffer = None 73 | else: 74 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 75 | act_dim=act_dim, 76 | capacity=args.capacity, 77 | batch_size=args.batch_size) 78 | 79 | agent = SAC_Agent( 80 | # parameters of PolicyBase 81 | env=env, 82 | gamma=0.99, 83 | eval_freq=args.eval_freq, 84 | max_train_step=args.max_train_step, 85 | train_id=args.train_id, 86 | log_interval=args.log_interval, 87 | resume=args.resume, 88 | device=args.device, 89 | 90 | # Parameters of OffPolicyBase 91 | replay_buffer=replay_buffer, 92 | explore_step=args.explore_step, 93 | 94 | # Parameters of SAC_Agent 95 | policy_net=policy_net, 96 | q_net1=q_net1, # critic 97 | q_net2=q_net2, 98 | policy_lr=3e-4, 99 | qf_lr=3e-4, 100 | tau=0.005, 101 | alpha=args.alpha, 102 | auto_alpha_tuning=args.auto_alpha_tuning, 103 | ) 104 | 105 | if args.show: 106 | train_tools.evaluate(agent, 10, show=True) 107 | else: 108 | agent.learn() 109 | 
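Editor's note: the SAC runners above expose an `--auto_alpha_tuning` flag, but the corresponding update lives in algos/sac.py, which is not shown here. The snippet below is a minimal, self-contained sketch of the standard automatic entropy-temperature update from the SAC paper, assuming the conventional target entropy of -act_dim; the names (`log_alpha`, `alpha_optim`, `alpha_update_step`) are illustrative and not taken from this repository.

```
import torch

# Standard SAC temperature update (illustrative; names are not from algos/sac.py).
act_dim = 3                      # e.g. the Hopper-v2 action dimension
target_entropy = -act_dim        # conventional heuristic target
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)

def alpha_update_step(log_prob):
    """log_prob: log pi(a|s) for a batch of freshly sampled actions."""
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()   # alpha used in the actor and critic losses
```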
-------------------------------------------------------------------------------- /run/sac_offline_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.sac_offline import SAC_Offline_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='Offline SAC in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 22 | help='whether automatic tune alpha') 23 | 24 | parser.add_argument('--max_train_step', type=int, default=1000000, 25 | help='the max train step') 26 | parser.add_argument('--log_interval', type=int, default=1000, 27 | help='The number of steps taken to record the model and the tensorboard') 28 | parser.add_argument('--train_id', type=str, default='sac_offline_mujoco_test', 29 | help='Path to save model and log tensorboard') 30 | parser.add_argument('--resume', action='store_true', default=False, 31 | help='whether load the last saved model to train') 32 | parser.add_argument('--device', type=str, default='cpu', 33 | help='Choose cpu or cuda') 34 | parser.add_argument('--show', action='store_true', default=False, 35 | help='show the trained model visually') 36 | parser.add_argument('--eval_freq', type=int, default=5000, 37 | help='how often (time steps) we evaluate') 38 | parser.add_argument('--seed', type=int, default=10, 39 | help='the random seed') 40 | 41 | args = parser.parse_args() 42 | 43 | torch.manual_seed(args.seed) 44 | np.random.seed(args.seed) 45 | 46 | # create environment 47 | env = gym.make(args.env) 48 | env.seed(args.seed) 49 | env.action_space.seed(args.seed) 50 | train_tools.EVAL_SEED = args.seed 51 | 52 | obs_dim = env.observation_space.shape[0] 53 | act_dim = env.action_space.shape[0] 54 | act_bound = env.action_space.high[0] 55 | 56 | # create nets 57 | policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 58 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 59 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 60 | hidden_activation=nn.ReLU) 61 | 62 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 63 | hidden_activation=nn.ReLU) 64 | 65 | # create buffer 66 | if args.show: 67 | data_buffer = None 68 | else: 69 | data = data_tools.get_d4rl_dataset(env) 70 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 71 | 72 | agent = SAC_Offline_Agent( 73 | # parameters of PolicyBase 74 | env=env, 75 | gamma=0.99, 76 | eval_freq=args.eval_freq, 77 | max_train_step=args.max_train_step, 78 | train_id=args.train_id, 79 | log_interval=args.log_interval, 80 | resume=args.resume, 81 | device=args.device, 82 | 83 | # Parameters of OfflineBase 84 | data_buffer=data_buffer, 85 | 86 | # Parameters of SAC_Offline_Agent 87 | policy_net=policy_net, 88 | q_net1=q_net1, 89 | q_net2=q_net2, 90 | policy_lr=1e-4, 91 
| qf_lr=3e-4, 92 | tau=0.05, 93 | alpha=0.5, 94 | auto_alpha_tuning=args.auto_alpha_tuning 95 | ) 96 | 97 | if args.show: 98 | train_tools.evaluate(agent, 10, show=True) 99 | else: 100 | agent.learn() 101 | -------------------------------------------------------------------------------- /run/sac_unity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from algos.sac import SAC_Agent 10 | from common.buffers import ReplayBuffer 11 | from common.networks import MLPQsaNet, MLPSquashedReparamGaussianPolicy 12 | from utils import train_tools 13 | from mlagents_envs.environment import UnityEnvironment 14 | from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel 15 | from gym_unity.envs import UnityToGymWrapper 16 | 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='SAC algorithm in unity environment') 20 | parser.add_argument('--env', type=str, default=None, 21 | help='the path of unity environment') 22 | parser.add_argument('--capacity', type=int, default=50000, 23 | help='the max size of data buffer') 24 | parser.add_argument('--batch_size', type=int, default=128, 25 | help='the size of batch that sampled from buffer') 26 | parser.add_argument('--alpha', type=float, default=0.5, 27 | help='the coefficient of entropy') 28 | parser.add_argument('--auto_alpha_tuning', action='store_true', default=False, 29 | help='whether automatic tune alpha') 30 | parser.add_argument('--explore_step', type=int, default=2000, 31 | help='the steps of exploration before train') 32 | # parser.add_argument('--eval_freq', type=int, default=1000, 33 | # help='how often (time steps) we evaluate') 34 | parser.add_argument('--max_train_step', type=int, default=100000, 35 | help='the max train step') 36 | parser.add_argument('--log_interval', type=int, default=1000, 37 | help='The number of steps taken to record the model and the tensorboard') 38 | parser.add_argument('--train_id', type=str, default='sac_unity_test', 39 | help='Path to save model and log tensorboard') 40 | parser.add_argument('--resume', action='store_true', default=False, 41 | help='whether load the last saved model to train') 42 | parser.add_argument('--device', type=str, default='cpu', 43 | help='Choose cpu or cuda') 44 | parser.add_argument('--show', action='store_true', default=False, 45 | help='show the trained model visually') 46 | parser.add_argument('--seed', type=int, default=10, 47 | help='the random seed') 48 | 49 | args = parser.parse_args() 50 | 51 | torch.manual_seed(args.seed) 52 | np.random.seed(args.seed) 53 | 54 | engine_configuration_channel = EngineConfigurationChannel() 55 | unity_env = UnityEnvironment(side_channels=[engine_configuration_channel], file_name=args.env) 56 | engine_configuration_channel.set_configuration_parameters( 57 | width=200, 58 | height=200, 59 | quality_level=5, 60 | time_scale=1 if args.show else 20, 61 | target_frame_rate=-1, 62 | capture_frame_rate=60) 63 | 64 | env = UnityToGymWrapper(unity_env=unity_env) 65 | env.seed(args.seed) 66 | env.action_space.seed(args.seed) 67 | train_tools.EVAL_SEED = args.seed 68 | 69 | obs_dim = env.observation_space.shape[0] 70 | act_dim = env.action_space.shape[0] 71 | act_bound = env.action_space.high[0] 72 | 73 | # create nets 74 | policy_net = 
MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 75 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 76 | q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 77 | hidden_activation=nn.ReLU) 78 | 79 | q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256], 80 | hidden_activation=nn.ReLU) 81 | 82 | # create buffer 83 | if args.show: 84 | replay_buffer = None 85 | else: 86 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 87 | act_dim=act_dim, 88 | capacity=args.capacity, 89 | batch_size=args.batch_size) 90 | 91 | agent = SAC_Agent( 92 | # parameters of PolicyBase 93 | env=env, 94 | gamma=0.99, 95 | eval_freq=-1,  # '--eval_freq' is commented out in the arg parser above; a value < 0 disables evaluation during training 96 | max_train_step=args.max_train_step, 97 | train_id=args.train_id, 98 | log_interval=args.log_interval, 99 | resume=args.resume, 100 | device=args.device, 101 | 102 | # Parameters of OffPolicyBase 103 | replay_buffer=replay_buffer, 104 | explore_step=args.explore_step, 105 | 106 | # Parameters of SAC_Agent 107 | policy_net=policy_net, 108 | q_net1=q_net1, # critic 109 | q_net2=q_net2, 110 | policy_lr=3e-4, 111 | qf_lr=3e-4, 112 | tau=0.005, 113 | alpha=args.alpha, 114 | auto_alpha_tuning=args.auto_alpha_tuning, 115 | ) 116 | 117 | if args.show: 118 | train_tools.evaluate_unity(agent, 10) 119 | else: 120 | agent.learn() 121 | -------------------------------------------------------------------------------- /run/td3_bc_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.offline.td3_bc import TD3_BC_Agent 11 | from common.buffers import OfflineBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools, data_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='TD3_BC algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='hopper-medium-v0', 18 | help='the name of environment') 19 | parser.add_argument('--batch_size', type=int, default=256, 20 | help='the size of batch that sampled from buffer') 21 | 22 | parser.add_argument('--max_train_step', type=int, default=1000000, 23 | help='the max train step') 24 | parser.add_argument('--log_interval', type=int, default=1000, 25 | help='The number of steps taken to record the model and the tensorboard') 26 | parser.add_argument('--train_id', type=str, default='td3bc_mujoco_test', 27 | help='Path to save model and log tensorboard') 28 | parser.add_argument('--resume', action='store_true', default=False, 29 | help='whether load the last saved model to train') 30 | parser.add_argument('--device', type=str, default='cpu', 31 | help='Choose cpu or cuda') 32 | parser.add_argument('--show', action='store_true', default=False, 33 | help='show the trained model visually') 34 | parser.add_argument('--eval_freq', type=int, default=5000, 35 | help='how often (time steps) we evaluate') 36 | parser.add_argument('--seed', type=int, default=10, 37 | help='the random seed') 38 | 39 | args = parser.parse_args() 40 | 41 | torch.manual_seed(args.seed) 42 | np.random.seed(args.seed) 43 | 44 | # create environment 45 | env = gym.make(args.env) 46 | env.seed(args.seed) 47 | env.action_space.seed(args.seed) 48 | train_tools.EVAL_SEED = args.seed 49 | 50 | obs_dim = env.observation_space.shape[0]
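# --- Editor's illustrative sketch (not part of the original td3_bc_mujoco.py) ---
# The TD3_BC_Agent created below is passed alpha=2.5. In TD3+BC this weight trades off the
# Q-value term against a behavior-cloning term in the actor loss, roughly as sketched here;
# the actual update lives in algos/offline/td3_bc.py and may differ in detail.
import torch.nn.functional as F
def td3_bc_actor_loss(q_values, policy_actions, dataset_actions, alpha=2.5):
    """Illustrative TD3+BC actor loss: -lambda * mean(Q) + MSE(pi(s), a), with lambda = alpha / mean|Q|."""
    lmbda = alpha / q_values.abs().mean().detach()
    return -lmbda * q_values.mean() + F.mse_loss(policy_actions, dataset_actions)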
51 | act_dim = env.action_space.shape[0] 52 | act_bound = env.action_space.high[0] 53 | 54 | # create nets 55 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 56 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 57 | 58 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 59 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 60 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 61 | hidden_size=[256, 256], hidden_activation=nn.ReLU) 62 | 63 | # create buffer 64 | if args.show: 65 | data_buffer = None 66 | else: 67 | data = data_tools.get_d4rl_dataset(env) 68 | data_buffer = OfflineBuffer(data=data, batch_size=args.batch_size) 69 | 70 | # create agent 71 | agent = TD3_BC_Agent( 72 | # parameters of PolicyBase 73 | env=env, 74 | gamma=0.99, 75 | eval_freq=args.eval_freq, 76 | max_train_step=args.max_train_step, 77 | train_id=args.train_id, 78 | log_interval=args.log_interval, 79 | resume=args.resume, 80 | device=args.device, 81 | 82 | # Parameters of OfflineBase 83 | data_buffer=data_buffer, 84 | 85 | # Parameters of TD3BC_Agent 86 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 87 | actor_lr=3e-4, critic_lr=3e-4, 88 | tau=0.005, 89 | policy_noise=0.2, 90 | noise_clip=0.5, 91 | policy_delay=2, 92 | alpha=2.5, 93 | ) 94 | 95 | if args.show: 96 | train_tools.evaluate(agent, 10, show=True) 97 | else: 98 | agent.learn() 99 | -------------------------------------------------------------------------------- /run/td3_gym.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.td3 import TD3_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='TD3 algorithm in gym environment') 17 | parser.add_argument('--env', type=str, default='Pendulum-v0', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=50000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=100, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--explore_step', type=int, default=2000, 24 | help='the steps of exploration before train') 25 | parser.add_argument('--eval_freq', type=int, default=1000, 26 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 27 | parser.add_argument('--max_train_step', type=int, default=100000, 28 | help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='td3_gym_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--seed', type=int, default=10, 40 | help='the random seed') 41 | 42 | args = 
parser.parse_args() 43 | 44 | torch.manual_seed(args.seed) 45 | np.random.seed(args.seed) 46 | 47 | # create environment 48 | env = gym.make(args.env) 49 | env.seed(args.seed) 50 | env.action_space.seed(args.seed) 51 | train_tools.EVAL_SEED = args.seed 52 | 53 | obs_dim = env.observation_space.shape[0] 54 | act_dim = env.action_space.shape[0] 55 | act_bound = env.action_space.high[0] 56 | 57 | # create nets 58 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 59 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 60 | 61 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 62 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 63 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 64 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 65 | 66 | # create buffer 67 | if args.show: 68 | replay_buffer = None 69 | else: 70 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 71 | act_dim=act_dim, 72 | capacity=args.capacity, 73 | batch_size=args.batch_size) 74 | 75 | agent = TD3_Agent( 76 | # parameters of PolicyBase 77 | env=env, 78 | gamma=0.99, 79 | eval_freq=args.eval_freq, 80 | max_train_step=args.max_train_step, 81 | train_id=args.train_id, 82 | log_interval=args.log_interval, 83 | resume=args.resume, 84 | device=args.device, 85 | 86 | # Parameters of OffPolicyBase 87 | replay_buffer=replay_buffer, 88 | explore_step=args.explore_step, 89 | 90 | # Parameters of TD3_Agent 91 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 92 | actor_lr=1e-3, critic_lr=1e-3, # Or 3e-4 93 | tau=0.005, 94 | act_noise=0.1, 95 | policy_noise=0.2, 96 | noise_clip=0.5, 97 | policy_delay=2, 98 | ) 99 | 100 | if args.show: 101 | train_tools.evaluate(agent, 10, show=True) 102 | else: 103 | agent.learn() 104 | -------------------------------------------------------------------------------- /run/td3_mujoco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import gym 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from algos.td3 import TD3_Agent 11 | from common.buffers import ReplayBuffer 12 | from common.networks import MLPQsaNet, DDPGMLPActor 13 | from utils import train_tools 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser(description='TD3 algorithm in mujoco environment') 17 | parser.add_argument('--env', type=str, default='Hopper-v2', 18 | help='the name of environment') 19 | parser.add_argument('--capacity', type=int, default=1000000, 20 | help='the max size of data buffer') 21 | parser.add_argument('--batch_size', type=int, default=100, 22 | help='the size of batch that sampled from buffer') 23 | parser.add_argument('--explore_step', type=int, default=10000, 24 | help='the steps of exploration before train') 25 | parser.add_argument('--eval_freq', type=int, default=5000, 26 | help='how often (time steps) we evaluate during training, and it will not eval if eval_freq < 0') 27 | parser.add_argument('--max_train_step', type=int, default=1000000, 28 | help='the max train step') 29 | parser.add_argument('--log_interval', type=int, default=1000, 30 | help='The number of steps taken to record the model and the tensorboard') 31 | parser.add_argument('--train_id', type=str, default='td3_mujoco_test', 32 | help='Path to save model and log tensorboard') 33 | parser.add_argument('--resume', action='store_true', default=False, 34 | 
help='whether load the last saved model to train') 35 | parser.add_argument('--device', type=str, default='cpu', 36 | help='Choose cpu or cuda') 37 | parser.add_argument('--show', action='store_true', default=False, 38 | help='show the trained model visually') 39 | parser.add_argument('--seed', type=int, default=10, 40 | help='the random seed') 41 | 42 | args = parser.parse_args() 43 | 44 | torch.manual_seed(args.seed) 45 | np.random.seed(args.seed) 46 | 47 | # create environment 48 | env = gym.make(args.env) 49 | env.seed(args.seed) 50 | env.action_space.seed(args.seed) 51 | train_tools.EVAL_SEED = args.seed 52 | 53 | obs_dim = env.observation_space.shape[0] 54 | act_dim = env.action_space.shape[0] 55 | act_bound = env.action_space.high[0] 56 | 57 | # create nets 58 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 59 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 60 | 61 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 62 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 63 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 64 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 65 | 66 | # create buffer 67 | if args.show: 68 | replay_buffer = None 69 | else: 70 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 71 | act_dim=act_dim, 72 | capacity=args.capacity, 73 | batch_size=args.batch_size) 74 | 75 | agent = TD3_Agent( 76 | # parameters of PolicyBase 77 | env=env, 78 | gamma=0.99, 79 | eval_freq=args.eval_freq, 80 | max_train_step=args.max_train_step, 81 | train_id=args.train_id, 82 | log_interval=args.log_interval, 83 | resume=args.resume, 84 | device=args.device, 85 | 86 | # Parameters of OffPolicyBase 87 | replay_buffer=replay_buffer, 88 | explore_step=args.explore_step, 89 | 90 | # Parameters of TD3_Agent 91 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 92 | actor_lr=1e-3, critic_lr=1e-3, # Or 3e-4 93 | tau=0.005, 94 | act_noise=0.1, 95 | policy_noise=0.2, 96 | noise_clip=0.5, 97 | policy_delay=2, 98 | ) 99 | 100 | if args.show: 101 | train_tools.evaluate(agent, 10, show=True) 102 | else: 103 | agent.learn() 104 | -------------------------------------------------------------------------------- /run/td3_unity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from algos.td3 import TD3_Agent 10 | from common.buffers import ReplayBuffer 11 | from common.networks import MLPQsaNet, DDPGMLPActor 12 | from utils import train_tools 13 | from mlagents_envs.environment import UnityEnvironment 14 | from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel 15 | from gym_unity.envs import UnityToGymWrapper 16 | 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description='TD3 algorithm in gym environment') 20 | parser.add_argument('--env', type=str, default=None, 21 | help='the path of unity environment') 22 | parser.add_argument('--capacity', type=int, default=50000, 23 | help='the max size of data buffer') 24 | parser.add_argument('--batch_size', type=int, default=100, 25 | help='the size of batch that sampled from buffer') 26 | parser.add_argument('--explore_step', type=int, default=2000, 27 | help='the steps of exploration before train') 28 | # parser.add_argument('--eval_freq', type=int, default=1000, 29 | # 
help='how often (time steps) we evaluate') 30 | parser.add_argument('--max_train_step', type=int, default=100000, 31 | help='the max train step') 32 | parser.add_argument('--log_interval', type=int, default=1000, 33 | help='The number of steps taken to record the model and the tensorboard') 34 | parser.add_argument('--train_id', type=str, default='td3_unity_test', 35 | help='Path to save model and log tensorboard') 36 | parser.add_argument('--resume', action='store_true', default=False, 37 | help='whether load the last saved model to train') 38 | parser.add_argument('--device', type=str, default='cpu', 39 | help='Choose cpu or cuda') 40 | parser.add_argument('--show', action='store_true', default=False, 41 | help='show the trained model visually') 42 | parser.add_argument('--seed', type=int, default=10, 43 | help='the random seed') 44 | 45 | args = parser.parse_args() 46 | 47 | torch.manual_seed(args.seed) 48 | np.random.seed(args.seed) 49 | 50 | engine_configuration_channel = EngineConfigurationChannel() 51 | unity_env = UnityEnvironment(side_channels=[engine_configuration_channel], file_name=args.env) 52 | engine_configuration_channel.set_configuration_parameters( 53 | width=200, 54 | height=200, 55 | quality_level=5, 56 | time_scale=1 if args.show else 20, 57 | target_frame_rate=-1, 58 | capture_frame_rate=60) 59 | 60 | env = UnityToGymWrapper(unity_env=unity_env) 61 | env.seed(args.seed) 62 | env.action_space.seed(args.seed) 63 | train_tools.EVAL_SEED = args.seed 64 | 65 | obs_dim = env.observation_space.shape[0] 66 | act_dim = env.action_space.shape[0] 67 | act_bound = env.action_space.high[0] 68 | 69 | # create nets 70 | actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound, 71 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 72 | 73 | critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 74 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 75 | critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, 76 | hidden_size=[400, 300], hidden_activation=nn.ReLU) 77 | 78 | # create buffer 79 | if args.show: 80 | replay_buffer = None 81 | else: 82 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, 83 | act_dim=act_dim, 84 | capacity=args.capacity, 85 | batch_size=args.batch_size) 86 | 87 | agent = TD3_Agent( 88 | # parameters of PolicyBase 89 | env=env, 90 | gamma=0.99, 91 | eval_freq=-1,  # '--eval_freq' is commented out in the arg parser above; a value < 0 disables evaluation during training 92 | max_train_step=args.max_train_step, 93 | train_id=args.train_id, 94 | log_interval=args.log_interval, 95 | resume=args.resume, 96 | device=args.device, 97 | 98 | # Parameters of OffPolicyBase 99 | replay_buffer=replay_buffer, 100 | explore_step=args.explore_step, 101 | 102 | # Parameters of TD3_Agent 103 | actor_net=actor_net, critic_net1=critic_net1, critic_net2=critic_net2, 104 | actor_lr=1e-3, critic_lr=1e-3, # Or 3e-4 105 | tau=0.005, 106 | act_noise=0.1, 107 | policy_noise=0.2, 108 | noise_clip=0.5, 109 | policy_delay=2, 110 | ) 111 | 112 | if args.show: 113 | train_tools.evaluate_unity(agent, 10) 114 | else: 115 | agent.learn() 116 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dragon-wang/RL_Algorithms/3d43ece583e80f2828a42f28b790b1d7d73c07bf/utils/__init__.py -------------------------------------------------------------------------------- /utils/atari_preprocess.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.wrappers
import atari_preprocessing 3 | from gym.wrappers import FrameStack 4 | import numpy as np 5 | 6 | 7 | def make_env(env_id, # 环境id 8 | noop_max=30, # 最大的no-op操作步数 9 | frame_skip=4, # 跳帧步数 10 | screen_size=84, # 帧的尺寸 11 | terminal_on_life_loss=True, # 是否在一条命没后结束Episode 12 | grayscale_obs=True, # True的话返回灰度图,否则返回RGB彩色图 13 | grayscale_newaxis=False, # 将输出的灰度图由2维转换为1维 14 | scale_obs=True, # 是否对obs标准化到[0,1] 15 | num_stack=4, # 叠加帧的步数 16 | lz4_compress=False, # 是否使用lz4压缩 17 | obs_LazyFramesToNumpy=True, # 是否将输出的图像由LazyFrames转化为numpy 18 | ): 19 | 20 | assert gym.envs.registry.spec(env_id).entry_point == 'gym.envs.atari:AtariEnv', "env is not Atari" 21 | 22 | env = gym.make(env_id) 23 | env = atari_preprocessing.AtariPreprocessing(env=env, 24 | noop_max=noop_max, 25 | frame_skip=frame_skip, 26 | screen_size=screen_size, 27 | terminal_on_life_loss=terminal_on_life_loss, 28 | grayscale_obs=grayscale_obs, 29 | grayscale_newaxis=grayscale_newaxis, 30 | scale_obs=scale_obs) 31 | env = FrameStack(env, num_stack=num_stack, lz4_compress=lz4_compress) 32 | if obs_LazyFramesToNumpy: 33 | env = ObsLazyFramesToNumpy(env) 34 | return env 35 | 36 | 37 | class ObsLazyFramesToNumpy(gym.Wrapper): 38 | def __init__(self, env): 39 | super(ObsLazyFramesToNumpy, self).__init__(env) 40 | 41 | def reset(self, **kwargs): 42 | obs = self.env.reset() 43 | return np.array(obs) 44 | 45 | def step(self, action): 46 | next_obs, reward, done, info = self.env.step(action) 47 | return np.array(next_obs), reward, done, info 48 | 49 | -------------------------------------------------------------------------------- /utils/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from OpenAI Baselines 3 | https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 4 | """ 5 | import numpy as np 6 | from collections import deque 7 | import gym 8 | from gym import spaces 9 | import cv2 10 | cv2.ocl.setUseOpenCL(False) 11 | 12 | 13 | def make_atari_env(env_id, episodic_life=True, scale_obs=False, clip_rewards=True, stack_frames=True): 14 | env = gym.make(env_id) 15 | assert 'NoFrameskip' in env.spec.id 16 | env = NoopResetEnv(env, noop_max=30) 17 | env = MaxAndSkipEnv(env, skip=4) 18 | if episodic_life: 19 | env = EpisodicLifeEnv(env) 20 | if 'FIRE' in env.unwrapped.get_action_meanings(): 21 | env = FireResetEnv(env) 22 | env = WarpFrame(env) 23 | if clip_rewards: 24 | env = ClipRewardEnv(env) 25 | if stack_frames: 26 | env = FrameStack(env, 4) 27 | env = PyTorchStyleFrames(env) 28 | if scale_obs: 29 | env = ScaledFloatFrame(env) 30 | return env 31 | 32 | 33 | class NoopResetEnv(gym.Wrapper): 34 | def __init__(self, env, noop_max=30): 35 | """Sample initial states by taking random number of no-ops on reset. 36 | No-op is assumed to be action 0. 
37 | """ 38 | gym.Wrapper.__init__(self, env) 39 | self.noop_max = noop_max 40 | self.override_num_noops = None 41 | self.noop_action = 0 42 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 43 | 44 | def reset(self, **kwargs): 45 | """ Do no-op action for a number of steps in [1, noop_max].""" 46 | self.env.reset(**kwargs) 47 | if self.override_num_noops is not None: 48 | noops = self.override_num_noops 49 | else: 50 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 51 | assert noops > 0 52 | obs = None 53 | for _ in range(noops): 54 | obs, _, done, _ = self.env.step(self.noop_action) 55 | if done: 56 | obs = self.env.reset(**kwargs) 57 | return obs 58 | 59 | def step(self, ac): 60 | return self.env.step(ac) 61 | 62 | 63 | class FireResetEnv(gym.Wrapper): 64 | def __init__(self, env): 65 | """Take action on reset for environments that are fixed until firing.""" 66 | gym.Wrapper.__init__(self, env) 67 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 68 | assert len(env.unwrapped.get_action_meanings()) >= 3 69 | 70 | def reset(self, **kwargs): 71 | self.env.reset(**kwargs) 72 | obs, _, done, _ = self.env.step(1) 73 | if done: 74 | self.env.reset(**kwargs) 75 | obs, _, done, _ = self.env.step(2) 76 | if done: 77 | self.env.reset(**kwargs) 78 | return obs 79 | 80 | def step(self, ac): 81 | return self.env.step(ac) 82 | 83 | 84 | class EpisodicLifeEnv(gym.Wrapper): 85 | def __init__(self, env): 86 | """Make end-of-life == end-of-episode, but only reset on true game over. 87 | Done by DeepMind for the DQN and co. since it helps value estimation. 88 | """ 89 | gym.Wrapper.__init__(self, env) 90 | self.lives = 0 91 | self.was_real_done = True 92 | 93 | def step(self, action): 94 | obs, reward, done, info = self.env.step(action) 95 | self.was_real_done = done 96 | # check current lives, make loss of life terminal, 97 | # then update lives to handle bonus lives 98 | lives = self.env.unwrapped.ale.lives() 99 | if lives < self.lives and lives > 0: 100 | # for Qbert sometimes we stay in lives == 0 condition for a few frames 101 | # so it's important to keep lives > 0, so that we only reset once 102 | # the environment advertises done. 103 | done = True 104 | self.lives = lives 105 | return obs, reward, done, info 106 | 107 | def reset(self, **kwargs): 108 | """Reset only when lives are exhausted. 109 | This way all states are still reachable even though lives are episodic, 110 | and the learner need not know about any of this behind-the-scenes. 
111 | """ 112 | if self.was_real_done: 113 | obs = self.env.reset(**kwargs) 114 | else: 115 | # no-op step to advance from terminal/lost life state 116 | obs, _, _, _ = self.env.step(0) 117 | self.lives = self.env.unwrapped.ale.lives() 118 | return obs 119 | 120 | 121 | class MaxAndSkipEnv(gym.Wrapper): 122 | def __init__(self, env, skip=4): 123 | """Return only every `skip`-th frame""" 124 | gym.Wrapper.__init__(self, env) 125 | # most recent raw observations (for max pooling across time steps) 126 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 127 | self._skip = skip 128 | 129 | def step(self, action): 130 | """Repeat action, sum reward, and max over last observations.""" 131 | total_reward = 0.0 132 | done = None 133 | for i in range(self._skip): 134 | obs, reward, done, info = self.env.step(action) 135 | if i == self._skip - 2: self._obs_buffer[0] = obs 136 | if i == self._skip - 1: self._obs_buffer[1] = obs 137 | total_reward += reward 138 | if done: 139 | break 140 | # Note that the observation on the done=True frame 141 | # doesn't matter 142 | max_frame = self._obs_buffer.max(axis=0) 143 | 144 | return max_frame, total_reward, done, info 145 | 146 | def reset(self, **kwargs): 147 | return self.env.reset(**kwargs) 148 | 149 | 150 | class ClipRewardEnv(gym.RewardWrapper): 151 | def __init__(self, env): 152 | gym.RewardWrapper.__init__(self, env) 153 | 154 | def reward(self, reward): 155 | """Bin reward to {+1, 0, -1} by its sign.""" 156 | return np.sign(reward) 157 | 158 | 159 | class WarpFrame(gym.ObservationWrapper): 160 | def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None): 161 | """ 162 | Warp frames to 84x84 as done in the Nature paper and later work. 163 | If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which 164 | observation should be warped. 165 | """ 166 | super().__init__(env) 167 | self._width = width 168 | self._height = height 169 | self._grayscale = grayscale 170 | self._key = dict_space_key 171 | if self._grayscale: 172 | num_colors = 1 173 | else: 174 | num_colors = 3 175 | 176 | new_space = gym.spaces.Box( 177 | low=0, 178 | high=255, 179 | shape=(self._height, self._width, num_colors), 180 | dtype=np.uint8, 181 | ) 182 | if self._key is None: 183 | original_space = self.observation_space 184 | self.observation_space = new_space 185 | else: 186 | original_space = self.observation_space.spaces[self._key] 187 | self.observation_space.spaces[self._key] = new_space 188 | assert original_space.dtype == np.uint8 and len(original_space.shape) == 3 189 | 190 | def observation(self, obs): 191 | if self._key is None: 192 | frame = obs 193 | else: 194 | frame = obs[self._key] 195 | 196 | if self._grayscale: 197 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 198 | frame = cv2.resize( 199 | frame, (self._width, self._height), interpolation=cv2.INTER_AREA 200 | ) 201 | if self._grayscale: 202 | frame = np.expand_dims(frame, -1) 203 | 204 | if self._key is None: 205 | obs = frame 206 | else: 207 | obs = obs.copy() 208 | obs[self._key] = frame 209 | return obs 210 | 211 | 212 | class FrameStack(gym.Wrapper): 213 | def __init__(self, env, k): 214 | """Stack k last frames. 215 | Returns lazy array, which is much more memory efficient. 
216 | See Also 217 | -------- 218 | baselines.common.atari_wrappers.LazyFrames 219 | """ 220 | gym.Wrapper.__init__(self, env) 221 | self.k = k 222 | self.frames = deque([], maxlen=k) 223 | shp = env.observation_space.shape 224 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) 225 | 226 | def reset(self): 227 | ob = self.env.reset() 228 | for _ in range(self.k): 229 | self.frames.append(ob) 230 | return self._get_ob() 231 | 232 | def step(self, action): 233 | ob, reward, done, info = self.env.step(action) 234 | self.frames.append(ob) 235 | return self._get_ob(), reward, done, info 236 | 237 | def _get_ob(self): 238 | assert len(self.frames) == self.k 239 | return LazyFrames(list(self.frames)) 240 | 241 | 242 | class ScaledFloatFrame(gym.ObservationWrapper): 243 | def __init__(self, env): 244 | gym.ObservationWrapper.__init__(self, env) 245 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 246 | 247 | def observation(self, observation): 248 | # careful! This undoes the memory optimization, use 249 | # with smaller replay buffers only. 250 | return np.array(observation).astype(np.float32) / 255.0 251 | 252 | 253 | class LazyFrames(object): 254 | def __init__(self, frames): 255 | """This object ensures that common frames between the observations are only stored once. 256 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 257 | buffers. 258 | This object should only be converted to numpy array before being passed to the model. 259 | You'd not believe how complex the previous solution was.""" 260 | self._frames = frames 261 | self._out = None 262 | 263 | def _force(self): 264 | if self._out is None: 265 | self._out = np.concatenate(self._frames, axis=-1) 266 | self._frames = None 267 | return self._out 268 | 269 | def __array__(self, dtype=None): 270 | out = self._force() 271 | if dtype is not None: 272 | out = out.astype(dtype) 273 | return out 274 | 275 | def __len__(self): 276 | return len(self._force()) 277 | 278 | def __getitem__(self, i): 279 | return self._force()[i] 280 | 281 | def count(self): 282 | frames = self._force() 283 | return frames.shape[frames.ndim - 1] 284 | 285 | def frame(self, i): 286 | return self._force()[..., i] 287 | 288 | 289 | class PyTorchStyleFrames(gym.Wrapper): 290 | """ 291 | 1.Change "LazyFrames" obs to "Numpy" obs 292 | 2.Change Image shape from "height x width x channels" to "channels x height x width" 293 | """ 294 | def __init__(self, env): 295 | super(PyTorchStyleFrames, self).__init__(env) 296 | shp = env.observation_space.shape 297 | self.observation_space = spaces.Box(low=0, high=255, 298 | shape=((shp[-1], ) + shp[:-1]), dtype=env.observation_space.dtype) 299 | 300 | def reset(self, **kwargs): 301 | obs = np.array(self.env.reset()).transpose((2, 0, 1)) 302 | return obs 303 | 304 | def step(self, action): 305 | next_obs, reward, done, info = self.env.step(action) 306 | next_obs = np.array(next_obs).transpose((2, 0, 1)) 307 | return next_obs, reward, done, info 308 | 309 | 310 | if __name__ == '__main__': 311 | env = make_atari_env("PongNoFrameskip-v4") 312 | print(env.observation_space) 313 | env.reset() 314 | obs, _, _, _ = env.step(env.action_space.sample()) 315 | print(obs) 316 | print(np.array(obs).shape) 317 | # from matplotlib import pyplot as plt 318 | # plt.imshow(obs[0], cmap='gray', interpolation='bicubic') 319 | # plt.show() 320 | 
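A minimal usage sketch, not part of the repository files above: with the default arguments, make_atari_env returns uint8 observations of shape (4, 84, 84), which are typically cast to float, given a batch dimension and scaled to [0, 1] before being fed to a convolutional network. The q_net mentioned in the final comment is a hypothetical placeholder, not a class defined in this repo.

from utils.atari_wrappers import make_atari_env
import numpy as np
import torch

env = make_atari_env("PongNoFrameskip-v4")   # yields (4, 84, 84) uint8 observations
obs = env.reset()
# Cast to float, add a batch dimension and scale to [0, 1] before the forward pass.
obs_tensor = torch.as_tensor(np.array(obs), dtype=torch.float32).unsqueeze(0) / 255.0
assert obs_tensor.shape == (1, 4, 84, 84)
# obs_tensor can now be passed to any CNN taking a (N, 4, 84, 84) float input,
# e.g. q_values = q_net(obs_tensor) for a hypothetical Q-network q_net.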
-------------------------------------------------------------------------------- /utils/data_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | try: 3 | import d4rl 4 | except ImportError: 5 | print('No module named "d4rl"; you can install it from https://github.com/rail-berkeley/d4rl') 6 | 7 | try: 8 | import d4rl_atari 9 | except ImportError: 10 | print('No module named "d4rl_atari"; you can install it from https://github.com/takuseno/d4rl-atari') 11 | 12 | 13 | def get_d4rl_dataset(env, get_num=None) -> dict: 14 | """ 15 | d4rl dataset: https://github.com/rail-berkeley/d4rl 16 | install: pip install git+https://github.com/rail-berkeley/d4rl@master#egg=d4rl 17 | :param get_num: how many transitions to sample from the dataset (None means use the whole dataset) 18 | """ 19 | dataset = d4rl.qlearning_dataset(env) 20 | if get_num is None: 21 | data = dict( 22 | obs=dataset['observations'], 23 | acts=dataset['actions'], 24 | rews=dataset['rewards'], 25 | next_obs=dataset['next_observations'], 26 | done=dataset['terminals'] 27 | ) 28 | else: 29 | data_num = dataset['actions'].shape[0] 30 | ind = np.random.choice(data_num, size=get_num, replace=False) 31 | data = dict( 32 | obs=dataset['observations'][ind], 33 | acts=dataset['actions'][ind], 34 | rews=dataset['rewards'][ind], 35 | next_obs=dataset['next_observations'][ind], 36 | done=dataset['terminals'][ind] 37 | ) 38 | 39 | return data 40 | 41 | 42 | def get_d4rl_dataset_atari(env) -> dict: 43 | """ 44 | d4rl atari dataset: https://github.com/takuseno/d4rl-atari 45 | install: pip install git+https://github.com/takuseno/d4rl-atari 46 | """ 47 | dataset = env.get_dataset() 48 | data = dict( 49 | obs=dataset['observations'], 50 | acts=dataset['actions'], 51 | rews=dataset['rewards'], 52 | done=dataset['terminals'] 53 | ) 54 | 55 | return data 56 | -------------------------------------------------------------------------------- /utils/eval_plot.py: -------------------------------------------------------------------------------- 1 | from tensorboard.backend.event_processing import event_accumulator 2 | import pandas as pd 3 | import os 4 | import seaborn as sns 5 | from matplotlib import pyplot as plt 6 | 7 | 8 | def smooth(df, column, weight=0.6):  9 | """ 10 | Smooth a column of data in the DataFrame (exponential moving average) 11 | """ 12 | scalars = df[column].to_numpy() 13 | last = scalars[0] 14 | smoothed_scalars = [] 15 | for scalar in scalars: 16 | smoothed_scalar = last * weight + (1 - weight) * scalar # calculate the smoothed value 17 | smoothed_scalars.append(smoothed_scalar) 18 | last = smoothed_scalar 19 | df[column] = smoothed_scalars 20 | 21 | 22 | def get_pd(tensorboard_path, tag='evaluate_data/eval_episode_reward'): 23 | """ 24 | Get a pandas DataFrame from one tensorboard file 25 | """ 26 | event_data = event_accumulator.EventAccumulator(tensorboard_path) # a python interface for loading Event data 27 | event_data.Reload() 28 | scalars = event_data.scalars.Items(tag) 29 | df = pd.DataFrame(scalars)[['step', 'value']] 30 | return df 31 | 32 | 33 | def get_pd_from_parent_path(parents_path, tag='evaluate_data/eval_episode_reward'): 34 | """ 35 | Get a pandas DataFrame from tensorboard files under a common parent path 36 | """ 37 | child_paths = os.listdir(parents_path) 38 | df = pd.DataFrame(columns=['step', 'value']) 39 | for child_path in child_paths: 40 | tens_path = os.path.join(parents_path, child_path) 41 | if os.path.isdir(tens_path): 42 | event_data = event_accumulator.EventAccumulator(tens_path) # a python interface for loading Event data 43 | event_data.Reload() 44 |
scalars = event_data.scalars.Items(tag) 45 | df = df.append(pd.DataFrame(scalars)[['step', 'value']], ignore_index=True) 46 | return df 47 | 48 | 49 | def is_parent_path(parent_path): 50 | child_paths = os.listdir(parent_path) 51 | for child_path in child_paths: 52 | tens_path = os.path.join(parent_path, child_path) 53 | if os.path.isdir(tens_path): 54 | return True 55 | return False 56 | 57 | 58 | def plot_from_paths(path_list, label_list, tag='evaluate_data/eval_episode_reward', smooth_weight=0.6): 59 | """ 60 | Plot tensorboard file from paths from path_list and with label from label_list on one figure 61 | """ 62 | for i in range(len(path_list)): 63 | if is_parent_path(path_list[i]): 64 | df_temp = get_pd_from_parent_path(path_list[i], tag=tag) 65 | else: 66 | df_temp = get_pd(path_list[i], tag=tag) 67 | if smooth_weight > 0: 68 | smooth(df_temp, "value", weight=smooth_weight) 69 | sns.lineplot(x="step", y="value", data=df_temp, label=label_list[i]) 70 | else: 71 | sns.lineplot(x="step", y="value", data=df_temp, label=label_list[i]) 72 | plt.legend(loc="upper left") 73 | plt.xlabel("time step", fontsize=13) 74 | plt.ylabel("average reward", fontsize=13) 75 | plt.show() 76 | 77 | 78 | if __name__ == '__main__': 79 | path_list = ["E:/PycharmProjects/RL_Algorithms/run/results/bcq/Hopper-v0/medium-expert", 80 | "E:/PycharmProjects/RL_Algorithms/run/results/bear/Hopper-v0/medium-expert", 81 | "E:/PycharmProjects/RL_Algorithms/run/results/cql/Hopper-v0/medium-expert", 82 | ] 83 | 84 | label_list = ["BCQ", 85 | "BEAR", 86 | "CQL", 87 | ] 88 | 89 | plot_from_paths(path_list, label_list, smooth_weight=0.7) 90 | -------------------------------------------------------------------------------- /utils/log_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.tensorboard import SummaryWriter 3 | 4 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | 6 | 7 | def make_dir(path): 8 | if not os.path.exists(path): 9 | os.makedirs(path) 10 | return path 11 | 12 | 13 | def del_all_files_in_dir(path): 14 | ls = os.listdir(path) 15 | for file in ls: 16 | os.remove(os.path.join(path, file)) 17 | 18 | 19 | class TensorboardLogger: 20 | def __init__(self, log_dir): 21 | self.log_dir = log_dir 22 | self.writer = SummaryWriter(log_dir) 23 | 24 | def log_train_data(self, log_datas: dict, step): 25 | for log_data in log_datas.items(): 26 | self.writer.add_scalar("train_data/" + log_data[0], log_data[1], step) 27 | self.writer.flush() 28 | 29 | def log_learn_data(self, log_datas: dict, step): 30 | for log_data in log_datas.items(): 31 | self.writer.add_scalar("learn_data/" + log_data[0], log_data[1], step) 32 | self.writer.flush() 33 | 34 | def log_eval_data(self, log_datas: dict, step): 35 | for log_data in log_datas.items(): 36 | self.writer.add_scalar("evaluate_data/" + log_data[0], log_data[1], step) 37 | self.writer.flush() 38 | -------------------------------------------------------------------------------- /utils/train_tools.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import Env 3 | from tqdm import tqdm 4 | from common.buffers import ReplayBuffer 5 | import numpy as np 6 | import copy 7 | 8 | EVAL_SEED = 10 # used for evaluation env's seed 9 | 10 | 11 | def hard_target_update(main, target): 12 | target.load_state_dict(main.state_dict()) 13 | 14 | 15 | def soft_target_update(main, target, tau=0.005): 16 | for main_param, target_param in 
zip(main.parameters(), target.parameters()): 17 | target_param.data.copy_(tau * main_param.data + (1.0 - tau) * target_param.data) 18 | 19 | 20 | def explore_before_train(env: Env, buffer, explore_step): 21 | obs = env.reset() 22 | done = False 23 | t = tqdm(range(explore_step)) 24 | t.set_description("explore before train") 25 | for _ in t: 26 | action = env.action_space.sample() 27 | next_obs, reward, done, info = env.step(action) 28 | buffer.add(obs, action, reward, next_obs, done) 29 | 30 | if done: 31 | obs = env.reset() 32 | done = False 33 | else: 34 | obs = next_obs 35 | 36 | 37 | def evaluate(agent, episode_num, seed_offset=100, show=False): 38 | if show: 39 | agent.load_agent_checkpoint() 40 | eval_env = copy.deepcopy(agent.env) 41 | eval_env.seed(EVAL_SEED + seed_offset) # reset environment's seed for evaluate(the seed will not be copied by deepcopy) 42 | total_reward = 0 43 | total_length = 0 44 | print("---------------------------------- evaluating at time step {} ----------------------------------".format(agent.train_step)) 45 | for i in range(episode_num): 46 | episode_reward = 0 47 | episode_length = 0 48 | obs, done = eval_env.reset(), False 49 | while not done: 50 | if show: 51 | eval_env.render() 52 | action = agent.choose_action(obs, eval=True) 53 | action = action[0] if isinstance(action, tuple) else action 54 | obs, reward, done, _ = eval_env.step(action) 55 | episode_reward += reward 56 | episode_length += 1 57 | if done: 58 | total_reward += episode_reward 59 | total_length += episode_length 60 | if show: 61 | print("episode:{} \t step length: {} \t reward: {:.2f}".format(i + 1, episode_length, episode_reward)) 62 | 63 | avg_reward = total_reward / episode_num 64 | avg_length = total_length / episode_num 65 | 66 | print("=====> evaluate {} episode <===> average step length: {:.2f} <===> average reward: {:.2f} <=====".format(episode_num, avg_length, avg_reward)) 67 | print("---------------------------------------------------------------------------------------------------") 68 | 69 | evaluate_summaries = {"eval_episode_length": avg_length, "eval_episode_reward": avg_reward} 70 | return evaluate_summaries 71 | 72 | 73 | def evaluate_unity(agent, episode_num): 74 | agent.load_agent_checkpoint() 75 | eval_env = agent.env 76 | total_reward = 0 77 | total_length = 0 78 | print("---------------------------------- evaluating at time step {} ----------------------------------".format(agent.train_step)) 79 | for i in range(episode_num): 80 | episode_reward = 0 81 | episode_length = 0 82 | obs, done = eval_env.reset(), False 83 | while not done: 84 | action = agent.choose_action(obs, eval=True) 85 | action = action[0] if isinstance(action, tuple) else action 86 | obs, reward, done, _ = eval_env.step(action) 87 | episode_reward += reward 88 | episode_length += 1 89 | if done: 90 | total_reward += episode_reward 91 | total_length += episode_length 92 | print("episode:{} \t step length: {} \t reward: {:.2f}".format(i + 1, episode_length, episode_reward)) 93 | 94 | avg_reward = total_reward / episode_num 95 | avg_length = total_length / episode_num 96 | 97 | print("=====> evaluate {} episode <===> average step length: {:.2f} <===> average reward: {:.2f} <=====".format(episode_num, avg_length, avg_reward)) 98 | print("---------------------------------------------------------------------------------------------------") 99 | 100 | return avg_reward, avg_length 101 | 102 | 103 | class OrnsteinUhlenbeckActionNoise: 104 | """ 105 | used in DDPG. 
OU noise 106 | """ 107 | def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2): 108 | self.action_dim = action_dim 109 | self.mu = mu 110 | self.theta = theta 111 | self.sigma = sigma 112 | self.X = np.ones(self.action_dim) * self.mu 113 | 114 | def reset(self): 115 | self.X = np.ones(self.action_dim) * self.mu 116 | 117 | def sample(self): 118 | dx = self.theta * (self.mu - self.X) 119 | dx = dx + self.sigma * np.random.randn(len(self.X)) 120 | self.X = self.X + dx 121 | return self.X 122 | 123 | 124 | if __name__ == '__main__': 125 | a = OrnsteinUhlenbeckActionNoise(action_dim=3) 126 | print(a.sample()) 127 | --------------------------------------------------------------------------------
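A minimal sketch of how the OU noise above is typically used for DDPG-style exploration; it is not part of the repository files, and the act_bound value and the zero "deterministic action" are hypothetical placeholders standing in for the environment's action bound and the actor network's output.

from utils.train_tools import OrnsteinUhlenbeckActionNoise
import numpy as np

act_dim, act_bound = 3, 1.0                      # hypothetical action dimension and bound
ou_noise = OrnsteinUhlenbeckActionNoise(action_dim=act_dim)

ou_noise.reset()                                 # typically called at the start of each episode
deterministic_action = np.zeros(act_dim)         # stand-in for the actor network's output
# Add temporally correlated exploration noise and keep the action within its bound.
noisy_action = np.clip(deterministic_action + ou_noise.sample(), -act_bound, act_bound)
print(noisy_action)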