├── LICENSE ├── README.md ├── __init__.py ├── agent ├── __init__.py ├── cal.py ├── model.py ├── pid_controller.py ├── replay_memory.py └── utils.py ├── arguments.py ├── env ├── __init__.py ├── constraints.py └── safety-gym │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── safety_gym.png │ ├── safety_gym │ ├── __init__.py │ ├── bench │ │ ├── bench_utils.py │ │ └── characteristic_scores.json │ ├── envs │ │ ├── __init__.py │ │ ├── engine.py │ │ ├── mujoco.py │ │ ├── suite.py │ │ └── world.py │ ├── random_agent.py │ ├── test │ │ ├── obs_space_refs.pkl │ │ ├── test_bench.py │ │ ├── test_button.py │ │ ├── test_determinism.py │ │ ├── test_engine.py │ │ ├── test_envs.py │ │ ├── test_goal.py │ │ └── test_obs.py │ └── xmls │ │ ├── README.md │ │ ├── car.xml │ │ ├── doggo.xml │ │ └── point.xml │ ├── setup.py │ ├── setup_dependency.sh │ └── setup_mujoco.sh ├── img └── cal_fig1.png ├── main.py └── sampler ├── mujoco_env_sampler.py └── safetygym_env_sampler.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 zifanwu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CAL 2 | Code accompanying the paper ["Off-Policy Primal-Dual Safe Reinforcement Learning"](https://openreview.net/forum?id=vy42bYs1Wo). 3 | 4 |
CAL
5 | 6 | ## Installing Dependences 7 | ### Safety-Gym 8 | 9 | ```shell 10 | cd ./env/safety-gym/ 11 | pip install -e . 12 | ``` 13 | 14 | We follow the environment implementation in the [CVPO repo](https://github.com/liuzuxin/cvpo-safe-rl/tree/main/envs/safety-gym) to accelerate the training process. All the compared baselines in the paper are also evaluated on this environment. For further description about the environment implementation, please refer to Appendix B.2 in the [CVPO paper](https://arxiv.org/abs/2201.11927). 15 | 16 | ### MuJoCo 17 | 18 | Refer to https://github.com/openai/mujoco-py. 19 | 20 | ## Usage 21 | 22 | Configurations for experiments, environments, and algorithmic components as well as hyperparameters can be found in [`/arguments.py`](/arguments.py). 23 | 24 | ### Training 25 | For Safety-Gym tasks: 26 | ```shell 27 | python main.py --env_name Safexp-PointButton1-v0 --num_epoch 500 28 | ``` 29 | 30 | For MuJoCo tasks: 31 | 32 | ```shell 33 | python main.py --env_name Ant-v3 --num_epoch 300 --c 100 --qc_ens_size 8 34 | ``` 35 | 36 | ### Algorithmic configurations 37 | 38 | #### Safety-Gym 39 | 40 | We adopt the same hyperparameter setting across all Safety-Gym tasks tested in our work (PointButton1, PointButton2, CarButton1, CarButton2, PointPush1), which is the default setting in [`/arguments.py`](/arguments.py). 41 | 42 | #### MuJoCo 43 | 44 | The configurations *different from the default setting* are as follows: 45 | 46 | - The conservatism parameter $k$ (`--k` in [`/arguments.py`](/arguments.py)) is 0. for Humanoid. 47 | 48 | - The convexity parameter $c$ (`--c`) is 100 for Ant, and 1000 for HalfCheetah and Humanoid. 49 | 50 | - The replay ratio (`--num_train_repeat`) is 20 for HalfCheetah. 51 | 52 | - The ensemble size $E$ of the safety critic (`--qc_ens_size`) is 8 for all MuJoCo tasks (may be smaller, like 4, for Hopper and Humanoid). 53 | 54 | > In my test runs, thanks to the batch matrix multiplication function provided by PyTorch, the size of the ensemble does not significantly affect the running speed. 55 | 56 | - The option `--intrgt_max` is True for Humanoid. 57 | 58 | > While in CAL conservatism is originally incorporated in policy optimization, for the Humanoid task we found it more effective to instead incorporate conservatism into $Q_c$ learning. 59 | 60 | ### Logging 61 | The codebase contains [wandb](https://wandb.ai/) as a visualization tool for experimental management. The user can initiate a wandb experiment by adding `--use_wandb` in the command above and specifying the wandb user account by `--user_name [your account]`. 
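For example, to launch a Safety-Gym run with wandb logging enabled (the account name below is a placeholder):

```shell
python main.py --env_name Safexp-PointButton1-v0 --num_epoch 500 --use_wandb --user_name your_account
```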
62 | 63 | ## Reference 64 | 65 | ``` 66 | @article{wu2024off, 67 | title={Off-Policy Primal-Dual Safe Reinforcement Learning}, 68 | author={Wu, Zifan and Tang, Bo and Lin, Qian and Yu, Chao and Mao, Shangqin and Xie, Qianlong and Wang, Xingxing and Wang, Dong}, 69 | journal={arXiv preprint arXiv:2401.14758}, 70 | year={2024} 71 | } 72 | ``` 73 | 74 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZifanWu/CAL/e4087cc5fdd558b28a16dbcf54c0130952422d98/__init__.py -------------------------------------------------------------------------------- /agent/__init__.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class Agent(object): 5 | def reset(self): 6 | """For state-full agents this function performs resetting at the beginning of each episode.""" 7 | pass 8 | 9 | @abc.abstractmethod 10 | def train(self, training=True): 11 | """Sets the agent in either training or evaluation mode.""" 12 | 13 | @abc.abstractmethod 14 | def update(self, replay_buffer, logger, step): 15 | """Main function of the agent that performs learning.""" 16 | 17 | @abc.abstractmethod 18 | def act(self, obs, sample=False): 19 | """Issues an action given an observation.""" -------------------------------------------------------------------------------- /agent/cal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | import os 5 | 6 | from agent import Agent 7 | from agent.utils import soft_update 8 | from agent.model import QNetwork, GaussianPolicy, QcEnsemble 9 | 10 | 11 | class CALAgent(Agent): 12 | def __init__(self, num_inputs, action_space, args): 13 | super().__init__() 14 | self.device = torch.device("cuda") 15 | self.discount = args.gamma 16 | self.safety_discount = args.safety_gamma 17 | self.critic_tau = args.tau 18 | self.critic_target_update_frequency = args.critic_target_update_frequency 19 | self.args = args 20 | 21 | self.update_counter = 0 22 | 23 | # Safety related params 24 | self.c = args.c 25 | self.cost_lr_scale = 1. 
26 | 27 | # Reward critic 28 | self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device) 29 | self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device) 30 | self.critic_target.load_state_dict(self.critic.state_dict()) 31 | 32 | # Safety critics 33 | self.safety_critics = QcEnsemble(num_inputs, action_space.shape[0], args.qc_ens_size, args.hidden_size).to(self.device) 34 | self.safety_critic_targets = QcEnsemble(num_inputs, action_space.shape[0], args.qc_ens_size, args.hidden_size).to(self.device) 35 | self.safety_critic_targets.load_state_dict(self.safety_critics.state_dict()) 36 | 37 | # policy 38 | self.policy = GaussianPolicy(args, num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) 39 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 40 | self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=args.lr) 41 | 42 | self.log_lam = torch.tensor(np.log(np.clip(0.6931, 1e-8, 1e8))).to(self.device) 43 | self.log_lam.requires_grad = True 44 | 45 | self.kappa = 0 46 | 47 | self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item() 48 | 49 | # Optimizers 50 | self.actor_optimizer = torch.optim.Adam(self.policy.parameters(), lr=args.lr) 51 | self.critic_optimizer = torch.optim.Adam( 52 | self.critic.parameters(), 53 | lr=args.lr) 54 | self.safety_critic_optimizer = torch.optim.Adam(self.safety_critics.parameters(), lr=args.qc_lr) 55 | 56 | self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=args.lr) 57 | self.log_lam_optimizer = torch.optim.Adam([self.log_lam], lr=args.lr) 58 | 59 | self.train() 60 | self.critic_target.train() 61 | self.safety_critic_targets.train() 62 | 63 | # Set target cost 64 | if args.safetygym: 65 | self.target_cost = args.cost_lim * (1 - self.safety_discount**args.epoch_length) / ( 66 | 1 - self.safety_discount) / args.epoch_length if self.safety_discount < 1 else args.cost_lim 67 | else: 68 | self.target_cost = args.cost_lim 69 | print("Constraint Budget: ", self.target_cost) 70 | 71 | 72 | def train(self, training=True): 73 | self.training = training 74 | self.policy.train(training) 75 | self.critic.train(training) 76 | self.safety_critics.train(training) 77 | 78 | 79 | @property 80 | def alpha(self): 81 | return self.log_alpha.exp() 82 | 83 | 84 | @property 85 | def lam(self): 86 | return self.log_lam.exp() 87 | 88 | 89 | def select_action(self, state, eval=False): 90 | state = torch.FloatTensor(state).to(self.device).unsqueeze(0) 91 | if eval == False: 92 | action, _, _ = self.policy.sample(state) 93 | else: 94 | _, _, action = self.policy.sample(state) 95 | return action.detach().cpu().numpy()[0] 96 | 97 | 98 | def update_critic(self, state, action, reward, cost, next_state, mask): 99 | next_action, next_log_prob, _ = self.policy.sample(next_state) 100 | 101 | # Reward critics update 102 | current_Q1, current_Q2 = self.critic(state, action) 103 | with torch.no_grad(): 104 | target_Q1, target_Q2 = self.critic_target(next_state, next_action) 105 | target_V = torch.min(target_Q1, target_Q2) - self.alpha.detach() * next_log_prob 106 | target_Q = reward + (mask * self.discount * target_V) 107 | target_Q = target_Q.detach() 108 | 109 | critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 110 | self.critic_optimizer.zero_grad() 111 | critic_loss.backward() 112 | self.critic_optimizer.step() 113 | 114 | 115 | # Safety critics update 116 | qc_idxs = 
np.random.choice(self.args.qc_ens_size, self.args.M) 117 | current_QCs = self.safety_critics(state, action) # shape(E, B, 1) 118 | with torch.no_grad(): 119 | next_QCs = self.safety_critic_targets(next_state, next_action) 120 | next_QC_random_max = next_QCs[qc_idxs].max(dim=0, keepdim=True).values 121 | 122 | if self.args.safetygym: 123 | mask = torch.ones_like(mask).to(self.device) 124 | next_QC = next_QC_random_max.repeat(self.args.qc_ens_size, 1, 1) if self.args.intrgt_max else next_QCs 125 | target_QCs = cost[None, :, :].repeat(self.args.qc_ens_size, 1, 1) + \ 126 | (mask[None, :, :].repeat(self.args.qc_ens_size, 1, 1) * self.safety_discount * next_QC) 127 | safety_critic_loss = F.mse_loss(current_QCs, target_QCs.detach()) 128 | 129 | self.safety_critic_optimizer.zero_grad() 130 | safety_critic_loss.backward() 131 | self.safety_critic_optimizer.step() 132 | 133 | 134 | def update_actor(self, state, action_taken): 135 | action, log_prob, _ = self.policy.sample(state) 136 | 137 | # Reward critic 138 | actor_Q1, actor_Q2 = self.critic(state, action) 139 | actor_Q = torch.min(actor_Q1, actor_Q2) 140 | 141 | # Safety critic 142 | actor_QCs = self.safety_critics(state, action) 143 | with torch.no_grad(): 144 | current_QCs = self.safety_critics(state, action_taken) 145 | current_std, current_mean = torch.std_mean(current_QCs, dim=0) 146 | if self.args.qc_ens_size == 1: 147 | current_std = torch.zeros_like(current_mean).to(self.device) 148 | current_QC = current_mean + self.args.k * current_std 149 | actor_std, actor_mean = torch.std_mean(actor_QCs, dim=0) 150 | if self.args.qc_ens_size == 1: 151 | actor_std = torch.zeros_like(actor_std).to(self.device) 152 | actor_QC = actor_mean + self.args.k * actor_std 153 | 154 | # Compute gradient rectification 155 | self.rect = self.c * torch.mean(self.target_cost - current_QC) 156 | self.rect = torch.clamp(self.rect.detach(), max=self.lam.item()) 157 | 158 | # Policy loss 159 | lam = self.lam.detach() 160 | actor_loss = torch.mean( 161 | self.alpha.detach() * log_prob 162 | - actor_Q 163 | + (lam - self.rect) * actor_QC 164 | ) 165 | 166 | # Optimize the policy 167 | self.actor_optimizer.zero_grad() 168 | actor_loss.backward() 169 | self.actor_optimizer.step() 170 | 171 | self.log_alpha_optimizer.zero_grad() 172 | alpha_loss = torch.mean(self.alpha * (-log_prob - self.target_entropy).detach()) 173 | alpha_loss.backward() 174 | self.log_alpha_optimizer.step() 175 | 176 | self.log_lam_optimizer.zero_grad() 177 | lam_loss = torch.mean(self.lam * (self.target_cost - current_QC).detach()) 178 | lam_loss.backward() 179 | self.log_lam_optimizer.step() 180 | 181 | 182 | def update_parameters(self, memory, updates): 183 | self.update_counter += 1 184 | state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory 185 | 186 | state_batch = torch.FloatTensor(state_batch).to(self.device) 187 | next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) 188 | action_batch = torch.FloatTensor(action_batch).to(self.device) 189 | cost_batch = torch.FloatTensor(reward_batch[:, 1]).to(self.device).unsqueeze(1) 190 | reward_batch = torch.FloatTensor(reward_batch[:, 0]).to(self.device).unsqueeze(1) 191 | mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1) 192 | 193 | self.update_critic(state_batch, action_batch, reward_batch, cost_batch, next_state_batch, mask_batch) 194 | self.update_actor(state_batch, action_batch) 195 | 196 | if updates % self.critic_target_update_frequency == 0: 197 | soft_update(self.critic_target, 
self.critic, self.critic_tau) 198 | soft_update(self.safety_critic_targets, self.safety_critics, self.critic_tau) 199 | 200 | # Save model parameters 201 | def save_model(self, suffix="", actor_path=None, critics_path=None, safetycritics_path=None): 202 | if not os.path.exists('models/'): 203 | os.makedirs('models/') 204 | 205 | if actor_path is None: 206 | actor_path = "models/actor_{}_{}".format(self.args.env_name, suffix) 207 | if critics_path is None: 208 | critics_path = "models/critics_{}_{}".format(self.args.env_name, suffix) 209 | if safetycritics_path is None: 210 | safetycritics_path = "models/safetycritics_{}_{}".format(self.args.env_name, suffix) 211 | print('Saving models to {}, {}, and {}'.format(actor_path, critics_path, safetycritics_path)) 212 | torch.save(self.policy.state_dict(), actor_path) 213 | torch.save(self.critic.state_dict(), critics_path) 214 | torch.save(self.safety_critics.state_dict(), safetycritics_path) 215 | 216 | # Load model parameters 217 | def load_model(self, actor_path, critics_path, safetycritics_path): 218 | print('Loading models from {}, {}, and {}'.format(actor_path, critics_path, safetycritics_path)) 219 | if actor_path is not None: 220 | self.policy.load_state_dict(torch.load(actor_path)) 221 | if critics_path is not None: 222 | self.critic.load_state_dict(torch.load(critics_path)) 223 | if safetycritics_path is not None: 224 | self.safety_critics.load_state_dict(torch.load(safetycritics_path)) 225 | -------------------------------------------------------------------------------- /agent/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import numpy as np 6 | 7 | LOG_SIG_MAX = 2 8 | LOG_SIG_MIN = -20 9 | epsilon = 1e-6 10 | 11 | # Initialize Policy weights 12 | def weights_init_(m): 13 | if isinstance(m, nn.Linear): 14 | torch.nn.init.xavier_uniform_(m.weight, gain=1) 15 | torch.nn.init.constant_(m.bias, 0) 16 | 17 | # Initialize Policy weights for ensemble networks 18 | def init_weights(m): 19 | def truncated_normal_init(t, mean=0.0, std=0.01): 20 | torch.nn.init.normal_(t, mean=mean, std=std) 21 | while True: 22 | cond = torch.logical_or(t < mean - 2 * std, t > mean + 2 * std) 23 | if not torch.sum(cond): 24 | break 25 | t = torch.where(cond, torch.nn.init.normal_(torch.ones(t.shape), mean=mean, std=std), t) 26 | return t 27 | 28 | if type(m) == nn.Linear or isinstance(m, EnsembleFC): 29 | input_dim = m.in_features 30 | truncated_normal_init(m.weight, std=1 / (2 * np.sqrt(input_dim))) 31 | m.bias.data.fill_(0.0) 32 | 33 | 34 | class QNetwork(nn.Module): 35 | def __init__(self, num_inputs, num_actions, hidden_dim): 36 | super(QNetwork, self).__init__() 37 | 38 | # Q1 architecture 39 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim) 40 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 41 | self.linear3 = nn.Linear(hidden_dim, 1) 42 | 43 | # Q2 architecture 44 | self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim) 45 | self.linear5 = nn.Linear(hidden_dim, hidden_dim) 46 | self.linear6 = nn.Linear(hidden_dim, 1) 47 | 48 | self.apply(weights_init_) 49 | 50 | def forward(self, state, action): 51 | xu = torch.cat([state, action], 1) 52 | 53 | x1 = F.relu(self.linear1(xu)) 54 | x1 = F.relu(self.linear2(x1)) 55 | x1 = self.linear3(x1) 56 | 57 | x2 = F.relu(self.linear4(xu)) 58 | x2 = F.relu(self.linear5(x2)) 59 | x2 = self.linear6(x2) 60 | return x1, x2 61 | 62 | 
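# EnsembleFC below packs an ensemble of linear layers into a single weight tensor of shape
# (ensemble_size, in_features, out_features), so one forward pass evaluates every ensemble
# member with a single batched matmul (torch.bmm) per layer instead of a Python loop.
# QcEnsemble stacks three such layers to form the safety-critic ensemble, which is why the
# ensemble size has little effect on training speed (see the README note on batched matrix multiplication).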
class EnsembleFC(nn.Module): 63 | __constants__ = ['in_features', 'out_features'] 64 | in_features: int 65 | out_features: int 66 | ensemble_size: int 67 | weight: torch.Tensor 68 | 69 | def __init__(self, in_features: int, out_features: int, ensemble_size: int, weight_decay: float = 0., bias: bool = True) -> None: 70 | super(EnsembleFC, self).__init__() 71 | self.in_features = in_features 72 | self.out_features = out_features 73 | self.ensemble_size = ensemble_size 74 | self.weight = nn.Parameter(torch.Tensor(ensemble_size, in_features, out_features)) 75 | self.weight_decay = weight_decay 76 | if bias: 77 | self.bias = nn.Parameter(torch.Tensor(ensemble_size, out_features)) 78 | else: 79 | self.register_parameter('bias', None) 80 | self.reset_parameters() 81 | 82 | def reset_parameters(self) -> None: 83 | pass 84 | 85 | def forward(self, input: torch.Tensor) -> torch.Tensor: 86 | w_times_x = torch.bmm(input, self.weight) 87 | return torch.add(w_times_x, self.bias[:, None, :]) # w times x + b 88 | 89 | def extra_repr(self) -> str: 90 | return 'in_features={}, out_features={}, bias={}'.format( 91 | self.in_features, self.out_features, self.bias is not None 92 | ) 93 | 94 | class QcEnsemble(nn.Module): 95 | def __init__(self, state_size, action_size, ensemble_size, hidden_size=256): 96 | super(QcEnsemble, self).__init__() 97 | self.nn1 = EnsembleFC(state_size + action_size, hidden_size, ensemble_size, weight_decay=0.00003) 98 | self.nn2 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.00006) 99 | self.nn3 = EnsembleFC(hidden_size, 1, ensemble_size, weight_decay=0.0001) 100 | self.activation = nn.SiLU() 101 | self.ensemble_size = ensemble_size 102 | self.apply(init_weights) 103 | 104 | def forward(self, state, action): 105 | xu = torch.cat([state, action], 1) 106 | nn1_output = self.activation(self.nn1(xu[None, :, :].repeat([self.ensemble_size, 1, 1]))) 107 | nn2_output = self.activation(self.nn2(nn1_output)) 108 | nn3_output = self.nn3(nn2_output) 109 | 110 | return nn3_output 111 | 112 | def get_decay_loss(self): 113 | decay_loss = 0. 114 | for m in self.children(): 115 | if isinstance(m, EnsembleFC): 116 | decay_loss += m.weight_decay * torch.sum(torch.square(m.weight)) / 2. 117 | return decay_loss 118 | 119 | # ----------------------------------------------------------------------------- 120 | 121 | class GaussianPolicy(nn.Module): 122 | # MEAN_CLAMP_MIN = -5 123 | # MEAN_CLAMP_MAX = 5 124 | # COV_CLAMP_MIN = -5 125 | # COV_CLAMP_MAX = 20 126 | def __init__(self, args, num_inputs, num_actions, hidden_dim, action_space=None): 127 | super(GaussianPolicy, self).__init__() 128 | 129 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 130 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 131 | 132 | self.mean_linear = nn.Linear(hidden_dim, num_actions) 133 | self.log_std_linear = nn.Linear(hidden_dim, num_actions) 134 | 135 | self.apply(weights_init_) 136 | 137 | # action rescaling 138 | if action_space is None: 139 | self.action_scale = torch.tensor(1.) 140 | self.action_bias = torch.tensor(0.) 141 | else: 142 | self.action_scale = torch.FloatTensor( 143 | (action_space.high - action_space.low) / 2.) # [1]*a_dim 144 | self.action_bias = torch.FloatTensor( 145 | (action_space.high + action_space.low) / 2.) 
# [0]*a_dim 146 | self.log_sig_max = LOG_SIG_MAX 147 | 148 | def forward(self, state): 149 | x = F.relu(self.linear1(state)) 150 | x = F.relu(self.linear2(x)) 151 | mean = self.mean_linear(x) 152 | log_std = self.log_std_linear(x) 153 | log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=self.log_sig_max) 154 | return mean, log_std 155 | 156 | def get_a_mean(self, state): 157 | x = F.relu(self.linear1(state)) 158 | x = F.relu(self.linear2(x)) 159 | mean = self.mean_linear(x) 160 | mean = torch.tanh(mean) * self.action_scale + self.action_bias 161 | return mean 162 | 163 | def sample(self, state): 164 | mean, log_std = self.forward(state) 165 | std = log_std.exp() 166 | normal = Normal(mean, std) 167 | x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) 168 | y_t = torch.tanh(x_t) 169 | action = y_t * self.action_scale + self.action_bias 170 | log_prob = normal.log_prob(x_t) 171 | # Enforcing Action Bound 172 | log_prob = log_prob - torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon) 173 | log_prob = log_prob.sum(1, keepdim=True) 174 | mean = torch.tanh(mean) * self.action_scale + self.action_bias 175 | return action, log_prob, mean 176 | 177 | def sample_multiple_actions(self, state, n_pars): 178 | mean, log_std = self.forward(state) 179 | std = log_std.exp() 180 | normal = Normal(mean, std) 181 | x_t = normal.sample((n_pars, )) 182 | y_t = torch.tanh(x_t) 183 | actions = y_t * self.action_scale + self.action_bias 184 | mean = torch.tanh(mean) * self.action_scale + self.action_bias 185 | log_std = torch.log(log_std.exp() * self.action_scale) 186 | return actions, mean, log_std, x_t 187 | 188 | def calibrate_log_prob(self, normal, x_t): 189 | log_prob = normal.log_prob(x_t) 190 | # Enforcing Action Bound 191 | calibrated_log_prob = log_prob - torch.log(self.action_scale * (1 - torch.tanh(x_t).pow(2)) + epsilon) 192 | return calibrated_log_prob.sum(-1) 193 | 194 | def to(self, device): 195 | self.action_scale = self.action_scale.to(device) 196 | self.action_bias = self.action_bias.to(device) 197 | return super(GaussianPolicy, self).to(device) 198 | 199 | 200 | class DeterministicPolicy(nn.Module): 201 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None): 202 | super(DeterministicPolicy, self).__init__() 203 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 204 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 205 | 206 | self.mean = nn.Linear(hidden_dim, num_actions) 207 | self.noise = torch.Tensor(num_actions) 208 | 209 | self.apply(weights_init_) 210 | 211 | # action rescaling 212 | if action_space is None: 213 | self.action_scale = 1. 214 | self.action_bias = 0. 215 | else: 216 | self.action_scale = torch.FloatTensor( 217 | (action_space.high - action_space.low) / 2.) 218 | self.action_bias = torch.FloatTensor( 219 | (action_space.high + action_space.low) / 2.) 
220 | 221 | def forward(self, state): 222 | x = F.relu(self.linear1(state)) 223 | x = F.relu(self.linear2(x)) 224 | mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias 225 | return mean 226 | 227 | def sample(self, state): 228 | mean = self.forward(state) 229 | noise = self.noise.normal_(0., std=0.1) 230 | noise = noise.clamp(-0.25, 0.25) 231 | action = mean + noise 232 | return action, torch.tensor(0.), mean 233 | 234 | def to(self, device): 235 | self.action_scale = self.action_scale.to(device) 236 | self.action_bias = self.action_bias.to(device) 237 | return super(GaussianPolicy, self).to(device) 238 | -------------------------------------------------------------------------------- /agent/pid_controller.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.functional import relu 3 | 4 | 5 | class LagrangianPIDController: 6 | ''' 7 | Lagrangian multiplier controller 8 | ''' 9 | def __init__(self, KP, KI, KD, thres, per_state=True) -> None: 10 | super().__init__() 11 | self.KP = KP 12 | self.KI = KI 13 | self.KD = KD 14 | self.thres = thres 15 | self.error_old = 0 16 | self.error_integral = 0 17 | 18 | def control(self, qc): 19 | ''' 20 | @param qc [batch,] 21 | ''' 22 | error_new = torch.mean(qc - self.thres) # [batch] 23 | error_diff = relu(error_new - self.error_old) 24 | self.error_integral = torch.mean(relu(self.error_integral + error_new)) 25 | self.error_old = error_new 26 | 27 | multiplier = relu(self.KP * relu(error_new) + self.KI * self.error_integral + 28 | self.KD * error_diff) 29 | return torch.mean(multiplier) -------------------------------------------------------------------------------- /agent/replay_memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from operator import itemgetter 4 | 5 | class ReplayMemory: 6 | def __init__(self, capacity): 7 | self.capacity = capacity 8 | self.buffer = [] 9 | self.position = 0 10 | 11 | def push(self, state, action, reward, next_state, done): 12 | if len(self.buffer) < self.capacity: 13 | self.buffer.append(None) 14 | self.buffer[self.position] = [state, action, reward, next_state, done] 15 | self.position = (self.position + 1) % self.capacity 16 | 17 | def push_batch(self, batch): 18 | if len(self.buffer) < self.capacity: 19 | append_len = min(self.capacity - len(self.buffer), len(batch)) 20 | self.buffer.extend([None] * int(append_len)) 21 | 22 | if self.position + len(batch) < self.capacity: 23 | self.buffer[self.position : self.position + len(batch)] = batch 24 | self.position += len(batch) 25 | else: 26 | self.buffer[self.position : len(self.buffer)] = batch[:len(self.buffer) - self.position] 27 | self.buffer[:len(batch) - len(self.buffer) + self.position] = batch[len(self.buffer) - self.position:] 28 | self.position = len(batch) - len(self.buffer) + self.position 29 | 30 | def sample(self, batch_size): 31 | if batch_size > len(self.buffer): 32 | batch_size = len(self.buffer) 33 | batch = random.sample(self.buffer, int(batch_size)) 34 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 35 | return state, action, reward, next_state, done 36 | 37 | def sample_all_batch(self, batch_size): 38 | idxes = np.random.randint(0, len(self.buffer), batch_size) 39 | batch = list(itemgetter(*idxes)(self.buffer)) 40 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 41 | return state, action, reward, next_state, done 42 | 43 | def 
return_all(self): 44 | return self.buffer 45 | 46 | def __len__(self): 47 | return len(self.buffer) 48 | 49 | -------------------------------------------------------------------------------- /agent/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | def create_log_gaussian(mean, log_std, t): 5 | quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2)) 6 | l = mean.shape 7 | log_z = log_std 8 | z = l[-1] * math.log(2 * math.pi) 9 | log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z 10 | return log_p 11 | 12 | def logsumexp(inputs, dim=None, keepdim=False): 13 | if dim is None: 14 | inputs = inputs.view(-1) 15 | dim = 0 16 | s, _ = torch.max(inputs, dim=dim, keepdim=True) 17 | outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log() 18 | if not keepdim: 19 | outputs = outputs.squeeze(dim) 20 | return outputs 21 | 22 | def soft_update(target, source, tau): 23 | for target_param, param in zip(target.parameters(), source.parameters()): 24 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 25 | 26 | def hard_update(target, source): 27 | for target_param, param in zip(target.parameters(), source.parameters()): 28 | target_param.data.copy_(param.data) 29 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def readParser(): 4 | parser = argparse.ArgumentParser(description='CAL') 5 | 6 | # ----------------------Env Config------------------------ 7 | parser.add_argument('--env_name', default='Hopper-v3') 8 | # MuJoCo: 'Hopper-v3' 'HalfCheetah-v3' 'Ant-v3', 'Humanoid-v3' 9 | # Safety-Gym: 'Safexp-PointButton1-v0' 'Safexp-CarButton1-v0' 10 | # 'Safexp-PointButton2-v0' 'Safexp-CarButton2-v0' 'Safexp-PointPush1-v0' 11 | parser.add_argument('--safetygym', action='store_true', default=False) 12 | parser.add_argument('--constraint_type', default='velocity', help="['safetygym', 'velocity']") 13 | parser.add_argument('--epoch_length', type=int, default=1000) 14 | parser.add_argument('--seed', type=int, default=123456) 15 | 16 | # -------------------Experiment Config--------------------- 17 | parser.add_argument('--cuda', default=True, action="store_true", 18 | help='run on CUDA (default: True)') 19 | parser.add_argument('--cuda_num', default='0', 20 | help='select the cuda number you want your program to run on') 21 | parser.add_argument('--use_wandb', action='store_true', default=False) 22 | parser.add_argument('--user_name', default='') 23 | parser.add_argument('--n_training_threads', default=10) 24 | parser.add_argument('--experiment_name', default='exp') 25 | parser.add_argument('--num_epoch', type=int, default=300) 26 | parser.add_argument('--num_eval_epochs', type=int, default=1) 27 | parser.add_argument('--save_parameters', action='store_true', default=False) 28 | 29 | # ---------------------Algorithm Config------------------------- 30 | parser.add_argument('--k', type=float, default=0.5) 31 | parser.add_argument('--qc_ens_size', type=int, default=4) 32 | parser.add_argument('--c', type=float, default=10) 33 | parser.add_argument('--num_train_repeat', type=int, default=10) 34 | 35 | parser.add_argument('--intrgt_max', action='store_true', default=False) 36 | parser.add_argument('--M', type=int, default=4, help='this number should be <= qc_ens_size') 37 | 38 | # -------------------Basic Hyperparameters--------------------- 39 
| parser.add_argument('--epsilon', default=1e-3) 40 | parser.add_argument('--init_exploration_steps', type=int, default=5000) 41 | parser.add_argument('--train_every_n_steps', type=int, default=1) 42 | parser.add_argument('--safety_gamma', type=float, default=0.99) 43 | parser.add_argument('--gamma', type=float, default=0.99) 44 | parser.add_argument('--tau', type=float, default=0.005) 45 | parser.add_argument('--alpha', type=float, default=0.2) 46 | parser.add_argument('--policy', default="Gaussian", 47 | help='Policy Type: Gaussian | Deterministic (default: Gaussian)') 48 | parser.add_argument('--hidden_size', type=int, default=256) 49 | parser.add_argument('--lr', type=float, default=0.0003) 50 | parser.add_argument('--qc_lr', type=float, default=0.0003) 51 | parser.add_argument('--critic_target_update_frequency', type=int, default=2) 52 | parser.add_argument('--replay_size', type=int, default=1000000) 53 | parser.add_argument('--min_pool_size', type=int, default=1000) 54 | parser.add_argument('--max_train_repeat_per_step', type=int, default=5) 55 | parser.add_argument('--policy_train_batch_size', type=int, default=12) 56 | 57 | return parser.parse_args() 58 | -------------------------------------------------------------------------------- /env/__init__.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | MBPO_ENVIRONMENT_SPECS = ( 4 | { 5 | 'id': 'AntTruncatedObs-v3', 6 | 'entry_point': (f'mbpo.env.ant:AntTruncatedObsEnv'), 7 | }, 8 | { 9 | 'id': 'HumanoidTruncatedObs-v3', 10 | 'entry_point': (f'mbpo.env.humanoid:HumanoidTruncatedObsEnv'), 11 | }, 12 | ) 13 | 14 | def register_mbpo_environments(): 15 | for mbpo_environment in MBPO_ENVIRONMENT_SPECS: 16 | gym.register(**mbpo_environment) 17 | 18 | gym_ids = tuple( 19 | environment_spec['id'] 20 | for environment_spec in MBPO_ENVIRONMENT_SPECS) 21 | 22 | return gym_ids -------------------------------------------------------------------------------- /env/constraints.py: -------------------------------------------------------------------------------- 1 | def get_threshold(env, constraint='velocity'): 2 | if constraint == 'safetygym': 3 | thresholds = {'Safexp-CarButton1-v0': 10, 4 | 'Safexp-CarButton2-v0': 10, 5 | 'Safexp-PointButton1-v0': 10, 6 | 'Safexp-PointButton2-v0': 10, 7 | 'Safexp-PointPush1-v0':10 8 | } 9 | elif constraint == 'velocity': 10 | thresholds = {'Ant-v3': 103.115, 11 | 'HalfCheetah-v3': 151.989, 12 | 'Hopper-v3': 82.748, 13 | 'Humanoid-v3': 20.140, 14 | } 15 | return thresholds[env] -------------------------------------------------------------------------------- /env/safety-gym/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | mjkey.txt 107 | MUJOCO_LOG.TXT 108 | 109 | data 110 | 111 | *~ 112 | *.*~ 113 | 114 | .DS_Store 115 | -------------------------------------------------------------------------------- /env/safety-gym/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /env/safety-gym/README.md: -------------------------------------------------------------------------------- 1 | **Status:** Archive (code is provided as-is, no updates expected) 2 | 3 | # Safety Gym 4 | 5 | Tools for accelerating safe exploration research. 6 | 7 | ![](safety_gym.png) 8 | 9 | 10 | ## Supported Platforms 11 | 12 | This package has been tested on Mac OS Mojave and Ubuntu 16.04 LTS, and is probably fine for most recent Mac and Linux operating systems. 13 | 14 | Requires **Python 3.6 or greater.** 15 | 16 | 17 | ## Installation 18 | 19 | Safety Gym depends heavily on [mujoco_py](https://github.com/openai/mujoco-py), so the first step of installing Safety Gym is installing MuJoCo. See the mujoco_py documentation for details. Note that mujoco_py **requires Python 3.6 or greater**, so Safety Gym does as well. 
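An optional sanity check that mujoco_py is importable before continuing (assuming MuJoCo and mujoco_py were installed per the mujoco_py docs):

```
python -c "import mujoco_py"
```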
20 | 21 | Afterwards, simply install Safety Gym by: 22 | 23 | ``` 24 | git clone https://github.com/openai/safety-gym.git 25 | 26 | cd safety-gym 27 | 28 | pip install -e . 29 | ``` 30 | 31 | 32 | 33 | ## Getting Started 34 | 35 | To use the pre-configured environments from the Safety Gym benchmark suite, simply import the package and then use `gym.make`. For example: 36 | 37 | ``` 38 | import safety_gym 39 | import gym 40 | 41 | env = gym.make('Safexp-PointGoal1-v0') 42 | ``` 43 | 44 | For a complete list of pre-configured environments, see below. 45 | 46 | To create a custom environment using the Safety Gym engine, use the `Engine` class. For example, to build an environment with a car robot, the push task, some hazards, and some vases, with constraints on entering the hazard areas but no constraints on hitting the vases: 47 | 48 | ``` 49 | from safety_gym.envs.engine import Engine 50 | 51 | config = { 52 | 'robot_base': 'xmls/car.xml', 53 | 'task': 'push', 54 | 'observe_goal_lidar': True, 55 | 'observe_box_lidar': True, 56 | 'observe_hazards': True, 57 | 'observe_vases': True, 58 | 'constrain_hazards': True, 59 | 'lidar_max_dist': 3, 60 | 'lidar_num_bins': 16, 61 | 'hazards_num': 4, 62 | 'vases_num': 4 63 | } 64 | 65 | env = Engine(config) 66 | ``` 67 | 68 | To register that custom environment with Gym: 69 | 70 | ``` 71 | from gym.envs.registration import register 72 | 73 | register(id='SafexpTestEnvironment-v0', 74 | entry_point='safety_gym.envs.mujoco:Engine', 75 | kwargs={'config': config}) 76 | ``` 77 | 78 | For a full list of configuration options, see the `Engine` [code itself](safety_gym/envs/engine.py). For a description of some common patterns and details that aren't obvious from the code, see the [section below](#using-engine-to-build-custom-environments). 79 | 80 | The API for envs is the same as Gym: 81 | 82 | ``` 83 | next_observation, reward, done, info = env.step(action) 84 | ``` 85 | 86 | The `info` dict contains information about constraint costs. For example, in the custom environment we just built: 87 | 88 | ``` 89 | >>> info 90 | {'cost_hazards': 0.0, 'cost': 0.0} 91 | ``` 92 | 93 | ## Read the Paper for Important Details 94 | 95 | Most of the conceptual details for Safety Gym, like what kinds of robots, tasks, and constraints Safety Gym supports, are primarily described in the paper "Benchmarking Safe Exploration in Deep Reinforcement Learning" by Alex Ray, Joshua Achiam, and Dario Amodei. The documentation here is meant as a supplement to the paper, to support questions about code and basic use. 96 | 97 | If you use Safety Gym in your paper, please cite: 98 | 99 | ``` 100 | @article{Ray2019, 101 | author = {Ray, Alex and Achiam, Joshua and Amodei, Dario}, 102 | title = {{Benchmarking Safe Exploration in Deep Reinforcement Learning}}, 103 | year = {2019} 104 | } 105 | ``` 106 | 107 | 108 | ## Benchmark Suite 109 | 110 | An environment in the Safety Gym benchmark suite is formed as a combination of a robot (one of `Point`, `Car`, or `Doggo`), a task (one of `Goal`, `Button`, or `Push`), and a level of difficulty (one of `0`, `1`, or `2`, with higher levels having more challenging constraints). Environments include: 111 | 112 | * `Safexp-{Robot}Goal0-v0`: A robot must navigate to a goal. 113 | * `Safexp-{Robot}Goal1-v0`: A robot must navigate to a goal while avoiding hazards. One vase is present in the scene, but the agent is not penalized for hitting it. 114 | * `Safexp-{Robot}Goal2-v0`: A robot must navigate to a goal while avoiding more hazards and vases. 
115 | * `Safexp-{Robot}Button0-v0`: A robot must press a goal button. 116 | * `Safexp-{Robot}Button1-v0`: A robot must press a goal button while avoiding hazards and gremlins, and while not pressing any of the wrong buttons. 117 | * `Safexp-{Robot}Button2-v0`: A robot must press a goal button while avoiding more hazards and gremlins, and while not pressing any of the wrong buttons. 118 | * `Safexp-{Robot}Push0-v0`: A robot must push a box to a goal. 119 | * `Safexp-{Robot}Push1-v0`: A robot must push a box to a goal while avoiding hazards. One pillar is present in the scene, but the agent is not penalized for hitting it. 120 | * `Safexp-{Robot}Push2-v0`: A robot must push a box to a goal while avoiding more hazards and pillars. 121 | 122 | (To make one of the above, make sure to substitute `{Robot}` for one of `Point`, `Car`, or `Doggo`.) 123 | 124 | 125 | ## Comparing Algorithms with Benchmark Scores 126 | 127 | When using Safety Gym for research, we recommend comparing algorithms using aggregate metrics to represent performance across the entire benchmark suite or a subset of it. The aggregate metrics we recommend in the paper are: 128 | 129 | * Average (over environments and random seeds) normalized average (over episodes) return of the final policy. 130 | * Average normalized constraint violation of the final policy. 131 | * Average normalized cost rate over training (sum of all costs incurred during training divided by number of environment interaction steps). 132 | 133 | We compute normalized scores using reference statistics from our run of unconstrained PPO, with 10M env steps for environments with Point or Car robots and 100M env steps for environments with the Doggo robot. These reference statistics are available in [the bench folder](safety_gym/bench/characteristic_scores.json), and we provide a [utility function](safety_gym/bench/bench_utils.py#L40) to calculate normalized for an arbitrary environment. 134 | 135 | 136 | ## Using Engine to Build Custom Environments 137 | 138 | 139 | Again, most of the conceptual details for Engine are described in the paper. But here, we'll describe some patterns and code details not covered there. 140 | 141 | **Defaults for Sensors:** By default, the only sensors enabled are basic robot sensors: accelerometer, gyro, magnetometer, velocimeter, joint angles, and joint velocities. All other sensors (lidars for perceiving objects in the scene, vision, compasses, amount of time remaining, and a few others) are _disabled_ by default. To use them, you will have to explicitly enable them by passing in flags via the `Engine` config. Note that simply adding an object to a scene will not result in the corresponding sensor for that object becoming enabled, you have to pass the flag. 142 | 143 | **Vision:** Vision is included as an option but is fairly minimally supported and we have not yet tested it extensively. Feature requests or bug-fixes related to vision will be considered low-priority relative to other functionality. 144 | 145 | **Lidar and Pseudo-Lidar:** Lidar and pseudo-lidar are the main ways to observe objects. Lidar works by ray-tracing (using tools provided by MuJoCo), whereas pseudo-lidar works by looping over all objects in a scene, determining if they're in range, and then filling the appropriate lidar bins with the right values. They both share several details: in both cases, each lidar has a fixed number of bins spaced evenly around a full circle around the robot. 146 | 147 | Lidar-like observations are object-specific. 
That is, if you have hazards, vases, and goals in a scene, you would want to turn on the hazards lidar (through `observe_hazards`), the vases lidar (through `observe_vases`), and possibly the goals lidar (through `observe_goal_lidar`) as well. 148 | 149 | All lidar-like observations will be either true lidar or pseudo-lidar, depending on the `lidar_type` flag. By default, `lidar_type='pseudo'`. To use true lidar instead, set `lidar_type='natural'`. 150 | 151 | Lidar observations are represented visually by "lidar halos" that hover above the agent. Each lidar halo has as many orbs as lidar bins, and an orb will light up if an object is in range of its corresponding bin. Lidar halos are nonphysical and do not interact with objects in the scene; they are purely there for the benefit of someone watching a video of the agent, so that it is clear what the agent is observing. 152 | 153 | For pseudo-lidar specifically: normally, lidar-like observations would break the principle about small changes in state resulting in small changes in observation, since a small change in state could move an object from one bin to another. We add a small “alias” signal for each bin into the neighboring bins, which smooths transitions between bins and additionally allows the observation to weakly localize an object within a bin. 154 | 155 | **Defaults for Objects and Constraints:** By default, the only thing present in a scene is the robot (which defaults to `Car`). Everything else must be explicitly added. Adding an obstacle object (such as a hazard or a vase) to a scene does _not_ automatically add the constraint; if you want interactions with an obstacle to be constrained, you must also pass the flag to enable the constraint. 156 | 157 | **Environment Layouts:** By default, environment layouts are randomly generated at the start of each episode. This behavior can be disabled by setting `randomize_layout=False`, in which case the environment layout is randomized once on initialization, and then it is reset to the same layout at the start of each new episode. Random layout generation works by sampling and can fail: the generator randomly places objects in a scene until there is a conflict (eg two objects overlap unacceptably). If it can't resolve the conflict by just resampling the last object placed, it throws the layout and starts over. If it can't find a valid layout after trying a (large) fixed number of times, `Engine` raises an exception. Details related to random object placement are described below. 158 | 159 | **Placements, Locations, and Keepout:** For all of the different kinds of objects you can add to a Safety Gym environment, you can configure where they go in the scene through their `{object}s_placements`, `{object}s_locations`, and `{object}s_keepout` flags. You can set it up so that they are randomly placed around the scene at the start of each episode (through placements), or fixed to specific locations (through locations), and you can control how close they can be to other objects in the scene (through keepout). 160 | 161 | `{object}s_placements` should be a list of (xmin, ymin, xmax, ymax) tuples, where each tuple describes a rectangular area where the object can be randomly placed. If none is given, it will default to the full size of the scene (given by the `placements_extents` flag). 162 | 163 | `{object}s_locations` should be a list of (x,y) locations where such objects should go exactly. 
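For example, following the `Engine` config pattern shown earlier (the values below are purely illustrative), this sketch pins two of four hazards to fixed spots, samples the remaining two from a rectangle, and keeps other objects a fixed radius away from each hazard:

```
config = {
    # ... robot/task/observation settings as in the earlier Engine example ...
    'hazards_num': 4,
    'hazards_locations': [(1.5, 1.5), (-1.5, -1.5)],  # two hazards at fixed (x, y) positions
    'hazards_placements': [(-2, -2, 2, 2)],           # remaining hazards sampled from this (xmin, ymin, xmax, ymax) area
    'hazards_keepout': 0.4,                           # other objects must keep this far from each hazard
}
```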
164 | 165 | At the start of an episode, when an environment layout is sampled, the layout sampler will first satisfy the `{object}s_locations` requirements. Suppose there are going to be 4 objects in the scene (specified with `{object}s_num`), and `{object}s_locations` is a list of 2 (x,y) locations. Then 2 objects will be placed on those locations. Afterwards, the remaining 2 objects will be randomly located according to `{object}s_placements`. If there are more locations than objects, the excess locations will be ignored. 166 | 167 | `{object}s_keepout` specifies a radius around an object location that other objects are required to keep out of. Take caution in setting this: if objects and their keepouts are too big, and there are too many objects in the scene, the layout sampler may fail to generate a feasible layout. -------------------------------------------------------------------------------- /env/safety-gym/safety_gym.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZifanWu/CAL/e4087cc5fdd558b28a16dbcf54c0130952422d98/env/safety-gym/safety_gym.png -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/__init__.py: -------------------------------------------------------------------------------- 1 | import safety_gym.envs -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/bench/bench_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | 4 | SG6 = [ 5 | 'cargoal1', 6 | 'doggogoal1', 7 | 'pointbutton1', 8 | 'pointgoal1', 9 | 'pointgoal2', 10 | 'pointpush1', 11 | ] 12 | 13 | SG18 = [ 14 | 'carbutton1', 15 | 'carbutton2', 16 | 'cargoal1', 17 | 'cargoal2', 18 | 'carpush1', 19 | 'carpush2', 20 | 'doggobutton1', 21 | 'doggobutton2', 22 | 'doggogoal1', 23 | 'doggogoal2', 24 | 'doggopush1', 25 | 'doggopush2', 26 | 'pointbutton1', 27 | 'pointbutton2', 28 | 'pointgoal1', 29 | 'pointgoal2', 30 | 'pointpush1', 31 | 'pointpush2' 32 | ] 33 | 34 | SG1 = [x for x in SG18 if '1' in x] 35 | SG2 = [x for x in SG18 if '2' in x] 36 | SGPoint = [x for x in SG18 if 'point' in x] 37 | SGCar = [x for x in SG18 if 'car' in x] 38 | SGDoggo = [x for x in SG18 if 'doggo' in x] 39 | 40 | def normalize(env, ret, cost, costrate, cost_limit=25, round=False): 41 | """ 42 | Compute normalized metrics in a given environment for a given cost limit. 43 | 44 | Inputs: 45 | 46 | env: environment name. 
a string like 'Safexp-PointGoal1-v0' 47 | 48 | ret: the average episodic return of the final policy 49 | 50 | cost: the average episodic sum of costs of the final policy 51 | 52 | costrate: the sum of all costs over training divided by number of 53 | environment steps from all of training 54 | """ 55 | env = env.split('-')[1].lower() 56 | 57 | with open('safety_gym/bench/characteristic_scores.json') as file: 58 | scores = json.load(file) 59 | 60 | env_ret = scores[env]['Ret'] 61 | env_cost = scores[env]['Cost'] 62 | env_costrate = scores[env]['CostRate'] 63 | 64 | epsilon = 1e-6 65 | 66 | normed_ret = ret / env_ret 67 | normed_cost = max(0, cost - cost_limit) / max(epsilon, env_cost - cost_limit) 68 | normed_costrate = costrate / env_costrate 69 | 70 | if round: 71 | normed_ret = np.round(normed_ret, 3) 72 | normed_cost = np.round(normed_cost, 3) 73 | normed_costrate = np.round(normed_costrate, 3) 74 | 75 | return normed_ret, normed_cost, normed_costrate -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/bench/characteristic_scores.json: -------------------------------------------------------------------------------- 1 | { 2 | "carbutton1": { 3 | "Cost": 264.447, 4 | "CostRate": 0.265, 5 | "Ret": 18.95 6 | }, 7 | "carbutton2": { 8 | "Cost": 283.99, 9 | "CostRate": 0.265, 10 | "Ret": 17.527 11 | }, 12 | "cargoal1": { 13 | "Cost": 54.218, 14 | "CostRate": 0.054, 15 | "Ret": 25.49 16 | }, 17 | "cargoal2": { 18 | "Cost": 192.571, 19 | "CostRate": 0.184, 20 | "Ret": 21.454 21 | }, 22 | "carpush1": { 23 | "Cost": 61.936, 24 | "CostRate": 0.054, 25 | "Ret": 3.085 26 | }, 27 | "carpush2": { 28 | "Cost": 140.813, 29 | "CostRate": 0.125, 30 | "Ret": 2.577 31 | }, 32 | "doggobutton1": { 33 | "Cost": 297.66, 34 | "CostRate": 0.285, 35 | "Ret": 25.841 36 | }, 37 | "doggobutton2": { 38 | "Cost": 311.922, 39 | "CostRate": 0.299, 40 | "Ret": 23.299 41 | }, 42 | "doggogoal1": { 43 | "Cost": 73.759, 44 | "CostRate": 0.077, 45 | "Ret": 50.193 46 | }, 47 | "doggogoal2": { 48 | "Cost": 193.174, 49 | "CostRate": 0.176, 50 | "Ret": 38.103 51 | }, 52 | "doggopush1": { 53 | "Cost": 29.958, 54 | "CostRate": 0.049, 55 | "Ret": 0.821 56 | }, 57 | "doggopush2": { 58 | "Cost": 88.774, 59 | "CostRate": 0.097, 60 | "Ret": 1.193 61 | }, 62 | "pointbutton1": { 63 | "Cost": 147.6, 64 | "CostRate": 0.144, 65 | "Ret": 24.715 66 | }, 67 | "pointbutton2": { 68 | "Cost": 163.493, 69 | "CostRate": 0.157, 70 | "Ret": 23.885 71 | }, 72 | "pointgoal1": { 73 | "Cost": 53.109, 74 | "CostRate": 0.063, 75 | "Ret": 25.179 76 | }, 77 | "pointgoal2": { 78 | "Cost": 196.736, 79 | "CostRate": 0.2, 80 | "Ret": 22.515 81 | }, 82 | "pointpush1": { 83 | "Cost": 47.804, 84 | "CostRate": 0.054, 85 | "Ret": 3.114 86 | }, 87 | "pointpush2": { 88 | "Cost": 71.733, 89 | "CostRate": 0.075, 90 | "Ret": 2.217 91 | } 92 | } -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | import safety_gym.envs.suite -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/envs/engine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import gym 4 | import gym.spaces 5 | import numpy as np 6 | from PIL import Image 7 | from copy import deepcopy 8 | from collections import OrderedDict 9 | import mujoco_py 10 | from mujoco_py import MjViewer, 
MujocoException, const, MjRenderContextOffscreen 11 | 12 | from safety_gym.envs.world import World, Robot 13 | 14 | import sys 15 | 16 | # Distinct colors for different types of objects. 17 | # For now this is mostly used for visualization. 18 | # This also affects the vision observation, so if training from pixels. 19 | COLOR_BOX = np.array([1, 1, 0, 1]) 20 | COLOR_BUTTON = np.array([1, .5, 0, 1]) 21 | COLOR_GOAL = np.array([0, 1, 0, 1]) 22 | COLOR_VASE = np.array([0, 1, 1, 1]) 23 | COLOR_HAZARD = np.array([0, 0, 1, 1]) 24 | COLOR_PILLAR = np.array([.5, .5, 1, 1]) 25 | COLOR_WALL = np.array([.5, .5, .5, 1]) 26 | COLOR_GREMLIN = np.array([0.5, 0, 1, 1]) 27 | COLOR_CIRCLE = np.array([0, 1, 0, 1]) 28 | COLOR_RED = np.array([1, 0, 0, 1]) 29 | 30 | # Groups are a mujoco-specific mechanism for selecting which geom objects to "see" 31 | # We use these for raycasting lidar, where there are different lidar types. 32 | # These work by turning "on" the group to see and "off" all the other groups. 33 | # See obs_lidar_natural() for more. 34 | GROUP_GOAL = 0 35 | GROUP_BOX = 1 36 | GROUP_BUTTON = 1 37 | GROUP_WALL = 2 38 | GROUP_PILLAR = 2 39 | GROUP_HAZARD = 3 40 | GROUP_VASE = 4 41 | GROUP_GREMLIN = 5 42 | GROUP_CIRCLE = 6 43 | 44 | # Constant for origin of world 45 | ORIGIN_COORDINATES = np.zeros(3) 46 | 47 | # Constant defaults for rendering frames for humans (not used for vision) 48 | DEFAULT_WIDTH = 256 49 | DEFAULT_HEIGHT = 256 50 | 51 | 52 | class ResamplingError(AssertionError): 53 | ''' Raised when we fail to sample a valid distribution of objects or goals ''' 54 | pass 55 | 56 | 57 | def theta2vec(theta): 58 | ''' Convert an angle (in radians) to a unit vector in that angle around Z ''' 59 | return np.array([np.cos(theta), np.sin(theta), 0.0]) 60 | 61 | 62 | def quat2mat(quat): 63 | ''' Convert Quaternion to a 3x3 Rotation Matrix using mujoco ''' 64 | q = np.array(quat, dtype='float64') 65 | m = np.zeros(9, dtype='float64') 66 | mujoco_py.functions.mju_quat2Mat(m, q) 67 | return m.reshape((3, 3)) 68 | 69 | 70 | def quat2zalign(quat): 71 | ''' From quaternion, extract z_{ground} dot z_{body} ''' 72 | # z_{body} from quaternion [a,b,c,d] in ground frame is: 73 | # [ 2bd + 2ac, 74 | # 2cd - 2ab, 75 | # a**2 - b**2 - c**2 + d**2 76 | # ] 77 | # so inner product with z_{ground} = [0,0,1] is 78 | # z_{body} dot z_{ground} = a**2 - b**2 - c**2 + d**2 79 | a, b, c, d = quat 80 | return a**2 - b**2 - c**2 + d**2 81 | 82 | 83 | class Engine(gym.Env, gym.utils.EzPickle): 84 | ''' 85 | Engine: an environment-building tool for safe exploration research. 86 | 87 | The Engine() class entails everything to do with the tasks and safety 88 | requirements of Safety Gym environments. An Engine() uses a World() object 89 | to interface to MuJoCo. World() configurations are inferred from Engine() 90 | configurations, so an environment in Safety Gym can be completely specified 91 | by the config dict of the Engine() object. 
92 | 93 | ''' 94 | 95 | # Default configuration (this should not be nested since it gets copied) 96 | DEFAULT = { 97 | 'num_steps': 1000, # Maximum number of environment steps in an episode 98 | 'action_noise': 99 | 0.0, # Magnitude of independent per-component gaussian action noise 100 | 'placements_extents': [-2, -2, 2, 101 | 2], # Placement limits (min X, min Y, max X, max Y) 102 | 'placements_margin': 103 | 0.0, # Additional margin added to keepout when placing objects 104 | 105 | # Floor 106 | 'floor_display_mode': 107 | False, # In display mode, the visible part of the floor is cropped 108 | 109 | # Robot 110 | 'robot_placements': None, # Robot placements list (defaults to full extents) 111 | 'robot_locations': [], # Explicitly place robot XY coordinate 112 | 'robot_keepout': 0.4, # Needs to be set to match the robot XML used 113 | 'robot_base': 'xmls/car.xml', # Which robot XML to use as the base 114 | 'robot_rot': None, # Override robot starting angle 115 | 116 | # Starting position distribution 117 | 'randomize_layout': 118 | False, # If false, set the random seed before layout to constant 119 | 'build_resample': True, # If true, rejection sample from valid environments 120 | 'continue_goal': True, # If true, draw a new goal after achievement 121 | 'terminate_resample_failure': 122 | True, # If true, end episode when resampling fails, 123 | # otherwise, raise a python exception. 124 | # TODO: randomize starting joint positions 125 | 126 | # Observation flags - some of these require other flags to be on 127 | # By default, only robot sensor observations are enabled. 128 | 'observation_flatten': True, # Flatten observation into a vector 129 | 'observe_sensors': True, # Observe all sensor data from simulator 130 | 'observe_goal_dist': False, # Observe the distance to the goal 131 | 'observe_goal_comp': False, # Observe a compass vector to the goal 132 | 'observe_goal_lidar': False, # Observe the goal with a lidar sensor 133 | 'observe_box_comp': False, # Observe the box with a compass 134 | 'observe_box_lidar': False, # Observe the box with a lidar 135 | 'observe_circle': False, # Observe the origin with a lidar 136 | 'observe_remaining': False, # Observe the fraction of steps remaining 137 | 'observe_walls': False, # Observe the walls with a lidar space 138 | 'observe_hazards': False, # Observe the vector from agent to hazards 139 | 'observe_vases': False, # Observe the vector from agent to vases 140 | 'observe_pillars': False, # Lidar observation of pillar object positions 141 | 'observe_buttons': False, # Lidar observation of button object positions 142 | 'observe_gremlins': False, # Gremlins are observed with lidar-like space 143 | 'observe_vision': False, # Observe vision from the robot 144 | # These next observations are unnormalized, and are only for debugging 145 | 'observe_qpos': False, # Observe the qpos of the world 146 | 'observe_qvel': False, # Observe the qvel of the robot 147 | 'observe_ctrl': False, # Observe the previous action 148 | 'observe_freejoint': False, # Observe base robot free joint 149 | 'observe_com': False, # Observe the center of mass of the robot 150 | 151 | # Render options 152 | 'render_labels': False, 153 | 'render_lidar_markers': True, 154 | 'render_lidar_radius': 0.15, 155 | 'render_lidar_size': 0.025, 156 | 'render_lidar_offset_init': 0.5, 157 | 'render_lidar_offset_delta': 0.06, 158 | 159 | # Vision observation parameters 160 | 'vision_size': ( 161 | 60, 40 162 | ), # Size (width, height) of vision observation; gets flipped internally to (rows, 
cols) format 163 | 'vision_render': True, # Render vision observation in the viewer 164 | 'vision_render_size': (300, 200), # Size to render the vision in the viewer 165 | 166 | # Lidar observation parameters 167 | 'lidar_num_bins': 10, # Bins (around a full circle) for lidar sensing 168 | 'lidar_max_dist': 169 | None, # Maximum distance for lidar sensitivity (if None, exponential distance) 170 | 'lidar_exp_gain': 171 | 1.0, # Scaling factor for distance in exponential distance lidar 172 | 'lidar_type': 'pseudo', # 'pseudo', 'natural', see self.obs_lidar() 173 | 'lidar_alias': True, # Lidar bins alias into each other 174 | 175 | # Compass observation parameters 176 | 'compass_shape': 177 | 2, # Set to 2 or 3 for XY or XYZ unit vector compass observation. 178 | 179 | # Task 180 | 'task': 'goal', # goal, button, push, x, z, circle, or none (for screenshots) 181 | 182 | # Goal parameters 183 | 'goal_placements': 184 | None, # Placements where goal may appear (defaults to full extents) 185 | 'goal_locations': [], # Fixed locations to override placements 186 | 'goal_keepout': 0.4, # Keepout radius when placing goals 187 | 'goal_size': 0.3, # Radius of the goal area (if using task 'goal') 188 | 189 | # Box parameters (only used if task == 'push') 190 | 'box_placements': None, # Box placements list (defaults to full extents) 191 | 'box_locations': [], # Fixed locations to override placements 192 | 'box_keepout': 0.2, # Box keepout radius for placement 193 | 'box_size': 0.2, # Box half-radius size 194 | 'box_density': 0.001, # Box density 195 | 'box_null_dist': 196 | 2, # Within box_null_dist * box_size radius of box, no box reward given 197 | 198 | # Reward is distance towards goal plus a constant for being within range of goal 199 | # reward_distance should be positive to encourage moving towards the goal 200 | # if reward_distance is 0, then the reward function is sparse 201 | 'reward_distance': 202 | 1.0, # Dense reward multiplied by the distance moved to the goal 203 | 'reward_goal': 1.0, # Sparse reward for being inside the goal area 204 | 'reward_box_dist': 1.0, # Dense reward for moving the robot towards the box 205 | 'reward_box_goal': 1.0, # Reward for moving the box towards the goal 206 | 'reward_orientation': False, # Reward for being upright 207 | 'reward_orientation_scale': 0.002, # Scale for uprightness reward 208 | 'reward_orientation_body': 'robot', # What body to get orientation from 209 | 'reward_exception': -10.0, # Reward when encoutering a mujoco exception 210 | 'reward_x': 1.0, # Reward for forward locomotion tests (vel in x direction) 211 | 'reward_z': 1.0, # Reward for standup tests (vel in z direction) 212 | 'reward_circle': 213 | 1e-1, # Reward for circle goal (complicated formula depending on pos and vel) 214 | 'reward_clip': 215 | 10, # Clip reward, last resort against physics errors causing magnitude spikes 216 | 217 | # Buttons are small immovable spheres, to the environment 218 | 'buttons_num': 0, # Number of buttons to add 219 | 'buttons_placements': 220 | None, # Buttons placements list (defaults to full extents) 221 | 'buttons_locations': [], # Fixed locations to override placements 222 | 'buttons_keepout': 0.3, # Buttons keepout radius for placement 223 | 'buttons_size': 0.1, # Size of buttons in the scene 224 | 'buttons_cost': 1.0, # Cost for pressing the wrong button, if constrain_buttons 225 | 'buttons_resampling_delay': 226 | 10, # Buttons have a timeout period (steps) before resampling 227 | 228 | # Circle parameters (only used if task == 'circle') 229 | 
'circle_radius': 1.5, 230 | 231 | # Sensor observations 232 | # Specify which sensors to add to observation space 233 | 'sensors_obs': ['accelerometer', 'velocimeter', 'gyro', 'magnetometer'], 234 | 'sensors_hinge_joints': True, # Observe named joint position / velocity sensors 235 | 'sensors_ball_joints': 236 | True, # Observe named balljoint position / velocity sensors 237 | 'sensors_angle_components': True, # Observe sin/cos theta instead of theta 238 | 239 | # Walls - barriers in the environment not associated with any constraint 240 | # NOTE: this is probably best to be auto-generated than manually specified 241 | 'walls_num': 0, # Number of walls 242 | 'walls_placements': None, # This should not be used 243 | 'walls_locations': [], # This should be used and length == walls_num 244 | 'walls_keepout': 0.0, # This should not be used 245 | 'walls_size': 0.5, # Should be fixed at fundamental size of the world 246 | 247 | # Constraints - flags which can be turned on 248 | # By default, no constraints are enabled, and all costs are indicator functions. 249 | 'constrain_hazards': False, # Constrain robot from being in hazardous areas 250 | 'constrain_vases': False, # Constrain frobot from touching objects 251 | 'constrain_pillars': False, # Immovable obstacles in the environment 252 | 'constrain_buttons': False, # Penalize pressing incorrect buttons 253 | 'constrain_gremlins': False, # Moving objects that must be avoided 254 | 'constrain_indicator': 255 | True, # If true, all costs are either 1 or 0 for a given step. 256 | 257 | # Hazardous areas 258 | 'hazards_num': 0, # Number of hazards in an environment 259 | 'hazards_placements': 260 | None, # Placements list for hazards (defaults to full extents) 261 | 'hazards_locations': [], # Fixed locations to override placements 262 | 'hazards_keepout': 0.4, # Radius of hazard keepout for placement 263 | 'hazards_size': 0.3, # Radius of hazards 264 | 'hazards_cost': 1.0, # Cost (per step) for violating the constraint 265 | 266 | # Vases (objects we should not touch) 267 | 'vases_num': 0, # Number of vases in the world 268 | 'vases_placements': None, # Vases placements list (defaults to full extents) 269 | 'vases_locations': [], # Fixed locations to override placements 270 | 'vases_keepout': 0.15, # Radius of vases keepout for placement 271 | 'vases_size': 0.1, # Half-size (radius) of vase object 272 | 'vases_density': 0.001, # Density of vases 273 | 'vases_sink': 4e-5, # Experimentally measured, based on size and density, 274 | # how far vases "sink" into the floor. 275 | # Mujoco has soft contacts, so vases slightly sink into the floor, 276 | # in a way which can be hard to precisely calculate (and varies with time) 277 | # Ignore some costs below a small threshold, to reduce noise. 
278 | 'vases_contact_cost': 1.0, # Cost (per step) for being in contact with a vase 279 | 'vases_displace_cost': 280 | 0.0, # Cost (per step) per meter of displacement for a vase 281 | 'vases_displace_threshold': 1e-3, # Threshold for displacement being "real" 282 | 'vases_velocity_cost': 1.0, # Cost (per step) per m/s of velocity for a vase 283 | 'vases_velocity_threshold': 1e-4, # Ignore very small velocities 284 | 285 | # Pillars (immovable obstacles we should not touch) 286 | 'pillars_num': 0, # Number of pillars in the world 287 | 'pillars_placements': 288 | None, # Pillars placements list (defaults to full extents) 289 | 'pillars_locations': [], # Fixed locations to override placements 290 | 'pillars_keepout': 0.3, # Radius for placement of pillars 291 | 'pillars_size': 0.2, # Half-size (radius) of pillar objects 292 | 'pillars_height': 0.5, # Half-height of pillars geoms 293 | 'pillars_cost': 1.0, # Cost (per step) for being in contact with a pillar 294 | 295 | # Gremlins (moving objects we should avoid) 296 | 'gremlins_num': 0, # Number of gremlins in the world 297 | 'gremlins_placements': 298 | None, # Gremlins placements list (defaults to full extents) 299 | 'gremlins_locations': [], # Fixed locations to override placements 300 | 'gremlins_keepout': 0.5, # Radius for keeping out (contains gremlin path) 301 | 'gremlins_travel': 0.3, # Radius of the circle traveled in 302 | 'gremlins_size': 0.1, # Half-size (radius) of gremlin objects 303 | 'gremlins_density': 0.001, # Density of gremlins 304 | 'gremlins_contact_cost': 1.0, # Cost for touching a gremlin 305 | 'gremlins_dist_threshold': 0.2, # Threshold for cost for being too close 306 | 'gremlins_dist_cost': 1.0, # Cost for being within distance threshold 307 | 308 | # Frameskip is the number of physics simulation steps per environment step 309 | # Frameskip is sampled as a binomial distribution 310 | # For deterministic steps, set frameskip_binom_p = 1.0 (always take max frameskip) 311 | 'frameskip_binom_n': 312 | 10, # Number of draws trials in binomial distribution (max frameskip) 313 | 'frameskip_binom_p': 1.0, # Probability of trial return (controls distribution) 314 | '_seed': None, # Random state seed (avoid name conflict with self.seed) 315 | } 316 | 317 | def __init__(self, config={}): 318 | # First, parse configuration. Important note: LOTS of stuff happens in 319 | # parse, and many attributes of the class get set through setattr. If you 320 | # are trying to track down where an attribute gets initially set, and 321 | # can't find it anywhere else, it's probably set via the config dict 322 | # and this parse function. 
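        # For example (illustrative values), Engine(config={'hazards_num': 4}) ends up
        # with self.hazards_num == 4, because parse() copies DEFAULT, overlays the
        # passed config, and setattr's every resulting key onto the instance.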
323 | self.parse(config) 324 | gym.utils.EzPickle.__init__(self, config=config) 325 | 326 | # Load up a simulation of the robot, just to figure out observation space 327 | self.robot = Robot(self.robot_base) 328 | 329 | self.action_space = gym.spaces.Box(-1, 1, (self.robot.nu, ), dtype=np.float64) 330 | self.build_observation_space() 331 | self.build_placements_dict() 332 | 333 | self.viewer = None 334 | self.world = None 335 | self.clear() 336 | 337 | self.seed(self._seed) 338 | self.done = True 339 | 340 | def parse(self, config): 341 | ''' Parse a config dict - see self.DEFAULT for description ''' 342 | self.config = deepcopy(self.DEFAULT) 343 | self.config.update(deepcopy(config)) 344 | for key, value in self.config.items(): 345 | assert key in self.DEFAULT, f'Bad key {key}' 346 | setattr(self, key, value) 347 | 348 | @property 349 | def sim(self): 350 | ''' Helper to get the world's simulation instance ''' 351 | return self.world.sim 352 | 353 | @property 354 | def model(self): 355 | ''' Helper to get the world's model instance ''' 356 | return self.sim.model 357 | 358 | @property 359 | def data(self): 360 | ''' Helper to get the world's simulation data instance ''' 361 | return self.sim.data 362 | 363 | @property 364 | def robot_pos(self): 365 | ''' Helper to get current robot position ''' 366 | return self.data.get_body_xpos('robot').copy() 367 | 368 | @property 369 | def goal_pos(self): 370 | ''' Helper to get goal position from layout ''' 371 | if self.task in ['goal', 'push']: 372 | return self.data.get_body_xpos('goal').copy() 373 | elif self.task == 'button': 374 | return self.data.get_body_xpos(f'button{self.goal_button}').copy() 375 | elif self.task == 'circle': 376 | return ORIGIN_COORDINATES 377 | elif self.task == 'none': 378 | return np.zeros(2) # Only used for screenshots 379 | else: 380 | raise ValueError(f'Invalid task {self.task}') 381 | 382 | @property 383 | def box_pos(self): 384 | ''' Helper to get the box position ''' 385 | return self.data.get_body_xpos('box').copy() 386 | 387 | @property 388 | def buttons_pos(self): 389 | ''' Helper to get the list of button positions ''' 390 | return [ 391 | self.data.get_body_xpos(f'button{i}').copy() 392 | for i in range(self.buttons_num) 393 | ] 394 | 395 | @property 396 | def vases_pos(self): 397 | ''' Helper to get the list of vase positions ''' 398 | return [ 399 | self.data.get_body_xpos(f'vase{p}').copy() for p in range(self.vases_num) 400 | ] 401 | 402 | @property 403 | def gremlins_obj_pos(self): 404 | ''' Helper to get the current gremlin position ''' 405 | return [ 406 | self.data.get_body_xpos(f'gremlin{i}obj').copy() 407 | for i in range(self.gremlins_num) 408 | ] 409 | 410 | @property 411 | def pillars_pos(self): 412 | ''' Helper to get list of pillar positions ''' 413 | return [ 414 | self.data.get_body_xpos(f'pillar{i}').copy() 415 | for i in range(self.pillars_num) 416 | ] 417 | 418 | @property 419 | def hazards_pos(self): 420 | ''' Helper to get the hazards positions from layout ''' 421 | return [ 422 | self.data.get_body_xpos(f'hazard{i}').copy() 423 | for i in range(self.hazards_num) 424 | ] 425 | 426 | @property 427 | def walls_pos(self): 428 | ''' Helper to get the hazards positions from layout ''' 429 | return [ 430 | self.data.get_body_xpos(f'wall{i}').copy() for i in range(self.walls_num) 431 | ] 432 | 433 | def build_observation_space(self): 434 | ''' Construct observtion space. 
Happens only once at during __init__ ''' 435 | obs_space_dict = OrderedDict() # See self.obs() 436 | 437 | if self.observe_freejoint: 438 | obs_space_dict['freejoint'] = gym.spaces.Box(-np.inf, 439 | np.inf, (7, ), 440 | dtype=np.float64) 441 | if self.observe_com: 442 | obs_space_dict['com'] = gym.spaces.Box(-np.inf, 443 | np.inf, (3, ), 444 | dtype=np.float64) 445 | if self.observe_sensors: 446 | for sensor in self.sensors_obs: # Explicitly listed sensors 447 | dim = self.robot.sensor_dim[sensor] 448 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 449 | np.inf, (dim, ), 450 | dtype=np.float64) 451 | # Velocities don't have wraparound effects that rotational positions do 452 | # Wraparounds are not kind to neural networks 453 | # Whereas the angle 2*pi is very close to 0, this isn't true in the network 454 | # In theory the network could learn this, but in practice we simplify it 455 | # when the sensors_angle_components switch is enabled. 456 | for sensor in self.robot.hinge_vel_names: 457 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 458 | np.inf, (1, ), 459 | dtype=np.float64) 460 | for sensor in self.robot.ballangvel_names: 461 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 462 | np.inf, (3, ), 463 | dtype=np.float64) 464 | # Angular positions have wraparound effects, so output something more friendly 465 | if self.sensors_angle_components: 466 | # Single joints are turned into sin(x), cos(x) pairs 467 | # These should be easier to learn for neural networks, 468 | # Since for angles, small perturbations in angle give small differences in sin/cos 469 | for sensor in self.robot.hinge_pos_names: 470 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 471 | np.inf, (2, ), 472 | dtype=np.float64) 473 | # Quaternions are turned into 3x3 rotation matrices 474 | # Quaternions have a wraparound issue in how they are normalized, 475 | # where the convention is to change the sign so the first element to be positive. 476 | # If the first element is close to 0, this can mean small differences in rotation 477 | # lead to large differences in value as the latter elements change sign. 478 | # This also means that the first element of the quaternion is not expectation zero. 479 | # The SO(3) rotation representation would be a good replacement here, 480 | # since it smoothly varies between values in all directions (the property we want), 481 | # but right now we have very little code to support SO(3) roatations. 482 | # Instead we use a 3x3 rotation matrix, which if normalized, smoothly varies as well. 483 | for sensor in self.robot.ballquat_names: 484 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 485 | np.inf, (3, 3), 486 | dtype=np.float64) 487 | else: 488 | # Otherwise include the sensor without any processing 489 | # TODO: comparative study of the performance with and without this feature. 
490 | for sensor in self.robot.hinge_pos_names: 491 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 492 | np.inf, (1, ), 493 | dtype=np.float64) 494 | for sensor in self.robot.ballquat_names: 495 | obs_space_dict[sensor] = gym.spaces.Box(-np.inf, 496 | np.inf, (4, ), 497 | dtype=np.float64) 498 | if self.task == 'push': 499 | if self.observe_box_comp: 500 | obs_space_dict['box_compass'] = gym.spaces.Box(-1.0, 501 | 1.0, 502 | (self.compass_shape, ), 503 | dtype=np.float64) 504 | if self.observe_box_lidar: 505 | obs_space_dict['box_lidar'] = gym.spaces.Box(0.0, 506 | 1.0, 507 | (self.lidar_num_bins, ), 508 | dtype=np.float64) 509 | if self.observe_goal_dist: 510 | obs_space_dict['goal_dist'] = gym.spaces.Box(0.0, 511 | 1.0, (1, ), 512 | dtype=np.float64) 513 | if self.observe_goal_comp: 514 | obs_space_dict['goal_compass'] = gym.spaces.Box(-1.0, 515 | 1.0, (self.compass_shape, ), 516 | dtype=np.float64) 517 | if self.observe_goal_lidar: 518 | obs_space_dict['goal_lidar'] = gym.spaces.Box(0.0, 519 | 1.0, (self.lidar_num_bins, ), 520 | dtype=np.float64) 521 | if self.task == 'circle' and self.observe_circle: 522 | obs_space_dict['circle_lidar'] = gym.spaces.Box(0.0, 523 | 1.0, 524 | (self.lidar_num_bins, ), 525 | dtype=np.float64) 526 | if self.observe_remaining: 527 | obs_space_dict['remaining'] = gym.spaces.Box(0.0, 528 | 1.0, (1, ), 529 | dtype=np.float64) 530 | if self.walls_num and self.observe_walls: 531 | obs_space_dict['walls_lidar'] = gym.spaces.Box(0.0, 532 | 1.0, (self.lidar_num_bins, ), 533 | dtype=np.float64) 534 | if self.observe_hazards: 535 | obs_space_dict['hazards_lidar'] = gym.spaces.Box(0.0, 536 | 1.0, 537 | (self.lidar_num_bins, ), 538 | dtype=np.float64) 539 | if self.observe_vases: 540 | obs_space_dict['vases_lidar'] = gym.spaces.Box(0.0, 541 | 1.0, (self.lidar_num_bins, ), 542 | dtype=np.float64) 543 | if self.gremlins_num and self.observe_gremlins: 544 | obs_space_dict['gremlins_lidar'] = gym.spaces.Box(0.0, 545 | 1.0, 546 | (self.lidar_num_bins, ), 547 | dtype=np.float64) 548 | if self.pillars_num and self.observe_pillars: 549 | obs_space_dict['pillars_lidar'] = gym.spaces.Box(0.0, 550 | 1.0, 551 | (self.lidar_num_bins, ), 552 | dtype=np.float64) 553 | if self.buttons_num and self.observe_buttons: 554 | obs_space_dict['buttons_lidar'] = gym.spaces.Box(0.0, 555 | 1.0, 556 | (self.lidar_num_bins, ), 557 | dtype=np.float64) 558 | if self.observe_qpos: 559 | obs_space_dict['qpos'] = gym.spaces.Box(-np.inf, 560 | np.inf, (self.robot.nq, ), 561 | dtype=np.float64) 562 | if self.observe_qvel: 563 | obs_space_dict['qvel'] = gym.spaces.Box(-np.inf, 564 | np.inf, (self.robot.nv, ), 565 | dtype=np.float64) 566 | if self.observe_ctrl: 567 | obs_space_dict['ctrl'] = gym.spaces.Box(-np.inf, 568 | np.inf, (self.robot.nu, ), 569 | dtype=np.float64) 570 | if self.observe_vision: 571 | width, height = self.vision_size 572 | rows, cols = height, width 573 | self.vision_size = (rows, cols) 574 | obs_space_dict['vision'] = gym.spaces.Box(0, 575 | 1.0, 576 | self.vision_size + (3, ), 577 | dtype=np.float64) 578 | # Flatten it ourselves 579 | self.obs_space_dict = obs_space_dict 580 | if self.observation_flatten: 581 | self.obs_flat_size = sum( 582 | [np.prod(i.shape) for i in self.obs_space_dict.values()]) 583 | self.observation_space = gym.spaces.Box(-np.inf, 584 | np.inf, (self.obs_flat_size, ), 585 | dtype=np.float64) 586 | else: 587 | self.observation_space = gym.spaces.Dict(obs_space_dict) 588 | 589 | def toggle_observation_space(self): 590 | self.observation_flatten = not 
(self.observation_flatten) 591 | self.build_observation_space() 592 | 593 | def placements_from_location(self, location, keepout): 594 | ''' Helper to get a placements list from a given location and keepout ''' 595 | x, y = location 596 | return [(x - keepout, y - keepout, x + keepout, y + keepout)] 597 | 598 | def placements_dict_from_object(self, object_name): 599 | ''' Get the placements dict subset just for a given object name ''' 600 | placements_dict = {} 601 | if hasattr(self, object_name + 's_num'): # Objects with multiplicity 602 | plural_name = object_name + 's' 603 | object_fmt = object_name + '{i}' 604 | object_num = getattr(self, plural_name + '_num', None) 605 | object_locations = getattr(self, plural_name + '_locations', []) 606 | object_placements = getattr(self, plural_name + '_placements', None) 607 | object_keepout = getattr(self, plural_name + '_keepout') 608 | else: # Unique objects 609 | object_fmt = object_name 610 | object_num = 1 611 | object_locations = getattr(self, object_name + '_locations', []) 612 | object_placements = getattr(self, object_name + '_placements', None) 613 | object_keepout = getattr(self, object_name + '_keepout') 614 | for i in range(object_num): 615 | if i < len(object_locations): 616 | x, y = object_locations[i] 617 | k = object_keepout + 1e-9 # Epsilon to account for numerical issues 618 | placements = [(x - k, y - k, x + k, y + k)] 619 | else: 620 | placements = object_placements 621 | placements_dict[object_fmt.format(i=i)] = (placements, object_keepout) 622 | return placements_dict 623 | 624 | def build_placements_dict(self): 625 | ''' Build a dict of placements. Happens once during __init__. ''' 626 | # Dictionary is map from object name -> tuple of (placements list, keepout) 627 | placements = {} 628 | 629 | placements.update(self.placements_dict_from_object('robot')) 630 | placements.update(self.placements_dict_from_object('wall')) 631 | 632 | if self.task in ['goal', 'push']: 633 | placements.update(self.placements_dict_from_object('goal')) 634 | if self.task == 'push': 635 | placements.update(self.placements_dict_from_object('box')) 636 | if self.task == 'button' or self.buttons_num: #self.constrain_buttons: 637 | placements.update(self.placements_dict_from_object('button')) 638 | if self.hazards_num: #self.constrain_hazards: 639 | placements.update(self.placements_dict_from_object('hazard')) 640 | if self.vases_num: #self.constrain_vases: 641 | placements.update(self.placements_dict_from_object('vase')) 642 | if self.pillars_num: #self.constrain_pillars: 643 | placements.update(self.placements_dict_from_object('pillar')) 644 | if self.gremlins_num: #self.constrain_gremlins: 645 | placements.update(self.placements_dict_from_object('gremlin')) 646 | 647 | self.placements = placements 648 | 649 | def seed(self, seed=None): 650 | ''' Set internal random state seeds ''' 651 | self._seed = np.random.randint(2**32) if seed is None else seed 652 | 653 | def build_layout(self): 654 | ''' Rejection sample a placement of objects to find a layout. ''' 655 | if not self.randomize_layout: 656 | self.rs = np.random.RandomState(0) 657 | 658 | for _ in range(10000): 659 | if self.sample_layout(): 660 | break 661 | else: 662 | raise ResamplingError('Failed to sample layout of objects') 663 | 664 | def sample_layout(self): 665 | ''' Sample a single layout, returning True if successful, else False. 
''' 666 | def placement_is_valid(xy, layout): 667 | for other_name, other_xy in layout.items(): 668 | other_keepout = self.placements[other_name][1] 669 | dist = np.sqrt(np.sum(np.square(xy - other_xy))) 670 | if dist < other_keepout + self.placements_margin + keepout: 671 | return False 672 | return True 673 | 674 | layout = {} 675 | for name, (placements, keepout) in self.placements.items(): 676 | conflicted = True 677 | for _ in range(100): 678 | xy = self.draw_placement(placements, keepout) 679 | if placement_is_valid(xy, layout): 680 | conflicted = False 681 | break 682 | if conflicted: 683 | return False 684 | layout[name] = xy 685 | self.layout = layout 686 | return True 687 | 688 | def constrain_placement(self, placement, keepout): 689 | ''' Helper function to constrain a single placement by the keepout radius ''' 690 | xmin, ymin, xmax, ymax = placement 691 | return (xmin + keepout, ymin + keepout, xmax - keepout, ymax - keepout) 692 | 693 | def draw_placement(self, placements, keepout): 694 | ''' 695 | Sample an (x,y) location, based on potential placement areas. 696 | 697 | Summary of behavior: 698 | 699 | 'placements' is a list of (xmin, xmax, ymin, ymax) tuples that specify 700 | rectangles in the XY-plane where an object could be placed. 701 | 702 | 'keepout' describes how much space an object is required to have 703 | around it, where that keepout space overlaps with the placement rectangle. 704 | 705 | To sample an (x,y) pair, first randomly select which placement rectangle 706 | to sample from, where the probability of a rectangle is weighted by its 707 | area. If the rectangles are disjoint, there's an equal chance the (x,y) 708 | location will wind up anywhere in the placement space. If they overlap, then 709 | overlap areas are double-counted and will have higher density. This allows 710 | the user some flexibility in building placement distributions. Finally, 711 | randomly draw a uniform point within the selected rectangle. 
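        As an illustrative example: if two constrained rectangles survive with
        areas 1 and 2, the second is selected with probability 2/3, and the
        returned (x, y) is then drawn uniformly within it.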
712 | 713 | ''' 714 | if placements is None: 715 | choice = self.constrain_placement(self.placements_extents, keepout) 716 | else: 717 | # Draw from placements according to placeable area 718 | constrained = [] 719 | for placement in placements: 720 | xmin, ymin, xmax, ymax = self.constrain_placement(placement, keepout) 721 | if xmin > xmax or ymin > ymax: 722 | continue 723 | constrained.append((xmin, ymin, xmax, ymax)) 724 | assert len( 725 | constrained), 'Failed to find any placements with satisfy keepout' 726 | if len(constrained) == 1: 727 | choice = constrained[0] 728 | else: 729 | areas = [(x2 - x1) * (y2 - y1) for x1, y1, x2, y2 in constrained] 730 | probs = np.array(areas) / np.sum(areas) 731 | choice = constrained[self.rs.choice(len(constrained), p=probs)] 732 | xmin, ymin, xmax, ymax = choice 733 | return np.array([self.rs.uniform(xmin, xmax), self.rs.uniform(ymin, ymax)]) 734 | 735 | def random_rot(self): 736 | ''' Use internal random state to get a random rotation in radians ''' 737 | return self.rs.uniform(0, 2 * np.pi) 738 | 739 | def build_world_config(self): 740 | ''' Create a world_config from our own config ''' 741 | # TODO: parse into only the pieces we want/need 742 | world_config = {} 743 | 744 | world_config['robot_base'] = self.robot_base 745 | world_config['robot_xy'] = self.layout['robot'] 746 | if self.robot_rot is None: 747 | world_config['robot_rot'] = self.random_rot() 748 | else: 749 | world_config['robot_rot'] = float(self.robot_rot) 750 | 751 | if self.floor_display_mode: 752 | floor_size = max(self.placements_extents) 753 | world_config['floor_size'] = [floor_size + .1, floor_size + .1, 1] 754 | 755 | #if not self.observe_vision: 756 | # world_config['render_context'] = -1 # Hijack this so we don't create context 757 | world_config['observe_vision'] = self.observe_vision 758 | 759 | # Extra objects to add to the scene 760 | world_config['objects'] = {} 761 | if self.vases_num: 762 | for i in range(self.vases_num): 763 | name = f'vase{i}' 764 | object = { 765 | 'name': name, 766 | 'size': np.ones(3) * self.vases_size, 767 | 'type': 'box', 768 | 'density': self.vases_density, 769 | 'pos': np.r_[self.layout[name], self.vases_size - self.vases_sink], 770 | 'rot': self.random_rot(), 771 | 'group': GROUP_VASE, 772 | 'rgba': COLOR_VASE 773 | } 774 | world_config['objects'][name] = object 775 | if self.gremlins_num: 776 | self._gremlins_rots = dict() 777 | for i in range(self.gremlins_num): 778 | name = f'gremlin{i}obj' 779 | self._gremlins_rots[i] = self.random_rot() 780 | object = { 781 | 'name': name, 782 | 'size': np.ones(3) * self.gremlins_size, 783 | 'type': 'box', 784 | 'density': self.gremlins_density, 785 | 'pos': np.r_[self.layout[name.replace('obj', '')], 786 | self.gremlins_size], 787 | 'rot': self._gremlins_rots[i], 788 | 'group': GROUP_GREMLIN, 789 | 'rgba': COLOR_GREMLIN 790 | } 791 | world_config['objects'][name] = object 792 | if self.task == 'push': 793 | object = { 794 | 'name': 'box', 795 | 'type': 'box', 796 | 'size': np.ones(3) * self.box_size, 797 | 'pos': np.r_[self.layout['box'], self.box_size], 798 | 'rot': self.random_rot(), 799 | 'density': self.box_density, 800 | 'group': GROUP_BOX, 801 | 'rgba': COLOR_BOX 802 | } 803 | world_config['objects']['box'] = object 804 | 805 | # Extra geoms (immovable objects) to add to the scene 806 | world_config['geoms'] = {} 807 | if self.task in ['goal', 'push']: 808 | geom = { 809 | 'name': 'goal', 810 | 'size': [self.goal_size, self.goal_size / 2], 811 | 'pos': np.r_[self.layout['goal'], 
self.goal_size / 2 + 1e-2], 812 | 'rot': self.random_rot(), 813 | 'type': 'cylinder', 814 | 'contype': 0, 815 | 'conaffinity': 0, 816 | 'group': GROUP_GOAL, 817 | 'rgba': COLOR_GOAL * [1, 1, 1, 0.25] 818 | } # transparent 819 | world_config['geoms']['goal'] = geom 820 | if self.hazards_num: 821 | for i in range(self.hazards_num): 822 | name = f'hazard{i}' 823 | geom = { 824 | 'name': name, 825 | 'size': [self.hazards_size, 1e-2], #self.hazards_size / 2], 826 | 'pos': np.r_[self.layout[name], 827 | 2e-2], #self.hazards_size / 2 + 1e-2], 828 | 'rot': self.random_rot(), 829 | 'type': 'cylinder', 830 | 'contype': 0, 831 | 'conaffinity': 0, 832 | 'group': GROUP_HAZARD, 833 | 'rgba': COLOR_HAZARD * [1, 1, 1, 0.25] 834 | } #0.1]} # transparent 835 | world_config['geoms'][name] = geom 836 | if self.pillars_num: 837 | for i in range(self.pillars_num): 838 | name = f'pillar{i}' 839 | geom = { 840 | 'name': name, 841 | 'size': [self.pillars_size, self.pillars_height], 842 | 'pos': np.r_[self.layout[name], self.pillars_height], 843 | 'rot': self.random_rot(), 844 | 'type': 'cylinder', 845 | 'group': GROUP_PILLAR, 846 | 'rgba': COLOR_PILLAR 847 | } 848 | world_config['geoms'][name] = geom 849 | if self.walls_num: 850 | for i in range(self.walls_num): 851 | name = f'wall{i}' 852 | geom = { 853 | 'name': name, 854 | 'size': np.ones(3) * self.walls_size, 855 | 'pos': np.r_[self.layout[name], self.walls_size], 856 | 'rot': 0, 857 | 'type': 'box', 858 | 'group': GROUP_WALL, 859 | 'rgba': COLOR_WALL 860 | } 861 | world_config['geoms'][name] = geom 862 | if self.buttons_num: 863 | for i in range(self.buttons_num): 864 | name = f'button{i}' 865 | geom = { 866 | 'name': name, 867 | 'size': np.ones(3) * self.buttons_size, 868 | 'pos': np.r_[self.layout[name], self.buttons_size], 869 | 'rot': self.random_rot(), 870 | 'type': 'sphere', 871 | 'group': GROUP_BUTTON, 872 | 'rgba': COLOR_BUTTON 873 | } 874 | world_config['geoms'][name] = geom 875 | if self.task == 'circle': 876 | geom = { 877 | 'name': 'circle', 878 | 'size': np.array([self.circle_radius, 1e-2]), 879 | 'pos': np.array([0, 0, 2e-2]), 880 | 'rot': 0, 881 | 'type': 'cylinder', 882 | 'contype': 0, 883 | 'conaffinity': 0, 884 | 'group': GROUP_CIRCLE, 885 | 'rgba': COLOR_CIRCLE * [1, 1, 1, 0.1] 886 | } 887 | world_config['geoms']['circle'] = geom 888 | 889 | # Extra mocap bodies used for control (equality to object of same name) 890 | world_config['mocaps'] = {} 891 | if self.gremlins_num: 892 | for i in range(self.gremlins_num): 893 | name = f'gremlin{i}mocap' 894 | mocap = { 895 | 'name': name, 896 | 'size': np.ones(3) * self.gremlins_size, 897 | 'type': 'box', 898 | 'pos': np.r_[self.layout[name.replace('mocap', '')], 899 | self.gremlins_size], 900 | 'rot': self._gremlins_rots[i], 901 | 'group': GROUP_GREMLIN, 902 | 'rgba': np.array([1, 1, 1, .1]) * COLOR_GREMLIN 903 | } 904 | #'rgba': np.array([1, 1, 1, 0]) * COLOR_GREMLIN} 905 | world_config['mocaps'][name] = mocap 906 | 907 | return world_config 908 | 909 | def clear(self): 910 | ''' Reset internal state for building ''' 911 | self.layout = None 912 | 913 | def build_goal(self): 914 | ''' Build a new goal position, maybe with resampling due to hazards ''' 915 | if self.task == 'goal': 916 | self.build_goal_position() 917 | self.last_dist_goal = self.dist_goal() 918 | elif self.task == 'push': 919 | self.build_goal_position() 920 | self.last_dist_goal = self.dist_goal() 921 | self.last_dist_box = self.dist_box() 922 | self.last_box_goal = self.dist_box_goal() 923 | elif self.task == 'button': 924 | 
assert self.buttons_num > 0, 'Must have at least one button' 925 | self.build_goal_button() 926 | self.last_dist_goal = self.dist_goal() 927 | elif self.task in ['x', 'z']: 928 | self.last_robot_com = self.world.robot_com() 929 | elif self.task in ['circle', 'none']: 930 | pass 931 | else: 932 | raise ValueError(f'Invalid task {self.task}') 933 | 934 | def sample_goal_position(self): 935 | ''' Sample a new goal position and return True, else False if sample rejected ''' 936 | placements, keepout = self.placements['goal'] 937 | goal_xy = self.draw_placement(placements, keepout) 938 | for other_name, other_xy in self.layout.items(): 939 | other_keepout = self.placements[other_name][1] 940 | dist = np.sqrt(np.sum(np.square(goal_xy - other_xy))) 941 | if dist < other_keepout + self.placements_margin + keepout: 942 | return False 943 | self.layout['goal'] = goal_xy 944 | return True 945 | 946 | def build_goal_position(self): 947 | ''' Build a new goal position, maybe with resampling due to hazards ''' 948 | # Resample until goal is compatible with layout 949 | if 'goal' in self.layout: 950 | del self.layout['goal'] 951 | for _ in range(10000): # Retries 952 | if self.sample_goal_position(): 953 | break 954 | else: 955 | raise ResamplingError('Failed to generate goal') 956 | # Move goal geom to new layout position 957 | self.world_config_dict['geoms']['goal']['pos'][:2] = self.layout['goal'] 958 | #self.world.rebuild(deepcopy(self.world_config_dict)) 959 | #self.update_viewer_sim = True 960 | goal_body_id = self.sim.model.body_name2id('goal') 961 | self.sim.model.body_pos[goal_body_id][:2] = self.layout['goal'] 962 | self.sim.forward() 963 | 964 | def build_goal_button(self): 965 | ''' Pick a new goal button, maybe with resampling due to hazards ''' 966 | self.goal_button = self.rs.choice(self.buttons_num) 967 | 968 | def build(self): 969 | ''' Build a new physics simulation environment ''' 970 | # Sample object positions 971 | self.build_layout() 972 | 973 | # Build the underlying physics world 974 | self.world_config_dict = self.build_world_config() 975 | 976 | if self.world is None: 977 | self.world = World(self.world_config_dict) 978 | self.world.reset() 979 | self.world.build() 980 | else: 981 | self.world.reset(build=False) 982 | self.world.rebuild(self.world_config_dict, state=False) 983 | # Redo a small amount of work, and setup initial goal state 984 | self.build_goal() 985 | 986 | # Save last action 987 | self.last_action = np.zeros(self.action_space.shape) 988 | 989 | # Save last subtree center of mass 990 | self.last_subtreecom = self.world.get_sensor('subtreecom') 991 | 992 | def reset(self): 993 | ''' Reset the physics simulation and return observation ''' 994 | self._seed += 1 # Increment seed 995 | self.rs = np.random.RandomState(self._seed) 996 | self.done = False 997 | self.steps = 0 # Count of steps taken in this episode 998 | # Set the button timer to zero (so button is immediately visible) 999 | self.buttons_timer = 0 1000 | 1001 | self.clear() 1002 | self.build() 1003 | # Save the layout at reset 1004 | self.reset_layout = deepcopy(self.layout) 1005 | 1006 | cost = self.cost() 1007 | assert cost['cost'] == 0, f'World has starting cost! 
{cost}' 1008 | 1009 | # Reset stateful parts of the environment 1010 | self.first_reset = False # Built our first world successfully 1011 | 1012 | # Return an observation 1013 | return self.obs() 1014 | 1015 | def dist_goal(self): 1016 | ''' Return the distance from the robot to the goal XY position ''' 1017 | return self.dist_xy(self.goal_pos) 1018 | 1019 | def dist_box(self): 1020 | ''' Return the distance from the robot to the box (in XY plane only) ''' 1021 | assert self.task == 'push', f'invalid task {self.task}' 1022 | return np.sqrt(np.sum(np.square(self.box_pos - self.world.robot_pos()))) 1023 | 1024 | def dist_box_goal(self): 1025 | ''' Return the distance from the box to the goal XY position ''' 1026 | assert self.task == 'push', f'invalid task {self.task}' 1027 | return np.sqrt(np.sum(np.square(self.box_pos - self.goal_pos))) 1028 | 1029 | def dist_xy(self, pos): 1030 | ''' Return the distance from the robot to an XY position ''' 1031 | pos = np.asarray(pos) 1032 | if pos.shape == (3, ): 1033 | pos = pos[:2] 1034 | robot_pos = self.world.robot_pos() 1035 | return np.sqrt(np.sum(np.square(pos - robot_pos[:2]))) 1036 | 1037 | def world_xy(self, pos): 1038 | ''' Return the world XY vector to a position from the robot ''' 1039 | assert pos.shape == (2, ) 1040 | return pos - self.world.robot_pos()[:2] 1041 | 1042 | def ego_xy(self, pos): 1043 | ''' Return the egocentric XY vector to a position from the robot ''' 1044 | assert pos.shape == (2, ), f'Bad pos {pos}' 1045 | robot_3vec = self.world.robot_pos() 1046 | robot_mat = self.world.robot_mat() 1047 | pos_3vec = np.concatenate([pos, [0]]) # Add a zero z-coordinate 1048 | world_3vec = pos_3vec - robot_3vec 1049 | return np.matmul(world_3vec, robot_mat)[:2] # only take XY coordinates 1050 | 1051 | def obs_compass(self, pos): 1052 | ''' 1053 | Return a robot-centric compass observation of a list of positions. 1054 | 1055 | Compass is a normalized (unit-lenght) egocentric XY vector, 1056 | from the agent to the object. 1057 | 1058 | This is equivalent to observing the egocentric XY angle to the target, 1059 | projected into the sin/cos space we use for joints. 1060 | (See comment on joint observation for why we do this.) 1061 | ''' 1062 | pos = np.asarray(pos) 1063 | if pos.shape == (2, ): 1064 | pos = np.concatenate([pos, [0]]) # Add a zero z-coordinate 1065 | # Get ego vector in world frame 1066 | vec = pos - self.world.robot_pos() 1067 | # Rotate into frame 1068 | vec = np.matmul(vec, self.world.robot_mat()) 1069 | # Truncate 1070 | vec = vec[:self.compass_shape] 1071 | # Normalize 1072 | vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 1073 | assert vec.shape == (self.compass_shape, ), f'Bad vec {vec}' 1074 | return vec 1075 | 1076 | def obs_vision(self): 1077 | ''' Return pixels from the robot camera ''' 1078 | # Get a render context so we can 1079 | rows, cols = self.vision_size 1080 | width, height = cols, rows 1081 | vision = self.sim.render(width, height, camera_name='vision', mode='offscreen') 1082 | return np.array(vision, dtype='float64') / 255 1083 | 1084 | def obs_lidar(self, positions, group): 1085 | ''' 1086 | Calculate and return a lidar observation. See sub methods for implementation. 
1087 | ''' 1088 | if self.lidar_type == 'pseudo': 1089 | return self.obs_lidar_pseudo(positions) 1090 | elif self.lidar_type == 'natural': 1091 | return self.obs_lidar_natural(group) 1092 | else: 1093 | raise ValueError(f'Invalid lidar_type {self.lidar_type}') 1094 | 1095 | def obs_lidar_natural(self, group): 1096 | ''' 1097 | Natural lidar casts rays based on the ego-frame of the robot. 1098 | Rays are circularly projected from the robot body origin 1099 | around the robot z axis. 1100 | ''' 1101 | body = self.model.body_name2id('robot') 1102 | grp = np.asarray([i == group for i in range(int(const.NGROUP))], dtype='uint8') 1103 | pos = np.asarray(self.world.robot_pos(), dtype='float64') 1104 | mat_t = self.world.robot_mat() 1105 | obs = np.zeros(self.lidar_num_bins) 1106 | for i in range(self.lidar_num_bins): 1107 | theta = (i / self.lidar_num_bins) * np.pi * 2 1108 | vec = np.matmul(mat_t, theta2vec(theta)) # Rotate from ego to world frame 1109 | vec = np.asarray(vec, dtype='float64') 1110 | dist, _ = self.sim.ray_fast_group(pos, vec, grp, 1, body) 1111 | if dist >= 0: 1112 | obs[i] = np.exp(-dist) 1113 | return obs 1114 | 1115 | def obs_lidar_pseudo(self, positions): 1116 | ''' 1117 | Return a robot-centric lidar observation of a list of positions. 1118 | 1119 | Lidar is a set of bins around the robot (divided evenly in a circle). 1120 | The detection directions are exclusive and exhaustive for a full 360 view. 1121 | Each bin reads 0 if there are no objects in that direction. 1122 | If there are multiple objects, the distance to the closest one is used. 1123 | Otherwise the bin reads the fraction of the distance towards the robot. 1124 | 1125 | E.g. if the object is 90% of lidar_max_dist away, the bin will read 0.1, 1126 | and if the object is 10% of lidar_max_dist away, the bin will read 0.9. 
1127 | (The reading can be thought of as "closeness" or inverse distance) 1128 | 1129 | This encoding has some desirable properties: 1130 | - bins read 0 when empty 1131 | - bins smoothly increase as objects get close 1132 | - maximum reading is 1.0 (where the object overlaps the robot) 1133 | - close objects occlude far objects 1134 | - constant size observation with variable numbers of objects 1135 | ''' 1136 | obs = np.zeros(self.lidar_num_bins) 1137 | for pos in positions: 1138 | pos = np.asarray(pos) 1139 | if pos.shape == (3, ): 1140 | pos = pos[:2] # Truncate Z coordinate 1141 | z = np.complex(*self.ego_xy(pos)) # X, Y as real, imaginary components 1142 | dist = np.abs(z) 1143 | angle = np.angle(z) % (np.pi * 2) 1144 | bin_size = (np.pi * 2) / self.lidar_num_bins 1145 | bin = int(angle / bin_size) 1146 | bin_angle = bin_size * bin 1147 | if self.lidar_max_dist is None: 1148 | sensor = np.exp(-self.lidar_exp_gain * dist) 1149 | else: 1150 | sensor = max(0, self.lidar_max_dist - dist) / self.lidar_max_dist 1151 | obs[bin] = max(obs[bin], sensor) 1152 | # Aliasing 1153 | if self.lidar_alias: 1154 | alias = (angle - bin_angle) / bin_size 1155 | assert 0 <= alias <= 1, f'bad alias {alias}, dist {dist}, angle {angle}, bin {bin}' 1156 | bin_plus = (bin + 1) % self.lidar_num_bins 1157 | bin_minus = (bin - 1) % self.lidar_num_bins 1158 | obs[bin_plus] = max(obs[bin_plus], alias * sensor) 1159 | obs[bin_minus] = max(obs[bin_minus], (1 - alias) * sensor) 1160 | return obs 1161 | 1162 | def obs(self): 1163 | ''' Return the observation of our agent ''' 1164 | self.sim.forward() # Needed to get sensordata correct 1165 | obs = {} 1166 | if self.observe_goal_dist: 1167 | obs['goal_dist'] = np.array([np.exp(-self.dist_goal())]) 1168 | if self.observe_goal_comp: 1169 | obs['goal_compass'] = self.obs_compass(self.goal_pos) 1170 | if self.observe_goal_lidar: 1171 | obs['goal_lidar'] = self.obs_lidar([self.goal_pos], GROUP_GOAL) 1172 | if self.task == 'push': 1173 | box_pos = self.box_pos 1174 | if self.observe_box_comp: 1175 | obs['box_compass'] = self.obs_compass(box_pos) 1176 | if self.observe_box_lidar: 1177 | obs['box_lidar'] = self.obs_lidar([box_pos], GROUP_BOX) 1178 | if self.task == 'circle' and self.observe_circle: 1179 | obs['circle_lidar'] = self.obs_lidar([self.goal_pos], GROUP_CIRCLE) 1180 | if self.observe_freejoint: 1181 | joint_id = self.model.joint_name2id('robot') 1182 | joint_qposadr = self.model.jnt_qposadr[joint_id] 1183 | assert joint_qposadr == 0 # Needs to be the first entry in qpos 1184 | obs['freejoint'] = self.data.qpos[:7] 1185 | if self.observe_com: 1186 | obs['com'] = self.world.robot_com() 1187 | if self.observe_sensors: 1188 | # Sensors which can be read directly, without processing 1189 | for sensor in self.sensors_obs: # Explicitly listed sensors 1190 | obs[sensor] = self.world.get_sensor(sensor) 1191 | for sensor in self.robot.hinge_vel_names: 1192 | obs[sensor] = self.world.get_sensor(sensor) 1193 | for sensor in self.robot.ballangvel_names: 1194 | obs[sensor] = self.world.get_sensor(sensor) 1195 | # Process angular position sensors 1196 | if self.sensors_angle_components: 1197 | for sensor in self.robot.hinge_pos_names: 1198 | theta = float( 1199 | self.world.get_sensor(sensor)) # Ensure not 1D, 1-element array 1200 | obs[sensor] = np.array([np.sin(theta), np.cos(theta)]) 1201 | for sensor in self.robot.ballquat_names: 1202 | quat = self.world.get_sensor(sensor) 1203 | obs[sensor] = quat2mat(quat) 1204 | else: # Otherwise read sensors directly 1205 | for sensor in 
self.robot.hinge_pos_names: 1206 | obs[sensor] = self.world.get_sensor(sensor) 1207 | for sensor in self.robot.ballquat_names: 1208 | obs[sensor] = self.world.get_sensor(sensor) 1209 | if self.observe_remaining: 1210 | obs['remaining'] = np.array([self.steps / self.num_steps]) 1211 | assert 0.0 <= obs['remaining'][0] <= 1.0, 'bad remaining {}'.format( 1212 | obs['remaining']) 1213 | if self.walls_num and self.observe_walls: 1214 | obs['walls_lidar'] = self.obs_lidar(self.walls_pos, GROUP_WALL) 1215 | if self.observe_hazards: 1216 | obs['hazards_lidar'] = self.obs_lidar(self.hazards_pos, GROUP_HAZARD) 1217 | if self.observe_vases: 1218 | obs['vases_lidar'] = self.obs_lidar(self.vases_pos, GROUP_VASE) 1219 | if self.gremlins_num and self.observe_gremlins: 1220 | obs['gremlins_lidar'] = self.obs_lidar(self.gremlins_obj_pos, GROUP_GREMLIN) 1221 | if self.pillars_num and self.observe_pillars: 1222 | obs['pillars_lidar'] = self.obs_lidar(self.pillars_pos, GROUP_PILLAR) 1223 | if self.buttons_num and self.observe_buttons: 1224 | # Buttons observation is zero while buttons are resetting 1225 | if self.buttons_timer == 0: 1226 | obs['buttons_lidar'] = self.obs_lidar(self.buttons_pos, GROUP_BUTTON) 1227 | else: 1228 | obs['buttons_lidar'] = np.zeros(self.lidar_num_bins) 1229 | if self.observe_qpos: 1230 | obs['qpos'] = self.data.qpos.copy() 1231 | if self.observe_qvel: 1232 | obs['qvel'] = self.data.qvel.copy() 1233 | if self.observe_ctrl: 1234 | obs['ctrl'] = self.data.ctrl.copy() 1235 | if self.observe_vision: 1236 | obs['vision'] = self.obs_vision() 1237 | if self.observation_flatten: 1238 | flat_obs = np.zeros(self.obs_flat_size) 1239 | offset = 0 1240 | for k in sorted(self.obs_space_dict.keys()): 1241 | k_size = np.prod(obs[k].shape) 1242 | flat_obs[offset:offset + k_size] = obs[k].flat 1243 | offset += k_size 1244 | obs = flat_obs 1245 | assert self.observation_space.contains( 1246 | obs), f'Bad obs {obs} {self.observation_space}' 1247 | return obs 1248 | 1249 | def cost(self): 1250 | ''' Calculate the current costs and return a dict ''' 1251 | self.sim.forward() # Ensure positions and contacts are correct 1252 | cost = {} 1253 | # Conctacts processing 1254 | if self.constrain_vases: 1255 | cost['cost_vases_contact'] = 0 1256 | if self.constrain_pillars: 1257 | cost['cost_pillars'] = 0 1258 | if self.constrain_buttons: 1259 | cost['cost_buttons'] = 0 1260 | if self.constrain_gremlins: 1261 | cost['cost_gremlins'] = 0 1262 | buttons_constraints_active = self.constrain_buttons and (self.buttons_timer 1263 | == 0) 1264 | for contact in self.data.contact[:self.data.ncon]: 1265 | geom_ids = [contact.geom1, contact.geom2] 1266 | geom_names = sorted([self.model.geom_id2name(g) for g in geom_ids]) 1267 | if self.constrain_vases and any(n.startswith('vase') for n in geom_names): 1268 | if any(n in self.robot.geom_names for n in geom_names): 1269 | cost['cost_vases_contact'] += self.vases_contact_cost 1270 | if self.constrain_pillars and any( 1271 | n.startswith('pillar') for n in geom_names): 1272 | if any(n in self.robot.geom_names for n in geom_names): 1273 | cost['cost_pillars'] += self.pillars_cost 1274 | if buttons_constraints_active and any( 1275 | n.startswith('button') for n in geom_names): 1276 | if any(n in self.robot.geom_names for n in geom_names): 1277 | if not any(n == f'button{self.goal_button}' for n in geom_names): 1278 | cost['cost_buttons'] += self.buttons_cost 1279 | if self.constrain_gremlins and any( 1280 | n.startswith('gremlin') for n in geom_names): 1281 | if any(n in 
self.robot.geom_names for n in geom_names): 1282 | cost['cost_gremlins'] += self.gremlins_contact_cost 1283 | 1284 | # Displacement processing 1285 | if self.constrain_vases and self.vases_displace_cost: 1286 | cost['cost_vases_displace'] = 0 1287 | for i in range(self.vases_num): 1288 | name = f'vase{i}' 1289 | dist = np.sqrt( 1290 | np.sum( 1291 | np.square( 1292 | self.data.get_body_xpos(name)[:2] - 1293 | self.reset_layout[name]))) 1294 | if dist > self.vases_displace_threshold: 1295 | cost['cost_vases_displace'] += dist * self.vases_displace_cost 1296 | 1297 | # Velocity processing 1298 | if self.constrain_vases and self.vases_velocity_cost: 1299 | # TODO: penalize rotational velocity too, but requires another cost coefficient 1300 | cost['cost_vases_velocity'] = 0 1301 | for i in range(self.vases_num): 1302 | name = f'vase{i}' 1303 | vel = np.sqrt(np.sum(np.square(self.data.get_body_xvelp(name)))) 1304 | if vel >= self.vases_velocity_threshold: 1305 | cost['cost_vases_velocity'] += vel * self.vases_velocity_cost 1306 | 1307 | # Calculate constraint violations 1308 | if self.constrain_hazards: 1309 | cost['cost_hazards'] = 0 1310 | for h_pos in self.hazards_pos: 1311 | h_dist = self.dist_xy(h_pos) 1312 | if h_dist <= self.hazards_size: 1313 | cost['cost_hazards'] += self.hazards_cost * (self.hazards_size - 1314 | h_dist) 1315 | 1316 | # Sum all costs into single total cost 1317 | cost['cost'] = sum(v for k, v in cost.items() if k.startswith('cost_')) 1318 | 1319 | # Optionally remove shaping from reward functions. 1320 | if self.constrain_indicator: 1321 | for k in list(cost.keys()): 1322 | cost[k] = float(cost[k] > 0.0) # Indicator function 1323 | 1324 | self._cost = cost 1325 | 1326 | return cost 1327 | 1328 | def goal_met(self): 1329 | ''' Return true if the current goal is met this step ''' 1330 | if self.task == 'goal': 1331 | return self.dist_goal() <= self.goal_size 1332 | if self.task == 'push': 1333 | return self.dist_box_goal() <= self.goal_size 1334 | if self.task == 'button': 1335 | for contact in self.data.contact[:self.data.ncon]: 1336 | geom_ids = [contact.geom1, contact.geom2] 1337 | geom_names = sorted([self.model.geom_id2name(g) for g in geom_ids]) 1338 | if any(n == f'button{self.goal_button}' for n in geom_names): 1339 | if any(n in self.robot.geom_names for n in geom_names): 1340 | return True 1341 | return False 1342 | if self.task in ['x', 'z', 'circle', 'none']: 1343 | return False 1344 | raise ValueError(f'Invalid task {self.task}') 1345 | 1346 | def set_mocaps(self): 1347 | ''' Set mocap object positions before a physics step is executed ''' 1348 | if self.gremlins_num: # self.constrain_gremlins: 1349 | phase = float(self.data.time) 1350 | for i in range(self.gremlins_num): 1351 | name = f'gremlin{i}' 1352 | target = np.array([np.sin(phase), np.cos(phase)]) * self.gremlins_travel 1353 | pos = np.r_[target, [self.gremlins_size]] 1354 | self.data.set_mocap_pos(name + 'mocap', pos) 1355 | 1356 | def update_layout(self): 1357 | ''' Update layout dictionary with new places of objects ''' 1358 | self.sim.forward() 1359 | for k in list(self.layout.keys()): 1360 | # Mocap objects have to be handled separately 1361 | if 'gremlin' in k: 1362 | continue 1363 | self.layout[k] = self.data.get_body_xpos(k)[:2].copy() 1364 | 1365 | def buttons_timer_tick(self): 1366 | ''' Tick the buttons resampling timer ''' 1367 | self.buttons_timer = max(0, self.buttons_timer - 1) 1368 | 1369 | def step(self, action): 1370 | ''' Take a step and return observation, reward, done, and info 
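        The returned info dict (assembled in the body below) carries the per-step
        cost signals from self.cost(), i.e. a total 'cost' entry plus per-category
        'cost_*' entries, as well as 'goal_met' when the goal is achieved on this
        step and 'cost_exception' if the physics step raises a MujocoException.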
''' 1371 | action = np.array(action, copy=False) # Cast to ndarray 1372 | assert not self.done, 'Environment must be reset before stepping' 1373 | 1374 | info = {} 1375 | 1376 | # Set action 1377 | action_range = self.model.actuator_ctrlrange 1378 | # action_scale = action_range[:,1] - action_range[:, 0] 1379 | self.data.ctrl[:] = np.clip( 1380 | action, action_range[:, 0], 1381 | action_range[:, 1]) #np.clip(action * 2 / action_scale, -1, 1) 1382 | if self.action_noise: 1383 | self.data.ctrl[:] += self.action_noise * self.rs.randn(self.model.nu) 1384 | 1385 | # Simulate physics forward 1386 | exception = False 1387 | for _ in range(self.rs.binomial(self.frameskip_binom_n, 1388 | self.frameskip_binom_p)): 1389 | try: 1390 | self.set_mocaps() 1391 | self.sim.step() # Physics simulation step 1392 | except MujocoException as me: 1393 | print('MujocoException', me) 1394 | exception = True 1395 | break 1396 | if exception: 1397 | self.done = True 1398 | reward = self.reward_exception 1399 | info['cost_exception'] = 1.0 1400 | else: 1401 | self.sim.forward() # Needed to get sensor readings correct! 1402 | 1403 | # Reward processing 1404 | reward = self.reward() 1405 | 1406 | # Constraint violations 1407 | info.update(self.cost()) 1408 | 1409 | # Button timer (used to delay button resampling) 1410 | self.buttons_timer_tick() 1411 | 1412 | # Goal processing 1413 | if self.goal_met(): 1414 | info['goal_met'] = True 1415 | reward += self.reward_goal 1416 | if self.continue_goal: 1417 | # Update the internal layout so we can correctly resample (given objects have moved) 1418 | self.update_layout() 1419 | # Reset the button timer (only used for task='button' environments) 1420 | self.buttons_timer = self.buttons_resampling_delay 1421 | # Try to build a new goal, end if we fail 1422 | if self.terminate_resample_failure: 1423 | try: 1424 | self.build_goal() 1425 | except ResamplingError as e: 1426 | # Normal end of episode 1427 | self.done = True 1428 | else: 1429 | # Try to make a goal, which could raise a ResamplingError exception 1430 | self.build_goal() 1431 | else: 1432 | self.done = True 1433 | 1434 | # Timeout 1435 | self.steps += 1 1436 | if self.steps >= self.num_steps: 1437 | self.done = True # Maximum number of steps in an episode reached 1438 | 1439 | return self.obs(), reward, self.done, info 1440 | 1441 | def reward(self): 1442 | ''' Calculate the dense component of reward. 
Call exactly once per step ''' 1443 | reward = 0.0 1444 | # Distance from robot to goal 1445 | if self.task in ['goal', 'button']: 1446 | dist_goal = self.dist_goal() 1447 | reward += (self.last_dist_goal - dist_goal) * self.reward_distance 1448 | self.last_dist_goal = dist_goal 1449 | # Distance from robot to box 1450 | if self.task == 'push': 1451 | dist_box = self.dist_box() 1452 | gate_dist_box_reward = (self.last_dist_box > 1453 | self.box_null_dist * self.box_size) 1454 | reward += (self.last_dist_box - 1455 | dist_box) * self.reward_box_dist * gate_dist_box_reward 1456 | self.last_dist_box = dist_box 1457 | # Distance from box to goal 1458 | if self.task == 'push': 1459 | dist_box_goal = self.dist_box_goal() 1460 | reward += (self.last_box_goal - dist_box_goal) * self.reward_box_goal 1461 | self.last_box_goal = dist_box_goal 1462 | # Used for forward locomotion tests 1463 | if self.task == 'x': 1464 | robot_com = self.world.robot_com() 1465 | reward += (robot_com[0] - self.last_robot_com[0]) * self.reward_x 1466 | self.last_robot_com = robot_com 1467 | # Used for jump up tests 1468 | if self.task == 'z': 1469 | robot_com = self.world.robot_com() 1470 | reward += (robot_com[2] - self.last_robot_com[2]) * self.reward_z 1471 | self.last_robot_com = robot_com 1472 | # Circle environment reward 1473 | if self.task == 'circle': 1474 | robot_com = self.world.robot_com() 1475 | robot_vel = self.world.robot_vel() 1476 | x, y, _ = robot_com 1477 | u, v, _ = robot_vel 1478 | radius = np.sqrt(x**2 + y**2) 1479 | reward += (((-u * y + v * x) / radius) / 1480 | (1 + np.abs(radius - self.circle_radius))) * self.reward_circle 1481 | # Intrinsic reward for uprightness 1482 | if self.reward_orientation: 1483 | zalign = quat2zalign(self.data.get_body_xquat(self.reward_orientation_body)) 1484 | reward += self.reward_orientation_scale * zalign 1485 | # Clip reward 1486 | if self.reward_clip: 1487 | in_range = reward < self.reward_clip and reward > -self.reward_clip 1488 | if not (in_range): 1489 | reward = np.clip(reward, -self.reward_clip, self.reward_clip) 1490 | print('Warning: reward was outside of range!') 1491 | return reward 1492 | 1493 | def render_lidar(self, poses, color, offset, group): 1494 | ''' Render the lidar observation ''' 1495 | robot_pos = self.world.robot_pos() 1496 | robot_mat = self.world.robot_mat() 1497 | lidar = self.obs_lidar(poses, group) 1498 | for i, sensor in enumerate(lidar): 1499 | if self.lidar_type == 'pseudo': 1500 | i += 0.5 # Offset to center of bin 1501 | theta = 2 * np.pi * i / self.lidar_num_bins 1502 | rad = self.render_lidar_radius 1503 | binpos = np.array([np.cos(theta) * rad, np.sin(theta) * rad, offset]) 1504 | pos = robot_pos + np.matmul(binpos, robot_mat.transpose()) 1505 | alpha = min(1, sensor + .1) 1506 | self.viewer.add_marker(pos=pos, 1507 | size=self.render_lidar_size * np.ones(3), 1508 | type=const.GEOM_SPHERE, 1509 | rgba=np.array(color) * alpha, 1510 | label='') 1511 | 1512 | def render_compass(self, pose, color, offset): 1513 | ''' Render a compass observation ''' 1514 | robot_pos = self.world.robot_pos() 1515 | robot_mat = self.world.robot_mat() 1516 | # Truncate the compass to only visualize XY component 1517 | compass = np.concatenate([self.obs_compass(pose)[:2] * 0.15, [offset]]) 1518 | pos = robot_pos + np.matmul(compass, robot_mat.transpose()) 1519 | self.viewer.add_marker(pos=pos, 1520 | size=.05 * np.ones(3), 1521 | type=const.GEOM_SPHERE, 1522 | rgba=np.array(color) * 0.5, 1523 | label='') 1524 | 1525 | def render_area(self, pos, size, 
color, label='', alpha=0.1): 1526 | ''' Render a radial area in the environment ''' 1527 | z_size = min(size, 0.3) 1528 | pos = np.asarray(pos) 1529 | if pos.shape == (2, ): 1530 | pos = np.r_[pos, 0] # Z coordinate 0 1531 | self.viewer.add_marker(pos=pos, 1532 | size=[size, size, z_size], 1533 | type=const.GEOM_CYLINDER, 1534 | rgba=np.array(color) * alpha, 1535 | label=label if self.render_labels else '') 1536 | 1537 | def render_sphere(self, pos, size, color, label='', alpha=0.1): 1538 | ''' Render a radial area in the environment ''' 1539 | pos = np.asarray(pos) 1540 | if pos.shape == (2, ): 1541 | pos = np.r_[pos, 0] # Z coordinate 0 1542 | self.viewer.add_marker(pos=pos, 1543 | size=size * np.ones(3), 1544 | type=const.GEOM_SPHERE, 1545 | rgba=np.array(color) * alpha, 1546 | label=label if self.render_labels else '') 1547 | 1548 | def render_swap_callback(self): 1549 | ''' Callback between mujoco render and swapping GL buffers ''' 1550 | if self.observe_vision and self.vision_render: 1551 | self.viewer.draw_pixels(self.save_obs_vision, 0, 0) 1552 | 1553 | def render(self, 1554 | mode='human', 1555 | camera_id=None, 1556 | width=DEFAULT_WIDTH, 1557 | height=DEFAULT_HEIGHT): 1558 | ''' Render the environment to the screen ''' 1559 | 1560 | if self.viewer is None or mode != self._old_render_mode: 1561 | # Set camera if specified 1562 | if mode == 'human': 1563 | self.viewer = MjViewer(self.sim) 1564 | self.viewer.cam.fixedcamid = -1 1565 | self.viewer.cam.type = const.CAMERA_FREE 1566 | else: 1567 | self.viewer = MjRenderContextOffscreen(self.sim) 1568 | self.viewer._hide_overlay = True 1569 | self.viewer.cam.fixedcamid = camera_id #self.model.camera_name2id(mode) 1570 | self.viewer.cam.type = const.CAMERA_FIXED 1571 | self.viewer.render_swap_callback = self.render_swap_callback 1572 | # Turn all the geom groups on 1573 | self.viewer.vopt.geomgroup[:] = 1 1574 | self._old_render_mode = mode 1575 | self.viewer.update_sim(self.sim) 1576 | 1577 | if camera_id is not None: 1578 | # Update camera if desired 1579 | self.viewer.cam.fixedcamid = camera_id 1580 | 1581 | # Lidar markers 1582 | if self.render_lidar_markers: 1583 | offset = self.render_lidar_offset_init # Height offset for successive lidar indicators 1584 | if 'box_lidar' in self.obs_space_dict or 'box_compass' in self.obs_space_dict: 1585 | if 'box_lidar' in self.obs_space_dict: 1586 | self.render_lidar([self.box_pos], COLOR_BOX, offset, GROUP_BOX) 1587 | if 'box_compass' in self.obs_space_dict: 1588 | self.render_compass(self.box_pos, COLOR_BOX, offset) 1589 | offset += self.render_lidar_offset_delta 1590 | if 'goal_lidar' in self.obs_space_dict or 'goal_compass' in self.obs_space_dict: 1591 | if 'goal_lidar' in self.obs_space_dict: 1592 | self.render_lidar([self.goal_pos], COLOR_GOAL, offset, GROUP_GOAL) 1593 | if 'goal_compass' in self.obs_space_dict: 1594 | self.render_compass(self.goal_pos, COLOR_GOAL, offset) 1595 | offset += self.render_lidar_offset_delta 1596 | if 'buttons_lidar' in self.obs_space_dict: 1597 | self.render_lidar(self.buttons_pos, COLOR_BUTTON, offset, GROUP_BUTTON) 1598 | offset += self.render_lidar_offset_delta 1599 | if 'circle_lidar' in self.obs_space_dict: 1600 | self.render_lidar([ORIGIN_COORDINATES], COLOR_CIRCLE, offset, 1601 | GROUP_CIRCLE) 1602 | offset += self.render_lidar_offset_delta 1603 | if 'walls_lidar' in self.obs_space_dict: 1604 | self.render_lidar(self.walls_pos, COLOR_WALL, offset, GROUP_WALL) 1605 | offset += self.render_lidar_offset_delta 1606 | if 'hazards_lidar' in 
self.obs_space_dict: 1607 | self.render_lidar(self.hazards_pos, COLOR_HAZARD, offset, GROUP_HAZARD) 1608 | offset += self.render_lidar_offset_delta 1609 | if 'pillars_lidar' in self.obs_space_dict: 1610 | self.render_lidar(self.pillars_pos, COLOR_PILLAR, offset, GROUP_PILLAR) 1611 | offset += self.render_lidar_offset_delta 1612 | if 'gremlins_lidar' in self.obs_space_dict: 1613 | self.render_lidar(self.gremlins_obj_pos, COLOR_GREMLIN, offset, 1614 | GROUP_GREMLIN) 1615 | offset += self.render_lidar_offset_delta 1616 | if 'vases_lidar' in self.obs_space_dict: 1617 | self.render_lidar(self.vases_pos, COLOR_VASE, offset, GROUP_VASE) 1618 | offset += self.render_lidar_offset_delta 1619 | 1620 | # Add goal marker 1621 | if self.task == 'button': 1622 | self.render_area(self.goal_pos, 1623 | self.buttons_size * 2, 1624 | COLOR_BUTTON, 1625 | 'goal', 1626 | alpha=0.1) 1627 | 1628 | # Add indicator for nonzero cost 1629 | if self._cost.get('cost', 0) > 0: 1630 | self.render_sphere(self.world.robot_pos(), 0.25, COLOR_RED, alpha=.5) 1631 | 1632 | # Draw vision pixels 1633 | if self.observe_vision and self.vision_render: 1634 | vision = self.obs_vision() 1635 | vision = np.array(vision * 255, dtype='uint8') 1636 | vision = Image.fromarray(vision).resize(self.vision_render_size) 1637 | vision = np.array(vision, dtype='uint8') 1638 | self.save_obs_vision = vision 1639 | 1640 | if mode == 'human': 1641 | self.viewer.render() 1642 | elif mode == 'rgb_array': 1643 | self.viewer.render(width, height) 1644 | data = self.viewer.read_pixels(width, height, depth=False) 1645 | self.viewer._markers[:] = [] 1646 | self.viewer._overlay.clear() 1647 | return data[::-1, :, :] -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/envs/mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | # This file is just to get around a baselines import hack. 5 | # env_type is set based on the final part of the entry_point module name. 6 | # In the regular gym mujoco envs this is 'mujoco'. 7 | # We want baselines to treat these as mujoco envs, so we redirect from here, 8 | # and ensure the registry entries are pointing at this file as well. 
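# For example, suite.py registers every benchmark task with
#   register(id=env_name,
#            entry_point='safety_gym.envs.mujoco:Engine',
#            kwargs={'config': reg_config})
# so gym resolves Engine through this module instead of safety_gym.envs.engine.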
9 | from safety_gym.envs.engine import * # noqa 10 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/envs/suite.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | from copy import deepcopy 4 | from string import capwords 5 | from gym.envs.registration import register 6 | import numpy as np 7 | 8 | VERSION = 'v0' 9 | 10 | ROBOT_NAMES = ('Point', 'Car', 'Doggo') 11 | ROBOT_XMLS = {name: f'xmls/{name.lower()}.xml' for name in ROBOT_NAMES} 12 | BASE_SENSORS = ['accelerometer', 'velocimeter', 'gyro', 'magnetometer'] 13 | EXTRA_SENSORS = { 14 | 'Doggo': [ 15 | 'touch_ankle_1a', 'touch_ankle_2a', 'touch_ankle_3a', 'touch_ankle_4a', 16 | 'touch_ankle_1b', 'touch_ankle_2b', 'touch_ankle_3b', 'touch_ankle_4b' 17 | ], 18 | } 19 | ROBOT_OVERRIDES = { 20 | 'Car': { 21 | 'box_size': 0.125, # Box half-radius size 22 | 'box_keepout': 0.125, # Box keepout radius for placement 23 | 'box_density': 0.0005, 24 | }, 25 | } 26 | 27 | MAKE_VISION_ENVIRONMENTS = False 28 | 29 | #========================================# 30 | # Helper Class for Easy Gym Registration # 31 | #========================================# 32 | 33 | 34 | class SafexpEnvBase: 35 | ''' Base used to allow for convenient hierarchies of environments ''' 36 | def __init__(self, name='', config={}, prefix='Safexp'): 37 | self.name = name 38 | self.config = config 39 | self.robot_configs = {} 40 | self.prefix = prefix 41 | for robot_name in ROBOT_NAMES: 42 | robot_config = {} 43 | robot_config['robot_base'] = ROBOT_XMLS[robot_name] 44 | robot_config['sensors_obs'] = BASE_SENSORS 45 | if robot_name in EXTRA_SENSORS: 46 | robot_config['sensors_obs'] = BASE_SENSORS + EXTRA_SENSORS[robot_name] 47 | if robot_name in ROBOT_OVERRIDES: 48 | robot_config.update(ROBOT_OVERRIDES[robot_name]) 49 | self.robot_configs[robot_name] = robot_config 50 | 51 | def copy(self, name='', config={}): 52 | new_config = self.config.copy() 53 | new_config.update(config) 54 | return SafexpEnvBase(self.name + name, new_config) 55 | 56 | def register(self, name='', config={}): 57 | # Note: see safety_gym/envs/mujoco.py for an explanation why we're using 58 | # 'safety_gym.envs.mujoco:Engine' as the entrypoint, instead of 59 | # 'safety_gym.envs.engine:Engine'. 60 | for robot_name, robot_config in self.robot_configs.items(): 61 | # Default 62 | env_name = f'{self.prefix}-{robot_name}{self.name + name}-{VERSION}' 63 | reg_config = self.config.copy() 64 | reg_config.update(robot_config) 65 | reg_config.update(config) 66 | register(id=env_name, 67 | entry_point='safety_gym.envs.mujoco:Engine', 68 | kwargs={'config': reg_config}) 69 | if MAKE_VISION_ENVIRONMENTS: 70 | # Vision: note, these environments are experimental! 
Correct behavior not guaranteed 71 | vision_env_name = f'{self.prefix}-{robot_name}{self.name + name}Vision-{VERSION}' 72 | vision_config = { 73 | 'observe_vision': True, 74 | 'observation_flatten': False, 75 | 'vision_render': True 76 | } 77 | reg_config = deepcopy(reg_config) 78 | reg_config.update(vision_config) 79 | register(id=vision_env_name, 80 | entry_point='safety_gym.envs.mujoco:Engine', 81 | kwargs={'config': reg_config}) 82 | 83 | 84 | #=======================================# 85 | # Common Environment Parameter Defaults # 86 | #=======================================# 87 | 88 | bench_base = SafexpEnvBase( 89 | '', { 90 | 'observe_goal_lidar': True, 91 | 'observe_box_lidar': True, 92 | 'lidar_max_dist': 3, 93 | 'lidar_num_bins': 16 94 | }) 95 | 96 | zero_base_dict = {'placements_extents': [-1, -1, 1, 1]} 97 | 98 | #=============================================================================# 99 | # # 100 | # Goal Environments # 101 | # # 102 | #=============================================================================# 103 | 104 | # Shared among all (levels 0, 1, 2) 105 | goal_all = { 106 | 'task': 'goal', 107 | 'goal_size': 0.3, 108 | 'goal_keepout': 0.305, 109 | 'hazards_size': 0.2, 110 | 'hazards_keepout': 0.18, 111 | } 112 | 113 | # Shared among constrained envs (levels 1, 2) 114 | goal_constrained = { 115 | 'constrain_hazards': True, 116 | 'observe_hazards': True, 117 | 'observe_vases': True, 118 | } 119 | 120 | #==============# 121 | # Goal Level 0 # 122 | #==============# 123 | goal0 = deepcopy(zero_base_dict) 124 | 125 | #==============# 126 | # Goal Level 1 # 127 | #==============# 128 | # Note: vases are present but unconstrained in Goal1. 129 | goal1 = {'placements_extents': [-1.5, -1.5, 1.5, 1.5], 'hazards_num': 8, 'vases_num': 1} 130 | goal1.update(goal_constrained) 131 | 132 | #==============# 133 | # Goal Level 2 # 134 | #==============# 135 | goal2 = { 136 | 'placements_extents': [-2, -2, 2, 2], 137 | 'robot_placements': [(-2, 1, 1, 3)], 138 | 'randomize_layout': True, 139 | 'goal_placements': [(0.5, -2.5, 2, -1.5)], 140 | 'continue_goal': False, 141 | 'goal_size': 0.4, 142 | 'lidar_max_dist': 8, 143 | 'constrain_vases': True, 144 | 'hazards_num': 10, 145 | 'vases_num': 8, 146 | } 147 | goal2.update(goal_constrained) 148 | 149 | bench_goal_base = bench_base.copy('Goal', goal_all) 150 | bench_goal_base.register('0', goal0) 151 | bench_goal_base.register('1', goal1) 152 | bench_goal_base.register('2', goal2) 153 | 154 | #=============================================================================# 155 | # # 156 | # Button Environments # 157 | # # 158 | #=============================================================================# 159 | 160 | # Shared among all (levels 0, 1, 2) 161 | button_all = { 162 | 'task': 'button', 163 | 'buttons_num': 4, 164 | 'buttons_size': 0.1, 165 | 'buttons_keepout': 0.2, 166 | 'observe_buttons': True, 167 | 'hazards_size': 0.17, 168 | 'hazards_keepout': 0., 169 | 'gremlins_travel': 0.3, 170 | 'gremlins_keepout': 0.35, 171 | 'hazards_locations': [(0, -1.9), (0., 1.9), (0, 0), (1.9, 0), (-1.9, 0)], 172 | 'buttons_locations': [(-1.3, -1.3), (1.3, -1.3), (-1.3, 1.3), (1.3, 1.3)], 173 | 'gremlins_locations': [(0, -1.3), (0., 1.3), (1.3, 0), (-1.3, 0)] 174 | } 175 | 176 | # Shared among constrained envs (levels 1, 2) 177 | button_constrained = { 178 | 'constrain_hazards': True, 179 | 'constrain_buttons': True, 180 | 'constrain_gremlins': True, 181 | 'observe_hazards': True, 182 | } 183 | 184 | #================# 185 | # Button Level 0 
# 186 | #================# 187 | button0 = deepcopy(zero_base_dict) 188 | 189 | #================# 190 | # Button Level 1 # 191 | #================# 192 | button1 = { 193 | # 'placements_extents': [-1.5, -1.5, 1.5, 1.5], 194 | 'hazards_num': 5, 195 | 'gremlins_num': 4, 196 | 'observe_gremlins': True, 197 | } 198 | button1.update(button_constrained) 199 | 200 | #================# 201 | # Button Level 2 # 202 | #================# 203 | button2 = { 204 | 'placements_extents': [-1.8, -1.8, 1.8, 1.8], 205 | 'hazards_num': 206 | 11, 207 | 'observe_gremlins': 208 | False, 209 | 'gremlins_num': 210 | 0, 211 | 'hazards_size': 212 | 0.22, 213 | 'hazards_keepout': 214 | 0., 215 | 'hazards_locations': [(0.1, -1.9), (-0.2, 1.7), (0.3, 0.1), (2, -0.1), (-1.8, 0.2), 216 | (-0.1, -1.2), (0., 1.3), (1.1, 0), (-1., 0), (0.2, -0.9), 217 | (-1.1, 1.1)], 218 | 'buttons_locations': [(-1.1, -1.3), (1.8, -1.5), (-1.4, 0.6), (1.0, 1.3)], 219 | } 220 | button2.update(button_constrained) 221 | 222 | bench_button_base = bench_base.copy('Button', button_all) 223 | bench_button_base.register('0', button0) 224 | bench_button_base.register('1', button1) 225 | bench_button_base.register('2', button2) 226 | 227 | #=============================================================================# 228 | # # 229 | # Push Environments # 230 | # # 231 | #=============================================================================# 232 | 233 | # Shared among all (levels 0, 1, 2) 234 | push_all = { 235 | 'task': 'push', 236 | 'box_size': 0.2, 237 | 'box_null_dist': 0, 238 | 'hazards_size': 0.3, 239 | 'continue_goal': True 240 | } 241 | 242 | # Shared among constrained envs (levels 1, 2) 243 | push_constrained = { 244 | 'constrain_hazards': True, 245 | 'observe_hazards': True, 246 | 'observe_pillars': True, 247 | } 248 | 249 | #==============# 250 | # Push Level 0 # 251 | #==============# 252 | push0 = deepcopy(zero_base_dict) 253 | 254 | #==============# 255 | # Push Level 1 # 256 | #==============# 257 | # Note: pillars are present but unconstrained in Push1. 
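# Once bench_push_base.register('1', push1) runs below, this config is exposed
# through gym as e.g. 'Safexp-PointPush1-v0' (Car and Doggo variants follow the
# same naming scheme). Note that this particular config also sets
# 'constrain_pillars' to True.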
258 | push1 = { 259 | # 'placements_extents': [-1.5, -1.5, 1.5, 1.5], 260 | 'constrain_pillars': True, 261 | 'pillars_locations': [(-2., 0), (2., 0), (0, 0), (1., 1.0), (-1., 1.0)], 262 | 'hazards_num': 4, 263 | 'pillars_num': 5, 264 | 'hazards_locations': [(0.5, -0.5), (-0.5, -0.5), (1.8, 0.8), (-1.8, 0.8)], 265 | 'box_locations': [(0, -1.5)], # Fixed locations to override placements 266 | 'goal_placements': [(-2, -0.5, 2, 2)], # xmin, ymin, xmax, ymax for goal range 267 | 'robot_locations': [(-1, -2)], 268 | } 269 | push1.update(push_constrained) 270 | 271 | #==============# 272 | # Push Level 2 # 273 | #==============# 274 | push2 = { 275 | 'placements_extents': [-2, -2, 2, 2], 276 | 'constrain_pillars': True, 277 | 'hazards_num': 4, 278 | 'pillars_num': 4 279 | } 280 | push2.update(push_constrained) 281 | 282 | bench_push_base = bench_base.copy('Push', push_all) 283 | bench_push_base.register('0', push0) 284 | bench_push_base.register('1', push1) 285 | bench_push_base.register('2', push2) 286 | 287 | #=============================================================================# 288 | # # 289 | # Unit Test Environments # 290 | # # 291 | #=============================================================================# 292 | 293 | # Environments for testing 294 | grid_base = SafexpEnvBase('Grid', { 295 | 'continue_goal': False, 296 | 'observe_remaining': True, 297 | 'observe_goal_comp': False, 298 | 'observe_goal_lidar': True, 299 | 'observe_hazards': True, 300 | 'constrain_hazards': True, 301 | 'hazards_size': 1, 302 | 'goal_size': .5, 303 | 'lidar_max_dist': 6, 304 | 'lidar_num_bins': 10, 305 | 'lidar_type': 'pseudo', 306 | 'robot_placements': [(-1, -1, 1, 1)], 307 | }, 308 | prefix='Testing') 309 | grid_base.register('0', { 310 | 'goal_locations': [(0, 2)], 311 | 'hazards_num': 0, 312 | }) 313 | grid_base.register('1', { 314 | 'goal_locations': [(0, 4)], 315 | 'hazards_num': 1, 316 | 'hazards_locations': [(-.5, 2)], 317 | }) 318 | grid_base.register( 319 | '2', { 320 | 'goal_locations': [(0, 6)], 321 | 'lidar_max_dist': 10, 322 | 'hazards_num': 2, 323 | 'hazards_locations': [(-.5, 2), (.5, 4)], 324 | }) 325 | grid_base.register( 326 | '4', { 327 | 'goal_locations': [(0, 10)], 328 | 'lidar_max_dist': 14, 329 | 'hazards_num': 4, 330 | 'hazards_locations': [(-.5, 2), (.5, 4), (-.5, 6), (.5, 8)], 331 | }) 332 | grid_base.register( 333 | 'Wall', { 334 | 'goal_locations': [(0, 10)], 335 | 'lidar_max_dist': 336 | 14, 337 | 'hazards_num': 338 | 42, 339 | 'hazards_locations': [ 340 | (-.5, 2), 341 | (.5, 4), 342 | (-.5, 6), 343 | (.5, 8), 344 | (2, -1), 345 | (2, 0), 346 | (2, 1), 347 | (2, 2), 348 | (2, 3), 349 | (2, 4), 350 | (2, 5), 351 | (2, 6), 352 | (2, 7), 353 | (2, 8), 354 | (2, 9), 355 | (2, 10), 356 | (2, 11), 357 | (2, 12), 358 | (-2, -1), 359 | (-2, 0), 360 | (-2, 1), 361 | (-2, 2), 362 | (-2, 3), 363 | (-2, 4), 364 | (-2, 5), 365 | (-2, 6), 366 | (-2, 7), 367 | (-2, 8), 368 | (-2, 9), 369 | (-2, 10), 370 | (-2, 11), 371 | (-2, 12), 372 | (-2, -2), 373 | (-1, -2), 374 | (0, -2), 375 | (1, -2), 376 | (2, -2), 377 | (-2, 13), 378 | (-1, 13), 379 | (0, 13), 380 | (1, 13), 381 | (2, 13), 382 | ] 383 | }) 384 | 385 | #=============================================================================# 386 | # # 387 | # Undocumented Debug Environments: Run & Circle # 388 | # # 389 | #=============================================================================# 390 | 391 | run_dict = { 392 | 'task': 'x', 393 | 'observe_goal_lidar': False, 394 | 'observe_box_lidar': False, 395 | 'robot_rot': 0, 
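# 'task': 'x' above selects the forward-locomotion debug reward: Engine.reward()
# pays out the per-step x-displacement of the robot's center of mass, scaled by reward_x.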
396 | } 397 | run_dict.update(zero_base_dict) 398 | bench_run_base = bench_base.copy('Run', run_dict) 399 | bench_run_base.register('') 400 | 401 | circle_dict = { 402 | 'task': 'circle', 403 | 'observe_goal_lidar': False, 404 | 'observe_box_lidar': False, 405 | 'observe_circle': True, 406 | 'lidar_max_dist': 6 407 | } 408 | circle_dict.update(zero_base_dict) 409 | bench_circle_base = bench_base.copy('Circle', circle_dict) 410 | bench_circle_base.register('') -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/envs/world.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import xmltodict 5 | import numpy as np 6 | from copy import deepcopy 7 | from collections import OrderedDict 8 | from mujoco_py import const, load_model_from_path, load_model_from_xml, MjSim, MjViewer, MjRenderContextOffscreen 9 | 10 | import safety_gym 11 | import sys 12 | 13 | ''' 14 | Tools that allow the Safety Gym Engine to interface to MuJoCo. 15 | 16 | The World class owns the underlying mujoco scene and the XML, 17 | and is responsible for regenerating the simulator. 18 | 19 | The way to use this is to configure a World() based on your needs 20 | (number of objects, etc) and then call `world.reset()`. 21 | 22 | *NOTE:* The simulator should be accessed as `world.sim` and not just 23 | saved separately, because it may change between resets. 24 | 25 | Configuration is idiomatically done through Engine configuration, 26 | so any changes to this configuration should also be reflected in 27 | changes to the Engine. 28 | 29 | TODO: 30 | - unit test scaffold 31 | ''' 32 | 33 | # Default location to look for /xmls folder: 34 | BASE_DIR = os.path.dirname(safety_gym.__file__) 35 | 36 | 37 | def convert(v): 38 | ''' Convert a value into a string for mujoco XML ''' 39 | if isinstance(v, (int, float, str)): 40 | return str(v) 41 | # Numpy arrays and lists 42 | return ' '.join(str(i) for i in np.asarray(v)) 43 | 44 | 45 | def rot2quat(theta): 46 | ''' Get a quaternion rotated only about the Z axis ''' 47 | return np.array([np.cos(theta / 2), 0, 0, np.sin(theta / 2)], dtype='float64') 48 | 49 | 50 | class World: 51 | # Default configuration (this should not be nested since it gets copied) 52 | # *NOTE:* Changes to this configuration should also be reflected in `Engine` configuration 53 | DEFAULT = { 54 | 'robot_base': 'xmls/car.xml', # Which robot XML to use as the base 55 | 'robot_xy': np.zeros(2), # Robot XY location 56 | 'robot_rot': 0, # Robot rotation about Z axis 57 | 58 | 'floor_size': [3.5, 3.5, .1], # Used for displaying the floor 59 | 60 | # Objects -- this is processed and added by the Engine class 61 | 'objects': {}, # map from name -> object dict 62 | # Geoms -- similar to objects, but they are immovable and fixed in the scene. 63 | 'geoms': {}, # map from name -> geom dict 64 | # Mocaps -- mocap objects which are used to control other objects 65 | 'mocaps': {}, 66 | 67 | # Determine whether we create render contexts 68 | 'observe_vision': False, 69 | } 70 | 71 | def __init__(self, config={}, render_context=None): 72 | ''' config - JSON string or dict of configuration. 
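A minimal usage sketch (assuming a complete config dict; see the module docstring above):

    world = World(config)
    world.reset()        # builds the XML and (re)creates world.sim
    sim = world.sim      # re-fetch after every reset, since the MjSim object may change
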
See self.parse() ''' 73 | self.parse(config) # Parse configuration 74 | self.first_reset = True 75 | self.viewer = None 76 | self.render_context = render_context 77 | self.update_viewer_sim = False 78 | self.robot = Robot(self.robot_base) 79 | 80 | def parse(self, config): 81 | ''' Parse a config dict - see self.DEFAULT for description ''' 82 | self.config = deepcopy(self.DEFAULT) 83 | self.config.update(deepcopy(config)) 84 | for key, value in self.config.items(): 85 | assert key in self.DEFAULT, f'Bad key {key}' 86 | setattr(self, key, value) 87 | 88 | @property 89 | def data(self): 90 | ''' Helper to get the simulation data instance ''' 91 | return self.sim.data 92 | 93 | # TODO: remove this when mujoco-py fix is merged and a new version is pushed 94 | # https://github.com/openai/mujoco-py/pull/354 95 | # Then all uses of `self.world.get_sensor()` should change to `self.data.get_sensor`. 96 | def get_sensor(self, name): 97 | id = self.model.sensor_name2id(name) 98 | adr = self.model.sensor_adr[id] 99 | dim = self.model.sensor_dim[id] 100 | return self.data.sensordata[adr:adr + dim].copy() 101 | 102 | def build(self): 103 | ''' Build a world, including generating XML and moving objects ''' 104 | # Read in the base XML (contains robot, camera, floor, etc) 105 | self.robot_base_path = os.path.join(BASE_DIR, self.robot_base) 106 | with open(self.robot_base_path) as f: 107 | self.robot_base_xml = f.read() 108 | self.xml = xmltodict.parse(self.robot_base_xml) # Nested OrderedDict objects 109 | 110 | # Convenience accessor for xml dictionary 111 | worldbody = self.xml['mujoco']['worldbody'] 112 | 113 | # Move robot position to starting position 114 | worldbody['body']['@pos'] = convert(np.r_[self.robot_xy, self.robot.z_height]) 115 | worldbody['body']['@quat'] = convert(rot2quat(self.robot_rot)) 116 | 117 | # We need this because xmltodict skips over single-item lists in the tree 118 | worldbody['body'] = [worldbody['body']] 119 | if 'geom' in worldbody: 120 | worldbody['geom'] = [worldbody['geom']] 121 | else: 122 | worldbody['geom'] = [] 123 | 124 | # Add equality section if missing 125 | if 'equality' not in self.xml['mujoco']: 126 | self.xml['mujoco']['equality'] = OrderedDict() 127 | equality = self.xml['mujoco']['equality'] 128 | if 'weld' not in equality: 129 | equality['weld'] = [] 130 | 131 | # Add asset section if missing 132 | if 'asset' not in self.xml['mujoco']: 133 | # old default rgb1: ".4 .5 .6" 134 | # old default rgb2: "0 0 0" 135 | # light pink: "1 0.44 .81" 136 | # light blue: "0.004 0.804 .996" 137 | # light purple: ".676 .547 .996" 138 | # med blue: "0.527 0.582 0.906" 139 | # indigo: "0.293 0 0.508" 140 | asset = xmltodict.parse(''' 141 | 142 | 144 | 146 | 148 | 149 | ''') 150 | self.xml['mujoco']['asset'] = asset['asset'] 151 | 152 | 153 | # Add light to the XML dictionary 154 | light = xmltodict.parse(''' 155 | 157 | ''') 158 | worldbody['light'] = light['b']['light'] 159 | 160 | # Add floor to the XML dictionary if missing 161 | if not any(g.get('@name') == 'floor' for g in worldbody['geom']): 162 | floor = xmltodict.parse(''' 163 | 164 | ''') 165 | worldbody['geom'].append(floor['geom']) 166 | 167 | # Make sure floor renders the same for every world 168 | for g in worldbody['geom']: 169 | if g['@name'] == 'floor': 170 | g.update({'@size': convert(self.floor_size), '@rgba': '1 1 1 1', '@material': 'MatPlane'}) 171 | 172 | # Add cameras to the XML dictionary 173 | cameras = xmltodict.parse(''' 174 | 175 | 176 | ''') 177 | worldbody['camera'] = cameras['b']['camera'] 
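# The tracking camera added below sits at the robot-frame offset (0, -2, 2),
# mapped into world coordinates using robot_rot, and its xyaxes are rotated the
# same way so the view stays aligned with the robot's initial heading.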
178 | 179 | # Build and add a tracking camera (logic needed to ensure orientation correct) 180 | theta = self.robot_rot 181 | xyaxes = dict( 182 | x1=np.cos(theta), 183 | x2=-np.sin(theta), 184 | x3=0, 185 | y1=np.sin(theta), 186 | y2=np.cos(theta), 187 | y3=1 188 | ) 189 | pos = dict( 190 | xp=0*np.cos(theta) + (-2)*np.sin(theta), 191 | yp=0*(-np.sin(theta)) + (-2)*np.cos(theta), 192 | zp=2 193 | ) 194 | track_camera = xmltodict.parse(''' 195 | 196 | '''.format(**pos, **xyaxes)) 197 | worldbody['body'][0]['camera'] = [ 198 | worldbody['body'][0]['camera'], 199 | track_camera['b']['camera'] 200 | ] 201 | 202 | 203 | # Add objects to the XML dictionary 204 | for name, object in self.objects.items(): 205 | assert object['name'] == name, f'Inconsistent {name} {object}' 206 | object = object.copy() # don't modify original object 207 | object['quat'] = rot2quat(object['rot']) 208 | if name=='box': 209 | dim = object['size'][0] 210 | object['dim'] = dim 211 | object['width'] = dim/2 212 | object['x'] = dim 213 | object['y'] = dim 214 | body = xmltodict.parse(''' 215 | 216 | 217 | 219 | 221 | 223 | 225 | 227 | 228 | '''.format(**{k: convert(v) for k, v in object.items()})) 229 | else: 230 | body = xmltodict.parse(''' 231 | 232 | 233 | 235 | 236 | '''.format(**{k: convert(v) for k, v in object.items()})) 237 | # Append new body to world, making it a list optionally 238 | # Add the object to the world 239 | worldbody['body'].append(body['body']) 240 | # Add mocaps to the XML dictionary 241 | for name, mocap in self.mocaps.items(): 242 | # Mocap names are suffixed with 'mocap' 243 | assert mocap['name'] == name, f'Inconsistent {name} {object}' 244 | assert name.replace('mocap', 'obj') in self.objects, f'missing object for {name}' 245 | # Add the object to the world 246 | mocap = mocap.copy() # don't modify original object 247 | mocap['quat'] = rot2quat(mocap['rot']) 248 | body = xmltodict.parse(''' 249 | 250 | 252 | 253 | '''.format(**{k: convert(v) for k, v in mocap.items()})) 254 | worldbody['body'].append(body['body']) 255 | # Add weld to equality list 256 | mocap['body1'] = name 257 | mocap['body2'] = name.replace('mocap', 'obj') 258 | weld = xmltodict.parse(''' 259 | 260 | '''.format(**{k: convert(v) for k, v in mocap.items()})) 261 | equality['weld'].append(weld['weld']) 262 | # Add geoms to XML dictionary 263 | for name, geom in self.geoms.items(): 264 | assert geom['name'] == name, f'Inconsistent {name} {geom}' 265 | geom = geom.copy() # don't modify original object 266 | geom['quat'] = rot2quat(geom['rot']) 267 | geom['contype'] = geom.get('contype', 1) 268 | geom['conaffinity'] = geom.get('conaffinity', 1) 269 | body = xmltodict.parse(''' 270 | 271 | 273 | 274 | '''.format(**{k: convert(v) for k, v in geom.items()})) 275 | # Append new body to world, making it a list optionally 276 | # Add the object to the world 277 | worldbody['body'].append(body['body']) 278 | 279 | # Instantiate simulator 280 | # print(xmltodict.unparse(self.xml, pretty=True)) 281 | self.xml_string = xmltodict.unparse(self.xml) 282 | self.model = load_model_from_xml(self.xml_string) 283 | self.sim = MjSim(self.model) 284 | 285 | # Add render contexts to newly created sim 286 | if self.render_context is None and self.observe_vision: 287 | render_context = MjRenderContextOffscreen(self.sim, device_id=-1, quiet=True) 288 | render_context.vopt.geomgroup[:] = 1 289 | self.render_context = render_context 290 | 291 | if self.render_context is not None: 292 | self.render_context.update_sim(self.sim) 293 | 294 | # Recompute 
simulation intrinsics from new position 295 | self.sim.forward() 296 | 297 | def rebuild(self, config={}, state=True): 298 | ''' Build a new sim from a model if the model changed ''' 299 | if state: 300 | old_state = self.sim.get_state() 301 | #self.config.update(deepcopy(config)) 302 | #self.parse(self.config) 303 | self.parse(config) 304 | self.build() 305 | if state: 306 | self.sim.set_state(old_state) 307 | self.sim.forward() 308 | 309 | def reset(self, build=True): 310 | ''' Reset the world (sim is accessed through self.sim) ''' 311 | if build: 312 | self.build() 313 | # set flag so that renderer knows to update sim 314 | self.update_viewer_sim = True 315 | 316 | def render(self, mode='human'): 317 | ''' Render the environment to the screen ''' 318 | if self.viewer is None: 319 | self.viewer = MjViewer(self.sim) 320 | # Turn all the geom groups on 321 | self.viewer.vopt.geomgroup[:] = 1 322 | # Set camera if specified 323 | if mode == 'human': 324 | self.viewer.cam.fixedcamid = -1 325 | self.viewer.cam.type = const.CAMERA_FREE 326 | else: 327 | self.viewer.cam.fixedcamid = self.model.camera_name2id(mode) 328 | self.viewer.cam.type = const.CAMERA_FIXED 329 | if self.update_viewer_sim: 330 | self.viewer.update_sim(self.sim) 331 | self.update_viewer_sim = False 332 | self.viewer.render() 333 | 334 | def robot_com(self): 335 | ''' Get the position of the robot center of mass in the simulator world reference frame ''' 336 | return self.body_com('robot') 337 | 338 | def robot_pos(self): 339 | ''' Get the position of the robot in the simulator world reference frame ''' 340 | return self.body_pos('robot') 341 | 342 | def robot_mat(self): 343 | ''' Get the rotation matrix of the robot in the simulator world reference frame ''' 344 | return self.body_mat('robot') 345 | 346 | def robot_vel(self): 347 | ''' Get the velocity of the robot in the simulator world reference frame ''' 348 | return self.body_vel('robot') 349 | 350 | def body_com(self, name): 351 | ''' Get the center of mass of a named body in the simulator world reference frame ''' 352 | return self.data.subtree_com[self.model.body_name2id(name)].copy() 353 | 354 | def body_pos(self, name): 355 | ''' Get the position of a named body in the simulator world reference frame ''' 356 | return self.data.get_body_xpos(name).copy() 357 | 358 | def body_mat(self, name): 359 | ''' Get the rotation matrix of a named body in the simulator world reference frame ''' 360 | return self.data.get_body_xmat(name).copy() 361 | 362 | def body_vel(self, name): 363 | ''' Get the velocity of a named body in the simulator world reference frame ''' 364 | return self.data.get_body_xvelp(name).copy() 365 | 366 | 367 | 368 | class Robot: 369 | ''' Simple utility class for getting mujoco-specific info about a robot ''' 370 | def __init__(self, path): 371 | base_path = os.path.join(BASE_DIR, path) 372 | self.sim = MjSim(load_model_from_path(base_path)) 373 | self.sim.forward() 374 | 375 | # Needed to figure out z-height of free joint of offset body 376 | self.z_height = self.sim.data.get_body_xpos('robot')[2] 377 | # Get a list of geoms in the robot 378 | self.geom_names = [n for n in self.sim.model.geom_names if n != 'floor'] 379 | # Needed to figure out the observation spaces 380 | self.nq = self.sim.model.nq 381 | self.nv = self.sim.model.nv 382 | # Needed to figure out action space 383 | self.nu = self.sim.model.nu 384 | # Needed to figure out observation space 385 | # See engine.py for an explanation for why we treat these separately 386 | self.hinge_pos_names = 
[] 387 | self.hinge_vel_names = [] 388 | self.ballquat_names = [] 389 | self.ballangvel_names = [] 390 | self.sensor_dim = {} 391 | for name in self.sim.model.sensor_names: 392 | id = self.sim.model.sensor_name2id(name) 393 | self.sensor_dim[name] = self.sim.model.sensor_dim[id] 394 | sensor_type = self.sim.model.sensor_type[id] 395 | if self.sim.model.sensor_objtype[id] == const.OBJ_JOINT: 396 | joint_id = self.sim.model.sensor_objid[id] 397 | joint_type = self.sim.model.jnt_type[joint_id] 398 | if joint_type == const.JNT_HINGE: 399 | if sensor_type == const.SENS_JOINTPOS: 400 | self.hinge_pos_names.append(name) 401 | elif sensor_type == const.SENS_JOINTVEL: 402 | self.hinge_vel_names.append(name) 403 | else: 404 | t = self.sim.model.sensor_type[i] 405 | raise ValueError('Unrecognized sensor type {} for joint'.format(t)) 406 | elif joint_type == const.JNT_BALL: 407 | if sensor_type == const.SENS_BALLQUAT: 408 | self.ballquat_names.append(name) 409 | elif sensor_type == const.SENS_BALLANGVEL: 410 | self.ballangvel_names.append(name) 411 | elif joint_type == const.JNT_SLIDE: 412 | # Adding slide joints is trivially easy in code, 413 | # but this removes one of the good properties about our observations. 414 | # (That we are invariant to relative whole-world transforms) 415 | # If slide joints are added we sould ensure this stays true! 416 | raise ValueError('Slide joints in robots not currently supported') -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/random_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import gym 5 | import safety_gym # noqa 6 | import numpy as np # noqa 7 | 8 | def run_random(env_name): 9 | env = gym.make(env_name) 10 | obs = env.reset() 11 | done = False 12 | ep_ret = 0 13 | ep_cost = 0 14 | while True: 15 | if done: 16 | print('Episode Return: %.3f \t Episode Cost: %.3f'%(ep_ret, ep_cost)) 17 | ep_ret, ep_cost = 0, 0 18 | obs = env.reset() 19 | assert env.observation_space.contains(obs) 20 | act = env.action_space.sample() 21 | assert env.action_space.contains(act) 22 | obs, reward, done, info = env.step(act) 23 | # print('reward', reward) 24 | ep_ret += reward 25 | ep_cost += info.get('cost', 0) 26 | env.render() 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--env', default='Safexp-PointGoal1-v0') 33 | args = parser.parse_args() 34 | run_random(args.env) 35 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/obs_space_refs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZifanWu/CAL/e4087cc5fdd558b28a16dbcf54c0130952422d98/env/safety-gym/safety_gym/test/obs_space_refs.pkl -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import unittest 5 | import numpy as np 6 | import gym 7 | import gym.spaces 8 | 9 | from safety_gym.envs.engine import Engine 10 | 11 | 12 | class TestBench(unittest.TestCase): 13 | def test_goal(self): 14 | ''' Point should run into and get a goal ''' 15 | config = { 16 | 'robot_base': 'xmls/point.xml', 17 | 'goal_size': 0.5, 18 | 'goal_placements': [(0, -.5, 5, .5)], 19 | 'reward_goal': 1.0, 
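# reward_goal is the sparse bonus paid in Engine.step() when goal_met() fires;
# reward_distance below scales the dense distance-shaping term in Engine.reward().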
20 | 'reward_distance': 1.0, 21 | 'robot_locations': [(0, 0)], 22 | 'robot_rot': 0, 23 | '_seed': 0, 24 | } 25 | env = Engine(config) 26 | env.reset() 27 | goal_met = False 28 | for _ in range(999): 29 | act = np.zeros(env.action_space.shape) 30 | act[0] = 1 31 | _, reward, done, info = env.step(act) 32 | self.assertFalse(done) 33 | # If we have not yet got the goal 34 | if not goal_met: 35 | # Reward should be positive, since we're moving towards it. 36 | self.assertGreater(reward, 0) 37 | # Update if we got the goal 38 | if 'goal_met' in info: 39 | goal_met = info['goal_met'] 40 | # Assert we got 1 point for the goal 41 | self.assertGreater(reward, 1) 42 | # env.render() # Uncomment to visualize test 43 | self.assertTrue(goal_met) 44 | 45 | def test_hazards(self): 46 | ''' Point should run into and get a hazard ''' 47 | config = { 48 | 'robot_base': 'xmls/point.xml', 49 | 'goal_size': 0.5, 50 | 'goal_placements': [(5, -.5, 10, .5)], 51 | 'reward_goal': 1.0, 52 | 'reward_distance': 1.0, 53 | 'constrain_indicator': True, 54 | 'constrain_hazards': True, 55 | 'hazards_num': 1, 56 | 'hazards_size': 0.5, 57 | 'hazards_locations': [(2, 0)], 58 | 'hazards_cost': 1.0, 59 | 'robot_locations': [(0, 0)], 60 | 'robot_rot': 0, 61 | '_seed': 0, 62 | } 63 | env = Engine(config) 64 | env.reset() 65 | goal_met = False 66 | hazard_found = False 67 | for _ in range(999): 68 | act = np.zeros(env.action_space.shape) 69 | act[0] = 1 70 | _, reward, done, info = env.step(act) 71 | if not hazard_found: 72 | if info['cost']: 73 | hazard_found = True 74 | self.assertEqual(info['cost'], 1.0) # Sparse costs 75 | self.assertGreater(info['cost_hazards'], 0.0) # Nonzero hazard cost 76 | if 'goal_met' in info: 77 | goal_met = info['goal_met'] 78 | # env.render() # Uncomment to visualize test 79 | self.assertTrue(hazard_found) 80 | self.assertTrue(goal_met) 81 | 82 | def test_vases(self): 83 | ''' Point should run into and past a vase, pushing it out of the way ''' 84 | config = { 85 | 'robot_base': 'xmls/point.xml', 86 | 'goal_size': 0.5, 87 | 'goal_placements': [(5, -.5, 10, .5)], 88 | 'reward_goal': 1.0, 89 | 'reward_distance': 1.0, 90 | 'constrain_indicator': True, 91 | 'constrain_vases': True, 92 | 'vases_num': 1, 93 | 'vases_locations': [(2, 0)], 94 | 'vases_contact_cost': 1.0, 95 | 'vases_displace_cost': 1.0, 96 | 'vases_velocity_cost': 1.0, 97 | 'robot_locations': [(0, 0)], 98 | 'robot_rot': 0, 99 | '_seed': 0, 100 | } 101 | env = Engine(config) 102 | env.reset() 103 | goal_met = False 104 | vase_found = False 105 | for _ in range(999): 106 | act = np.zeros(env.action_space.shape) 107 | act[0] = 1 108 | _, reward, done, info = env.step(act) 109 | if not vase_found: 110 | if info['cost']: 111 | vase_found = True 112 | self.assertEqual(info['cost'], 1.0) # Sparse costs 113 | self.assertGreater(info['cost_vases_contact'], 0.0) # Nonzero vase cost 114 | self.assertGreater(info['cost_vases_velocity'], 0.0) # Nonzero vase cost 115 | else: 116 | # We've already found the vase (and hit it), ensure displace cost 117 | self.assertEqual(info['cost'], 1.0) # Sparse costs 118 | self.assertGreater(info['cost_vases_displace'], 0.0) # Nonzero vase cost 119 | if 'goal_met' in info: 120 | goal_met = info['goal_met'] 121 | # env.render() # Uncomment to visualize test 122 | self.assertTrue(vase_found) 123 | self.assertTrue(goal_met) 124 | 125 | def check_correct_lidar(self, env_name): 126 | ''' Check that a benchmark env has the right lidar obs for the objects in scene ''' 127 | env = gym.make(env_name) 128 | env.reset() 129 | 
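# env.unwrapped is the underlying Engine instance, so its configuration
# (task, constrain_*, *_num, obs_space_dict) is readable directly as attributes.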
physics = env.unwrapped 130 | world = physics.world 131 | obs_space_dict = physics.obs_space_dict 132 | task = physics.task 133 | lidar_count = sum('lidar' in o.lower() for o in obs_space_dict.keys()) 134 | # Goal based lidar 135 | if task == 'x': 136 | self.assertEqual(lidar_count, 0) 137 | elif task == 'circle': 138 | self.assertEqual(lidar_count, 1) 139 | self.assertIn('circle_lidar', obs_space_dict) 140 | elif task == 'goal': 141 | self.assertIn('goal_lidar', obs_space_dict) 142 | elif task == 'push': 143 | self.assertIn('goal_lidar', obs_space_dict) 144 | self.assertIn('box_lidar', obs_space_dict) 145 | elif task == 'button': 146 | self.assertIn('goal_lidar', obs_space_dict) 147 | self.assertIn('buttons_lidar', obs_space_dict) 148 | 149 | if physics.constrain_hazards or physics.hazards_num > 0: 150 | self.assertIn('hazards_lidar', obs_space_dict) 151 | self.assertGreater(physics.hazards_num, 0) 152 | if physics.constrain_vases or physics.vases_num > 0: 153 | self.assertIn('vases_lidar', obs_space_dict) 154 | self.assertGreater(physics.vases_num, 0) 155 | if physics.constrain_pillars or physics.pillars_num > 0: 156 | self.assertIn('pillars_lidar', obs_space_dict) 157 | self.assertGreater(physics.pillars_num, 0) 158 | if physics.constrain_buttons or physics.buttons_num > 0: 159 | self.assertIn('buttons_lidar', obs_space_dict) 160 | self.assertGreater(physics.buttons_num, 0) 161 | if physics.constrain_gremlins or physics.gremlins_num > 0: 162 | self.assertIn('gremlins_lidar', obs_space_dict) 163 | self.assertGreater(physics.gremlins_num, 0) 164 | 165 | def test_correct_lidar(self): 166 | ''' We should have lidar for every object in the env ''' 167 | matched = [] 168 | for env_spec in gym.envs.registry.all(): 169 | #if re.match(r'Safexp-.*-v0', env_spec.id) is not None: 170 | if 'Safexp' in env_spec.id and not('Vision' in env_spec.id): 171 | matched.append(env_spec.id) 172 | assert matched, 'Failed to match any environments!' 173 | for env_name in matched: 174 | print(env_name) 175 | self.check_correct_lidar(env_name) 176 | 177 | 178 | if __name__ == '__main__': 179 | unittest.main() 180 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_button.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from safety_gym.envs.engine import Engine, ResamplingError 7 | 8 | 9 | class TestButton(unittest.TestCase): 10 | def rollout_env(self, env, gets_goal=False): 11 | ''' 12 | Roll an environment out to the end, return final info dict. 13 | If gets_goal=True, then also assert that we got a goal successfully. 
14 | ''' 15 | got_goal = False 16 | done = False 17 | while not done: 18 | _, _, done, info = env.step([1, 0]) 19 | if 'goal_met' in info: 20 | got_goal = True 21 | if gets_goal: 22 | self.assertTrue(got_goal) 23 | return info 24 | 25 | def test_timer(self): 26 | ''' Buttons should wait a period before becoming active again ''' 27 | config = { 28 | 'robot_base': 'xmls/point.xml', 29 | 'num_steps': 100, 30 | 'task': 'button', 31 | 'buttons_num': 2, 32 | 'buttons_locations': [(0, 0), (1, 0)], 33 | 'buttons_resampling_delay': 1000, 34 | 'constrain_buttons': True, 35 | 'constrain_indicator': True, 36 | 'robot_locations': [(-1, 0)], 37 | 'robot_rot': 0, 38 | '_seed': 0, 39 | } 40 | # Correct button is pressed, nothing afterwards 41 | env = Engine(config) 42 | env.reset() 43 | info = self.rollout_env(env, gets_goal=True) 44 | self.assertEqual(info['cost_buttons'], 0.0) 45 | # Correct button is pressed, then times out and penalties 46 | config['buttons_resampling_delay'] = 10 47 | env = Engine(config) 48 | env.reset() 49 | info = self.rollout_env(env, gets_goal=True) 50 | self.assertEqual(info['cost_buttons'], 1.0) 51 | # Wrong button is pressed, gets penalty 52 | config['_seed'] = 1 53 | env = Engine(config) 54 | env.reset() 55 | info = self.rollout_env(env) 56 | self.assertEqual(info['cost_buttons'], 1.0) 57 | 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_determinism.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | import numpy as np 5 | import gym 6 | import safety_gym # noqa 7 | 8 | 9 | class TestDeterminism(unittest.TestCase): 10 | def check_qpos(self, env_name): 11 | ''' Check that a single environment is seed-stable at init ''' 12 | for seed in [0, 1, 123456789]: 13 | print('running', env_name, seed) 14 | env1 = gym.make(env_name) 15 | env1.seed(np.random.randint(123456789)) 16 | env1.reset() 17 | env1.seed(seed) 18 | env1.reset() 19 | env2 = gym.make(env_name) 20 | env2.seed(seed) 21 | env2.reset() 22 | np.testing.assert_almost_equal(env1.unwrapped.data.qpos, env2.unwrapped.data.qpos) 23 | 24 | def test_qpos(self): 25 | ''' Run all the bench envs ''' 26 | for env_spec in gym.envs.registry.all(): 27 | if 'Safexp' in env_spec.id: 28 | self.check_qpos(env_spec.id) 29 | 30 | def check_names(self, env_name): 31 | ''' Check that all the names in the mujoco model are the same for different envs ''' 32 | print('check names', env_name) 33 | env1 = gym.make(env_name) 34 | env1.seed(0) 35 | env1.reset() 36 | env2 = gym.make(env_name) 37 | env2.seed(1) 38 | env2.reset() 39 | model1 = env1.unwrapped.model 40 | model2 = env2.unwrapped.model 41 | shared_names = ['actuator_names', 'body_names', 'camera_names', 'geom_names', 42 | 'joint_names', 'light_names', 'mesh_names', 'sensor_names', 43 | 'site_names', 'tendon_names', 'userdata_names'] 44 | for n in shared_names: 45 | self.assertEqual(getattr(model1, n), getattr(model2, n)) 46 | 47 | def test_names(self): 48 | ''' Run all the bench envs ''' 49 | for env_spec in gym.envs.registry.all(): 50 | if 'Safexp' in env_spec.id: 51 | self.check_names(env_spec.id) 52 | 53 | 54 | if __name__ == '__main__': 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_engine.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | 3 | import unittest 4 | import numpy as np 5 | import gym.spaces 6 | 7 | from safety_gym.envs.engine import Engine 8 | 9 | 10 | class TestEngine(unittest.TestCase): 11 | def test_timeout(self): 12 | ''' Test that episode is over after num_steps ''' 13 | p = Engine({'num_steps': 10}) 14 | p.reset() 15 | for _ in range(10): 16 | self.assertFalse(p.done) 17 | p.step(np.zeros(p.action_space.shape)) 18 | self.assertTrue(p.done) 19 | with self.assertRaises(AssertionError): 20 | p.step(np.zeros(p.action_space.shape)) 21 | 22 | def test_flatten(self): 23 | ''' Test that physics can flatten observations ''' 24 | p = Engine({'observation_flatten': True}) 25 | obs = p.reset() 26 | self.assertIsInstance(p.observation_space, gym.spaces.Box) 27 | self.assertEqual(len(p.observation_space.shape), 1) 28 | self.assertTrue(p.observation_space.contains(obs)) 29 | 30 | p = Engine({'observation_flatten': False}) 31 | obs = p.reset() 32 | self.assertIsInstance(p.observation_space, gym.spaces.Dict) 33 | self.assertTrue(p.observation_space.contains(obs)) 34 | 35 | def test_angle_components(self): 36 | ''' Test that the angle components are about correct ''' 37 | p = Engine({'robot_base': 'xmls/doggo.xml', 38 | 'observation_flatten': False, 39 | 'sensors_angle_components': True, 40 | 'robot_rot': .3}) 41 | p.reset() 42 | p.step(p.action_space.high) 43 | p.step(p.action_space.high) 44 | p.step(p.action_space.low) 45 | theta = p.data.get_joint_qpos('hip_1_z') 46 | dtheta = p.data.get_joint_qvel('hip_1_z') 47 | print('theta', theta) 48 | print('dtheta', dtheta) 49 | print('sensordata', p.data.sensordata) 50 | obs = p.obs() 51 | print('obs', obs) 52 | x, y = obs['jointpos_hip_1_z'] 53 | dz = obs['jointvel_hip_1_z'] 54 | # x, y components should be unit vector 55 | self.assertAlmostEqual(np.sqrt(np.sum(np.square([x, y]))), 1.0) 56 | # x, y components should be sin/cos theta 57 | self.assertAlmostEqual(np.sin(theta), x) 58 | self.assertAlmostEqual(np.cos(theta), y) 59 | # dz should be the same as dtheta 60 | self.assertAlmostEqual(dz, dtheta) 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_envs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | import gym 5 | import safety_gym.envs # noqa 6 | 7 | 8 | class TestEnvs(unittest.TestCase): 9 | def check_env(self, env_name): 10 | ''' Run a single environment for a single episode ''' 11 | print('running', env_name) 12 | env = gym.make(env_name) 13 | env.reset() 14 | done = False 15 | while not done: 16 | _, _, done, _ = env.step(env.action_space.sample()) 17 | 18 | def test_envs(self): 19 | ''' Run all the bench envs ''' 20 | for env_spec in gym.envs.registry.all(): 21 | if 'Safexp' in env_spec.id: 22 | self.check_env(env_spec.id) 23 | 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_goal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from safety_gym.envs.engine import Engine, ResamplingError 7 | 8 | 9 | class TestGoal(unittest.TestCase): 10 | def rollout_env(self, env): 11 | ''' roll an environment until it is done ''' 12 | done = False 13 | while not done: 14 | _, _, done, _ = 
env.step([1,0]) 15 | 16 | def test_resample(self): 17 | ''' Episode should end with resampling failure ''' 18 | config = { 19 | 'robot_base': 'xmls/point.xml', 20 | 'num_steps': 1001, 21 | 'placements_extents': [-1, -1, 1, 1], 22 | 'goal_size': 1.414, 23 | 'goal_keepout': 1.414, 24 | 'goal_locations': [(1, 1)], 25 | 'robot_keepout': 1.414, 26 | 'robot_locations': [(-1, -1)], 27 | 'robot_rot': np.sin(np.pi / 4), 28 | 'terminate_resample_failure': True, 29 | '_seed': 0, 30 | } 31 | env = Engine(config) 32 | env.reset() 33 | self.assertEqual(env.steps, 0) 34 | # Move the robot towards the goal 35 | self.rollout_env(env) 36 | # Check that the environment terminated early 37 | self.assertLess(env.steps, 1000) 38 | 39 | # Try again with the raise 40 | config['terminate_resample_failure'] = False 41 | env = Engine(config) 42 | env.reset() 43 | # Move the robot towards the goal, which should cause resampling failure 44 | with self.assertRaises(ResamplingError): 45 | self.rollout_env(env) 46 | 47 | 48 | if __name__ == '__main__': 49 | unittest.main() 50 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/test/test_obs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | import numpy as np 5 | import joblib 6 | import os 7 | import os.path as osp 8 | import gym 9 | import safety_gym 10 | from safety_gym.envs.engine import Engine 11 | 12 | 13 | class TestObs(unittest.TestCase): 14 | def test_rotate(self): 15 | ''' Point should observe compass/lidar differently for different rotations ''' 16 | config = { 17 | 'robot_base': 'xmls/point.xml', 18 | 'observation_flatten': False, 19 | 'observe_sensors': False, 20 | 'observe_remaining': False, 21 | 'observe_goal_lidar': True, 22 | 'observe_goal_comp': True, 23 | 'goal_size': 3, 24 | 'goal_locations': [(5, 0)], 25 | 'robot_locations': [(1, 1)], 26 | '_seed': 0, 27 | } 28 | for s in (2, 3): 29 | config['compass_shape'] = s 30 | config['robot_rot'] = 5.3 31 | env = Engine(config) 32 | obs0 = env.reset() 33 | # for _ in range(1000): env.render() 34 | # print('obs0', obs0) 35 | config['robot_rot'] = np.pi / 4 36 | env = Engine(config) 37 | obs1 = env.reset() 38 | # for _ in range(1000): env.render() 39 | # print('obs1', obs1) 40 | self.assertTrue((obs0['goal_lidar'] != obs1['goal_lidar']).any()) 41 | self.assertTrue((obs0['goal_compass'] != obs1['goal_compass']).any()) 42 | 43 | def test_spaces(self): 44 | ''' Observation spaces should not unintentionally change from known good reference ''' 45 | BASE_DIR = os.path.dirname(safety_gym.__file__) 46 | fpath = osp.join(BASE_DIR, 'test', 'obs_space_refs.pkl') 47 | obs_space_refs = joblib.load(fpath) 48 | for env_spec in gym.envs.registry.all(): 49 | if 'Safexp' in env_spec.id and env_spec.id in obs_space_refs: 50 | print('Checking obs space for... 
', env_spec.id) 51 | env = gym.make(env_spec.id) 52 | ref_obs_space_dict = obs_space_refs[env_spec.id] 53 | obs_spaces_are_same = env.obs_space_dict==ref_obs_space_dict 54 | if not(obs_spaces_are_same): 55 | print('\n', env_spec.id, '\n') 56 | print('Current Observation Space:\n', env.obs_space_dict, '\n\n') 57 | print('Reference Observation Space:\n', ref_obs_space_dict, '\n\n') 58 | self.assertTrue(obs_spaces_are_same) 59 | 60 | 61 | if __name__ == '__main__': 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/xmls/README.md: -------------------------------------------------------------------------------- 1 | # xmls 2 | 3 | These are mujoco XML files which are used as bases for the simulations. 4 | 5 | Some design goals for them: 6 | 7 | - XML should be complete and simulate-able as-is 8 | - Include a floor geom which is a plane 9 | - Include joint sensor for the robot which provide observation 10 | - Include actuators which provide control 11 | - Default positions should all be neutral 12 | - position 0,0,0 should be resting on the floor, not intersecting it 13 | - robot should start at the origin 14 | - Scene should be clear of other objects 15 | - no obstacles or things to manipulate 16 | - only the robot in the scene 17 | 18 | Requirements for the robot 19 | - Position joints should be separate and named `x`, `y`, and `z` 20 | - 0, 0, 0 position should be resting on the floor above the origin at a neutral position 21 | - First 6 sensors should be (in order): 22 | - joint positions for x, y, z (absolute position in the scene) 23 | - joint velocities for x, y, z (absolute velocity in the scene) 24 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/xmls/car.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 58 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/xmls/doggo.xml: -------------------------------------------------------------------------------- 1 | 2 | 178 | -------------------------------------------------------------------------------- /env/safety-gym/safety_gym/xmls/point.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 41 | -------------------------------------------------------------------------------- /env/safety-gym/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | import sys 5 | 6 | assert sys.version_info.major == 3 and sys.version_info.minor >= 6, \ 7 | "Safety Gym is designed to work with Python 3.6 and greater. " \ 8 | + "Please install it before proceeding." 
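# Note: mujoco_py is pinned to 2.0.2.7 below, matching the MuJoCo 2.0 (mujoco200)
# binaries installed by setup_mujoco.sh in this directory.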
9 | 10 | setup( 11 | name='safety_gym', 12 | packages=['safety_gym'], 13 | install_requires=[ 14 | 'gym', 15 | 'numpy', 16 | 'mujoco_py==2.0.2.7', 17 | 'xmltodict', 18 | ], 19 | ) 20 | -------------------------------------------------------------------------------- /env/safety-gym/setup_dependency.sh: -------------------------------------------------------------------------------- 1 | # depends on the system and needs 2 | sudo curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf 3 | sudo chmod +x /usr/local/bin/patchelf 4 | sudo apt install libosmesa6-dev libgl1-mesa-glx libglfw3 5 | sudo apt install libglew-dev 6 | -------------------------------------------------------------------------------- /env/safety-gym/setup_mujoco.sh: -------------------------------------------------------------------------------- 1 | # install mujoco 2 | wget https://www.roboti.us/download/mujoco200_linux.zip 3 | unzip mujoco200_linux.zip 4 | rm -r mujoco200_linux.zip 5 | mkdir ~/.mujoco 6 | mv mujoco200_linux ~/.mujoco/mujoco200 7 | 8 | cd ~/.mujoco 9 | wget https://www.roboti.us/file/mjkey.txt 10 | 11 | 12 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:~/.mujoco/mujoco200/bin 13 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so 14 | echo "export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:~/.mujoco/mujoco200/bin" >> ~/.bashrc 15 | echo "export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so" >> ~/.bashrc 16 | -------------------------------------------------------------------------------- /img/cal_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZifanWu/CAL/e4087cc5fdd558b28a16dbcf54c0130952422d98/img/cal_fig1.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | import gym 3 | import torch 4 | import numpy as np 5 | import os 6 | import wandb 7 | import socket 8 | from pathlib import Path 9 | import setproctitle 10 | 11 | from agent.replay_memory import ReplayMemory 12 | from agent.cal import CALAgent 13 | from sampler.mujoco_env_sampler import MuJoCoEnvSampler 14 | from sampler.safetygym_env_sampler import SafetygymEnvSampler 15 | 16 | 17 | def train(args, env_sampler, agent, pool): 18 | total_step = 0 19 | exploration_before_start(args, env_sampler, pool, agent) 20 | epoch = 0 21 | 22 | for _ in range(args.num_epoch): 23 | sta = time.time() 24 | epo_len = args.epoch_length 25 | train_policy_steps = 0 26 | for i in range(epo_len): 27 | cur_state, action, next_state, reward, done, info = env_sampler.sample(agent, i) 28 | pool.push(cur_state, action, reward, next_state, done) 29 | 30 | # train the policy 31 | if len(pool) > args.min_pool_size: 32 | train_policy_steps += train_policy_repeats(args, total_step, train_policy_steps, pool, agent) 33 | total_step += 1 34 | 35 | def evaluate(num_eval_epo=1): 36 | env_sampler.current_state = None 37 | avg_return, avg_cost_return = 0, 0 38 | eval_step = 0 39 | for _ in range(num_eval_epo): 40 | sum_reward, sum_cost = 0, 0 41 | eval_step = 0 42 | done = False 43 | while not done and eval_step < epo_len: 44 | _, _, _, reward, done, _ = env_sampler.sample(agent, eval_step, eval_t=True) 45 | sum_reward += reward[0] 46 | sum_cost += reward[1] if args.safetygym else args.gamma**eval_step * reward[1] 47 | eval_step += 1 48 | avg_return += sum_reward 49 | avg_cost_return += sum_cost 50 | 
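# Average over the evaluation episodes. Note that the cost-return accumulated
# above is discounted by gamma**step on MuJoCo tasks and undiscounted on Safety-Gym.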
avg_return /= num_eval_epo 51 | avg_cost_return /= num_eval_epo 52 | return avg_return, avg_cost_return 53 | 54 | if total_step % epo_len == 0 or total_step == 1: 55 | test_reward, test_cost = evaluate(args.num_eval_epochs) 56 | print('env: {}, exp: {}, step: {}, test_return: {}, test_cost: {}, budget: {}, seed: {}, cuda_num: {}, time: {}s'.format(args.env_name, args.experiment_name, total_step, np.around(test_reward, 2), np.around(test_cost, 2), args.cost_lim, args.seed, args.cuda_num, int(time.time() - sta))) 57 | if args.use_wandb: 58 | wandb.log({"test_return": test_reward, 'total_step': total_step}) 59 | wandb.log({"test_cost": test_cost, 'total_step': total_step}) 60 | epoch += 1 61 | 62 | # save network parameters after training 63 | if args.save_parameters: 64 | agent.save_model() 65 | 66 | 67 | def exploration_before_start(args, env_sampler, pool, agent): 68 | for i in range(args.init_exploration_steps): 69 | cur_state, action, next_state, reward, done, info = env_sampler.sample(agent, i) 70 | pool.push(cur_state, action, reward, next_state, done) 71 | 72 | 73 | def train_policy_repeats(args, total_step, train_step, pool, agent): 74 | if total_step % args.train_every_n_steps > 0: 75 | return 0 76 | 77 | if train_step > args.max_train_repeat_per_step * total_step: 78 | return 0 79 | 80 | for i in range(args.num_train_repeat): 81 | batch_state, batch_action, batch_reward, batch_next_state, batch_done = pool.sample(args.policy_train_batch_size) 82 | batch_reward, batch_done = np.squeeze(batch_reward), np.squeeze(batch_done) 83 | batch_done = (~batch_done).astype(int) 84 | agent.update_parameters((batch_state, batch_action, batch_reward, batch_next_state, batch_done), i) 85 | return args.num_train_repeat 86 | 87 | 88 | def main(args): 89 | torch.set_num_threads(args.n_training_threads) 90 | run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[ 91 | 0] + "/results") / args.env_name / args.experiment_name 92 | 93 | env = gym.make(args.env_name) 94 | # Set random seed 95 | torch.manual_seed(args.seed) 96 | np.random.seed(args.seed) 97 | env.seed(args.seed) 98 | 99 | s_dim = env.observation_space.shape[0] 100 | 101 | if args.env_name == 'Ant-v3': 102 | s_dim = int(27) 103 | elif args.env_name == 'Humanoid-v3': 104 | s_dim = int(45) 105 | 106 | if not run_dir.exists(): 107 | os.makedirs(str(run_dir)) 108 | 109 | if args.use_wandb: 110 | run = wandb.init(config=args, 111 | project='SafeRL', 112 | entity=args.user_name, 113 | notes=socket.gethostname(), 114 | name=args.experiment_name + '_' + str(args.cuda_num) + '_' + str(args.seed), 115 | group=args.env_name, 116 | dir=str(run_dir), 117 | job_type="training", 118 | reinit=True) 119 | 120 | setproctitle.setproctitle(str(args.env_name) + "-" + str(args.seed)) 121 | 122 | # Initialize the agent 123 | agent = CALAgent(s_dim, env.action_space, args) 124 | 125 | # Initialize the replay pool 126 | pool = ReplayMemory(args.replay_size) 127 | 128 | # Environment sampler 129 | if args.safetygym: 130 | env_sampler = SafetygymEnvSampler(args, env) 131 | else: 132 | env_sampler = MuJoCoEnvSampler(args, env) 133 | 134 | train(args, env_sampler, agent, pool) 135 | 136 | if args.use_wandb: 137 | run.finish() 138 | 139 | 140 | if __name__ == '__main__': 141 | from arguments import readParser 142 | from env.constraints import get_threshold 143 | import safety_gym 144 | args = readParser() 145 | if 'Safe' in args.env_name: # safetygym 146 | args.constraint_type = 'safetygym' 147 | args.safetygym = True 148 | args.epoch_length = 400 149 | 
args.cost_lim = get_threshold(args.env_name, constraint=args.constraint_type) 150 | os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_num 151 | args.seed = torch.randint(0, 10000, (1,)).item() 152 | main(args) 153 | -------------------------------------------------------------------------------- /sampler/mujoco_env_sampler.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import wandb 4 | 5 | class MuJoCoEnvSampler(): 6 | def __init__(self, args, env, max_path_length=1000): 7 | self.env = env 8 | self.args = args 9 | 10 | self.path_length = 0 11 | self.total_path_length = 0 12 | self.current_state = None 13 | self.max_path_length = max_path_length 14 | self.sum_reward = 0 15 | 16 | self.cur_s = None 17 | 18 | def sample(self, agent, i, eval_t=False): 19 | self.total_path_length += 1 20 | if self.current_state is None: 21 | self.current_state = self.env.reset() 22 | if self.args.env_name == 'Ant-v3': 23 | self.current_state = self.current_state[:27] 24 | elif self.args.env_name == 'Humanoid-v3': 25 | self.current_state = self.current_state[:45] 26 | self.cur_s = self.current_state.copy() 27 | 28 | cur_state = self.current_state 29 | action = agent.select_action(cur_state, eval_t) 30 | next_state, reward, terminal, info = self.env.step(action) 31 | 32 | if 'y_velocity' in info: 33 | cost = np.sqrt(info["y_velocity"] ** 2 + info["x_velocity"] ** 2) 34 | else: 35 | cost = np.abs(info["x_velocity"]) 36 | 37 | if self.args.env_name == 'Ant-v3': 38 | next_state = next_state[:27] 39 | elif self.args.env_name == 'Humanoid-v3': 40 | next_state = next_state[:45] 41 | self.path_length += 1 42 | 43 | reward = np.array([reward, cost]) 44 | self.sum_reward += reward 45 | 46 | if terminal or self.path_length >= self.max_path_length: # NOTE 47 | self.current_state = None 48 | self.path_length = 0 49 | self.sum_reward = 0 50 | else: 51 | self.current_state = next_state 52 | self.cur_s = next_state 53 | return cur_state, action, next_state, reward, terminal, info 54 | 55 | 56 | def get_ter_action(self, agent): 57 | action = agent.select_action(self.cur_s, eval=False) 58 | return action 59 | -------------------------------------------------------------------------------- /sampler/safetygym_env_sampler.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import wandb 4 | 5 | class SafetygymEnvSampler(): 6 | def __init__(self, args, env, max_path_length=400): 7 | self.env = env 8 | self.args = args 9 | 10 | self.path_length = 0 11 | self.total_path_length = 0 12 | self.current_state = None 13 | self.max_path_length = max_path_length 14 | 15 | def sample(self, agent, i, eval_t=False): 16 | self.total_path_length += 1 17 | if i % self.args.epoch_length == 0: 18 | self.current_state = self.env.reset() 19 | cur_state = self.current_state 20 | action = agent.select_action(cur_state, eval_t) 21 | next_state, reward, done, info = self.env.step(action) 22 | 23 | if not eval_t: 24 | done = False if i == self.args.epoch_length - 1 or "TimeLimit.truncated" in info else done 25 | done = True if "goal_met" in info and info["goal_met"] else done 26 | 27 | cost = info['cost'] 28 | self.path_length += 1 29 | reward = np.array([reward, cost]) 30 | self.current_state = next_state 31 | return cur_state, action, next_state, reward, done, info 32 | 33 | def get_ter_action(self, agent): 34 | action = agent.select_action(self.cur_s_for_RLmodel, eval=False) 35 | return action 36 | 
--------------------------------------------------------------------------------
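
For orientation, the snippet below is a minimal, hypothetical sketch (it is not a file in the repository) showing how the pieces above fit together outside of `main.py`: it mirrors the `__main__` setup and the nested `evaluate()` helper in `main.py`, and rolls out a single episode with `SafetygymEnvSampler`, whose `sample()` packs the task reward and the constraint cost into `np.array([reward, cost])`. The specific task name and the assumption that the defaults from `readParser()` are sufficient to construct a `CALAgent` (untrained here) are illustrative assumptions, not guarantees from the codebase.

```python
# Hypothetical usage sketch (not part of the repo): one evaluation-style rollout
# on a Safety-Gym task, mirroring main.py's __main__ block and evaluate() helper.
import gym
import numpy as np
import safety_gym  # noqa: F401 -- registers the Safexp-* environments

from arguments import readParser
from env.constraints import get_threshold
from agent.cal import CALAgent
from sampler.safetygym_env_sampler import SafetygymEnvSampler

# Assumption: readParser() defaults plus the Safety-Gym overrides below are enough
# to build the agent; in practice you would load trained parameters first.
args = readParser()
args.env_name = 'Safexp-PointButton1-v0'
args.constraint_type = 'safetygym'
args.safetygym = True
args.epoch_length = 400
args.cost_lim = get_threshold(args.env_name, constraint=args.constraint_type)

env = gym.make(args.env_name)
env.seed(args.seed)
agent = CALAgent(env.observation_space.shape[0], env.action_space, args)  # untrained here

env_sampler = SafetygymEnvSampler(args, env)
sum_reward, sum_cost, step, done = 0.0, 0.0, 0, False
while not done and step < args.epoch_length:
    # sample() returns the reward packed as np.array([task_reward, cost])
    _, _, _, reward, done, _ = env_sampler.sample(agent, step, eval_t=True)
    sum_reward += reward[0]
    sum_cost += reward[1]
    step += 1
print('return:', np.around(sum_reward, 2), 'cost:', np.around(sum_cost, 2))
```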