├── figures ├── qp.png ├── diff_inc.png ├── policy_diagram.pdf ├── policy_diagram.png ├── cars_env_reward.png ├── framework_diagram.png ├── unicycle_env_reward.png └── unicycle_snapshot.png ├── .gitignore ├── envs ├── utils.py ├── simulated_cars_env.py ├── pyglet_rendering.py └── unicycle_env.py ├── rcbf_sac ├── replay_memory.py ├── evaluator.py ├── generate_rollouts.py ├── compensator.py ├── utils.py ├── model.py ├── gp_model.py ├── sac_cbf.py ├── cbf_qp.py ├── dynamics.py └── diff_cbf_qp.py ├── README.md └── main.py /figures/qp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/qp.png -------------------------------------------------------------------------------- /figures/diff_inc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/diff_inc.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | */runs/* 3 | */output/* 4 | *__pycache__/ 5 | */.DS_Store 6 | output/* 7 | -------------------------------------------------------------------------------- /envs/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def to_pixel(meas_cm, shift=0): 3 | return 1.5 * 37.795 * meas_cm + shift -------------------------------------------------------------------------------- /figures/policy_diagram.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/policy_diagram.pdf -------------------------------------------------------------------------------- /figures/policy_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/policy_diagram.png -------------------------------------------------------------------------------- /figures/cars_env_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/cars_env_reward.png -------------------------------------------------------------------------------- /figures/framework_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/framework_diagram.png -------------------------------------------------------------------------------- /figures/unicycle_env_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/unicycle_env_reward.png -------------------------------------------------------------------------------- /figures/unicycle_snapshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/unicycle_snapshot.png -------------------------------------------------------------------------------- /rcbf_sac/replay_memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | class ReplayMemory: 5 | 6 | def __init__(self, capacity, seed): 7 | random.seed(seed) 8 | self.capacity = capacity 9 | self.buffer = [] 10 | self.position = 0 11 | 12 | def 
push(self, state, action, reward, next_state, mask, t=None, next_t=None): 13 | 14 | if len(self.buffer) < self.capacity: 15 | self.buffer.append(None) 16 | 17 | self.buffer[self.position] = (state, action, reward, next_state, mask, t, next_t) 18 | self.position = (self.position + 1) % self.capacity 19 | 20 | def batch_push(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch, t_batch=None, next_t_batch=None): 21 | 22 | for i in range(state_batch.shape[0]): # TODO: Optimize This 23 | if t_batch is not None and next_t_batch is not None: 24 | self.push(state_batch[i], action_batch[i], reward_batch[i], next_state_batch[i], mask_batch[i], t_batch[i], next_t_batch[i]) # Append transition to memory 25 | else: 26 | self.push(state_batch[i], action_batch[i], reward_batch[i], next_state_batch[i], mask_batch[i]) # Append transition to memory 27 | 28 | def sample(self, batch_size): 29 | 30 | batch = random.sample(self.buffer, batch_size) 31 | state, action, reward, next_state, mask, t, next_t = map(np.stack, zip(*batch)) 32 | return state, action, reward, next_state, mask, t, next_t 33 | 34 | def __len__(self): 35 | return len(self.buffer) 36 | -------------------------------------------------------------------------------- /rcbf_sac/evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.io import savemat 4 | from rcbf_sac.utils import * 5 | 6 | 7 | class Evaluator(object): 8 | 9 | def __init__(self, num_episodes, interval, save_path=''): 10 | self.num_episodes = num_episodes 11 | self.interval = interval 12 | self.save_path = save_path 13 | self.results = np.array([]).reshape(num_episodes, 0) 14 | 15 | def __call__(self, env, policy, cbf_wrapper=None, dynamics_model=None, debug=False, visualize=False, save=True): 16 | 17 | self.is_training = False 18 | result = [] 19 | 20 | for episode in range(self.num_episodes): 21 | 22 | # reset at the start of episode 23 | observation = env.reset() 24 | 25 | # Make sure to start from a safe state 26 | if cbf_wrapper and dynamics_model: 27 | out = None 28 | while out is None or cbf_wrapper.get_min_h_val(out) < 1e-4: 29 | observation = env.reset() 30 | state = dynamics_model.get_state(observation) 31 | out = dynamics_model.get_output(state) 32 | 33 | episode_steps = 0 34 | episode_reward = 0. 35 | 36 | assert observation is not None 37 | 38 | # start episode 39 | done = False 40 | while not done: 41 | # basic operation, action ,reward, blablabla ... 
42 | action = policy(observation) 43 | 44 | observation, reward, done, info = env.step(action) 45 | 46 | if visualize: 47 | env.render(mode='human') 48 | 49 | # update 50 | episode_reward += reward 51 | episode_steps += 1 52 | 53 | if debug: prYellow('[Evaluate] #Episode{}: episode_reward:{}'.format(episode, episode_reward)) 54 | result.append(episode_reward) 55 | 56 | result = np.array(result).reshape(-1, 1) 57 | self.results = np.hstack([self.results, result]) 58 | 59 | if save: 60 | self.save_results('{}/validate_reward'.format(self.save_path)) 61 | return np.mean(result) 62 | 63 | def save_results(self, fn): 64 | 65 | y = np.mean(self.results, axis=0) 66 | error = np.std(self.results, axis=0) 67 | 68 | x = range(0, self.results.shape[1] * self.interval, self.interval) 69 | fig, ax = plt.subplots(1, 1, figsize=(6, 5)) 70 | plt.xlabel('Timestep') 71 | plt.ylabel('Average Reward') 72 | ax.errorbar(x, y, yerr=error, fmt='-o') 73 | plt.savefig(fn + '.png') 74 | savemat(fn + '.mat', {'reward': self.results}) 75 | plt.close() -------------------------------------------------------------------------------- /rcbf_sac/generate_rollouts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | from rcbf_sac.utils import euler_to_mat_2d, prCyan, prRed 4 | 5 | 6 | def generate_model_rollouts(env, memory_model, memory, agent, dynamics_model, k_horizon=1, batch_size=20, warmup=False): 7 | 8 | def policy(observation): 9 | 10 | if warmup and env.action_space: 11 | action = agent.select_action(observation, dynamics_model, warmup=True) # Sample action from policy 12 | else: 13 | action = agent.select_action(observation, dynamics_model, evaluate=False) # Sample action from policy 14 | 15 | return action 16 | 17 | # Sample a batch from memory 18 | obs_batch, action_batch, reward_batch, next_obs_batch, mask_batch, t_batch, next_t_batch = memory.sample(batch_size=batch_size) 19 | 20 | obs_batch_ = deepcopy(obs_batch) 21 | done_batch_ = [False for _ in range(batch_size)] 22 | t_batch_ = deepcopy(t_batch) 23 | 24 | for k in range(k_horizon): 25 | 26 | batch_size_ = obs_batch_.shape[0] # that's because we remove steps where done = True so batch_size shrinks 27 | 28 | action_batch_ = policy(obs_batch_) 29 | state_batch_ = dynamics_model.get_state(obs_batch_) 30 | next_state_mu_, next_state_std_, next_t_batch_ = dynamics_model.predict_next_state(state_batch_, action_batch_, t_batch=t_batch) 31 | next_state_batch_ = np.random.normal(next_state_mu_, next_state_std_) 32 | next_obs_batch_ = dynamics_model.get_obs(next_state_batch_) 33 | 34 | if env.dynamics_mode == 'Unicycle': 35 | 36 | # Construct Next Observation from State 37 | dist2goal_prev = -np.log(obs_batch_[:, -1]) 38 | goal_rel = env.unwrapped.goal_pos[:2] - next_obs_batch_[:, :2] 39 | dist2goal = np.linalg.norm(goal_rel, axis=1) 40 | assert dist2goal.shape == (batch_size_,), 'dist2goal should be a vector of size (batch_size,), got {} instead'.format(dist2goal.shape) 41 | # generate compass 42 | compass = np.matmul(np.expand_dims(goal_rel, 1), euler_to_mat_2d(next_state_batch_[:, 2])).squeeze(1) 43 | compass /= np.sqrt(np.sum(np.square(compass), axis=1, keepdims=True)) + 0.001 44 | next_obs_batch_ = np.hstack((next_obs_batch_, compass, np.expand_dims(np.exp(-dist2goal), axis=-1))) 45 | 46 | # Compute Reward 47 | goal_size = 0.3 48 | reward_goal = 1.0 49 | reward_distance = 1.0 50 | reward_batch_ = (dist2goal_prev - dist2goal) * reward_distance + (dist2goal <= goal_size) * 
reward_goal 51 | # Compute Done 52 | reached_goal = dist2goal <= goal_size 53 | reward_batch_ += reward_goal * reached_goal 54 | done_batch_ = reached_goal 55 | mask_batch_ = np.invert(done_batch_) 56 | 57 | elif env.dynamics_mode == 'SimulatedCars': 58 | 59 | # Compute Reward 60 | # car_4_vel = next_state_batch_[:, 7] # car's 4 velocity 61 | # reward_batch_ = -np.abs(car_4_vel) * np.abs(action_batch_.squeeze()) * (action_batch_.squeeze() > 0) / env.max_episode_steps 62 | reward_batch_ = -5.0 * np.abs(action_batch_.squeeze() ** 2) / env.max_episode_steps 63 | 64 | # Compute Done 65 | done_batch_ = next_t_batch_ >= env.max_episode_steps * env.dt # done? 66 | mask_batch_ = np.invert(done_batch_) 67 | 68 | else: 69 | raise Exception('Environment/Dynamics mode {} not Recognized!'.format(env.dynamics_mode)) 70 | 71 | memory_model.batch_push(obs_batch_, action_batch_, reward_batch_, next_obs_batch_, mask_batch_, t_batch_, next_t_batch_) # Append transition to memory 72 | t_batch_ = deepcopy(next_t_batch_) 73 | 74 | # Update Current Observation Batch 75 | obs_batch_ = deepcopy(next_obs_batch_) 76 | 77 | # Delete Done Trajectories 78 | if np.sum(done_batch_) > 0: 79 | obs_batch_ = np.delete(obs_batch_, done_batch_ > 0, axis=0) 80 | 81 | return memory_model 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SAC-RCBF (DEPRECATED, repo from updated paper [here](https://github.com/yemam3/Mod-RL-RCBF)) 2 | 3 | Repository containing the code for the paper "Safe Model-Based Reinforcement Learning using Robust Control Barrier Functions". Specifically, an implementation of SAC + Robust Control Barrier Functions (RCBFs) for safe reinforcement learning in two custom environments. 4 | 5 | While exploring, an RL agent can take actions that lead the system to unsafe states. Here, we use a differentiable RCBF safety layer that minimally alters (in the least-squares sense) the actions taken by the RL agent to ensure the safety of the agent. 6 | 7 | 8 | 9 | 

![framework diagram](figures/framework_diagram.png) 10 | 11 | 

12 | 13 | ## Robust Control Barrier Functions (RCBFs) 14 | 15 | In this work, we focus on RCBFs that are formulated with respect to differential inclusions of the following form: 16 | 17 |

![differential inclusion dynamics](figures/diff_inc.png) 18 | 19 | 
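In text form (matching the `x_dot = f(x) + g(x)u + D(x)` convention used in the docstrings of `rcbf_sac/sac_cbf.py`), the dynamics are the control-affine differential inclusion:

$$
\dot{x} \;\in\; f(x) + g(x)\,u + D(x)
$$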

20 | 21 | Here `D(x)` is a disturbance set unknown a priori to the robot, which we learn online during training via Gaussian Processes (GPs). The underlying library is GPyTorch. 22 | 23 | The QP used to ensure the system's safety is given by: 24 | 25 | 

![robust CBF-QP](figures/qp.png) 26 | 27 | 
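In text form, a sketch of the robust CBF-QP (the exact objective and constraints, including the GP confidence bound `k_d`·σ used to over-approximate `D(x)`, are implemented in `rcbf_sac/diff_cbf_qp.py` and `rcbf_sac/cbf_qp.py`):

$$
\begin{aligned}
u_{\mathrm{RCBF}}(x) \;=\; \arg\min_{u}\;\; & \lVert u \rVert^{2} \\
\text{s.t.}\;\; & \nabla h(x)^{\top}\big(f(x) + g(x)\,(u_{\mathrm{RL}} + u) + d\big) \;\geq\; -\gamma_b\, h(x) \qquad \forall\, d \in D(x),
\end{aligned}
$$

where `gamma_b` corresponds to the `--gamma_b` argument used in the training commands below.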

28 | 29 | where `h(x)` is the RCBF and `u_RL` is the action output by the RL policy. As such, the final (safe) action taken in the environment is given by `u = u_RL + u_RCBF`, as shown in the following diagram: 30 | 31 | 

![policy diagram](figures/policy_diagram.png) 32 | 33 | 
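A minimal Python sketch of this composition, condensed from `get_wrapped_policy()` in `rcbf_sac/utils.py` (compensator omitted, signatures simplified; `agent`, `cbf_wrapper` and `dynamics_model` are assumed to be constructed as in `main.py`):

```python
def wrapped_policy(observation):
    u_rl = agent.select_action(observation, evaluate=False)                  # nominal RL action
    state = dynamics_model.get_state(observation)                            # observation -> dynamics state
    disturb_mean, disturb_std = dynamics_model.predict_disturbance(state)    # GP estimate of D(x)
    u_rcbf = cbf_wrapper.get_u_safe(u_rl, state, disturb_mean, disturb_std)  # least-squares QP correction
    return u_rl + u_rcbf                                                     # u = u_RL + u_RCBF
```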

34 | 35 | 36 | ## Coupling RL & RCBFs to Improve Training Performance 37 | 38 | The above is sufficient to ensure the safety of the system; however, we would also like to improve the performance of the learning by letting the RCBF layer guide the training. This is achieved via: 39 | * Using a differentiable version of the safety layer that allows us to backpropagate through the RCBF-based Quadratic Program (QP), resulting in an end-to-end policy. 40 | * Using the GPs and the dynamics prior to generate synthetic data (model-based RL). 41 | 42 | ## Other Approaches 43 | 44 | In addition, the approach is compared against two other frameworks (implemented here) in the experiments: 45 | * A vanilla baseline that uses SAC with RCBFs without generating synthetic data or backpropagating through the QP (the RL loss is computed w.r.t. the output of the RL policy). 46 | * A modified approach from ["End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks"](https://ojs.aaai.org/index.php/AAAI/article/view/4213) that replaces their discrete-time CBF formulation with RCBFs, but makes use of the supervised learning component to speed up the learning. 47 | 48 | ## Running the Experiments 49 | 50 | The two environments are `Unicycle` and `SimulatedCars`. `Unicycle` involves a unicycle robot tasked with reaching a desired location while avoiding obstacles, and `SimulatedCars` involves a chain of cars driving in a lane; the RL agent controls the 4th car and must minimize control effort while avoiding collisions with the other cars. 51 | 52 | ### `Unicycle` Env: 53 | 54 | * Training the proposed approach: 55 | `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 2 --batch_size 512 --seed 12345 --model_based` 56 | 57 | * Training the baseline: 58 | `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp` 59 | 60 | * Training the modified approach from "End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks": 61 | `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp --use_comp True` 62 | 63 | ### `SimulatedCars` Env: 64 | 65 | * Training the proposed approach: 66 | `python main.py --env SimulatedCars --gamma_b 20 --max_episodes 200 --cuda --updates_per_step 2 --batch_size 512 --seed 12345 --model_based` 67 | 68 | * Training the baseline: 69 | `python main.py --env SimulatedCars --gamma_b 20 --max_episodes 200 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp` 70 | 71 | * Training the modified approach from "End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks": 72 | `python main.py --env SimulatedCars --gamma_b 20 --max_episodes 200 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp --use_comp True` 73 | 74 | ### Testing 75 | 76 | * To test, add `--mode test` and `--resume /path/to/output/{1}-run-{2}`, where `{1}` is the env name and `{2}` is the experiment number, to any of the commands above. 
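* For example, evaluating a trained `Unicycle` agent from a hypothetical run directory `output/Unicycle-run-1`: `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 2 --batch_size 512 --seed 12345 --model_based --mode test --resume output/Unicycle-run-1`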
77 | -------------------------------------------------------------------------------- /rcbf_sac/compensator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.optim import Adam 5 | from rcbf_sac.utils import scale_action, to_tensor, to_numpy 6 | 7 | criterion = nn.MSELoss() 8 | 9 | 10 | def fanin_init(size, fanin=None): 11 | fanin = fanin or size[0] 12 | v = 1. / np.sqrt(fanin) 13 | return torch.Tensor(size).uniform_(-v, v) 14 | 15 | 16 | class CompensatorModel(nn.Module): 17 | 18 | def __init__(self, state_dim, action_dim, hidden1=30, hidden2=20, init_w=3e-3): 19 | super(CompensatorModel, self).__init__() 20 | self.fc1 = nn.Linear(state_dim, hidden1) 21 | self.fc2 = nn.Linear(hidden1, hidden2) 22 | self.fc3 = nn.Linear(hidden2, action_dim) 23 | self.init_weights(init_w) 24 | 25 | def init_weights(self, init_w): 26 | self.fc1.weight.data = fanin_init(self.fc1.weight.data.size()) 27 | self.fc2.weight.data = fanin_init(self.fc2.weight.data.size()) 28 | self.fc3.weight.data.uniform_(-init_w, init_w) 29 | 30 | def forward(self, state): 31 | state = state 32 | out = self.fc1(state) 33 | out = torch.relu(out) 34 | out = self.fc2(out) 35 | out = torch.relu(out) 36 | action = torch.tanh(self.fc3(out)) # TODO:Not sure if they have the tanh here in Cheng's paper 37 | return action 38 | 39 | 40 | class Compensator: 41 | 42 | def __init__(self, state_dim, action_dim, action_lb, action_ub, args): 43 | 44 | if args.seed > 0: 45 | self.seed(args.seed) 46 | 47 | self.state_dim = state_dim 48 | self.action_dim = action_dim 49 | 50 | # # Create Actor and Critic Network 51 | # net_cfg = { 52 | # 'hidden1': args.hidden1, 53 | # 'hidden2': args.hidden2, 54 | # 'init_w': args.init_w 55 | # } 56 | # self.actor = CompensatorModel(state_dim, action_dim, **net_cfg) 57 | self.comp_actor = CompensatorModel(state_dim, action_dim) 58 | self.comp_actor_optim = Adam(self.comp_actor.parameters(), lr=args.comp_rate) 59 | 60 | self.device = torch.device("cuda" if args.cuda else "cpu") 61 | self.comp_actor.to(self.device) 62 | 63 | # Action Lower and Upper Bounds 64 | self.action_lb = to_tensor(action_lb, torch.FloatTensor, self.device) 65 | self.action_ub = to_tensor(action_ub, torch.FloatTensor, self.device) 66 | 67 | # If its never been trained then we don't want to use it (useful for eval only) 68 | self.is_trained = False 69 | 70 | 71 | def __call__(self, observation): 72 | action = self.comp_actor(observation) * self.is_trained 73 | return scale_action(action, self.action_lb, self.action_ub) 74 | 75 | def train(self, rollouts, epochs=1): 76 | """ 77 | 78 | Parameters 79 | ---------- 80 | rollout : list 81 | List of dicts with each dict containing the keys ['obs'], ['u_nom'], 82 | ['u_safe'], ['u_comp']. Each of those keys has a ndarray value where the rows are steps. 83 | Note here state refers to observation not internal state of the dynamics. 
84 | 85 | Returns 86 | ------- 87 | 88 | """ 89 | 90 | self.is_trained = True # only want to use it if its been trained 91 | 92 | for epoch in range(epochs): 93 | 94 | # Compensator Actor update 95 | self.comp_actor.zero_grad() 96 | 97 | # Stack everything appropriately 98 | all_obs = np.zeros((0, self.state_dim)) 99 | all_u_comp = np.zeros((0, self.action_dim)) 100 | all_u_safe = np.zeros((0, self.action_dim)) 101 | 102 | for rollout in rollouts: 103 | all_obs = np.vstack((all_obs, rollout['obs'])) 104 | all_u_comp = np.vstack((all_u_comp, rollout['u_comp'])) 105 | all_u_safe = np.vstack((all_u_safe, rollout['u_safe'])) 106 | 107 | all_obs = to_tensor(all_obs, torch.FloatTensor, self.device) 108 | all_u_comp = to_tensor(all_u_comp, torch.FloatTensor, self.device) 109 | all_u_safe = to_tensor(all_u_safe, torch.FloatTensor, self.device) 110 | target = scale_action(all_u_comp + all_u_safe, self.action_lb, self.action_ub) 111 | comp_actor_loss = criterion(self.comp_actor(all_obs), target) 112 | 113 | comp_actor_loss.backward() 114 | self.comp_actor_optim.step() 115 | 116 | def load_weights(self, output): 117 | if output is None: return 118 | 119 | self.comp_actor.load_state_dict( 120 | torch.load('{}/comp_actor.pkl'.format(output)) 121 | ) 122 | 123 | def save_model(self, output): 124 | torch.save( 125 | self.comp_actor.state_dict(), 126 | '{}/comp_actor.pkl'.format(output) 127 | ) 128 | 129 | def seed(self, s): 130 | 131 | torch.manual_seed(s) 132 | if torch.cuda.is_available(): 133 | torch.cuda.manual_seed(s) 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /rcbf_sac/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import os 4 | import torch 5 | from torch.autograd import Variable 6 | 7 | USE_CUDA = torch.cuda.is_available() 8 | 9 | 10 | def prRed(prt): print("\033[91m {}\033[00m".format(prt)) 11 | 12 | 13 | def prGreen(prt): print("\033[92m {}\033[00m".format(prt)) 14 | 15 | 16 | def prYellow(prt): print("\033[93m {}\033[00m".format(prt)) 17 | 18 | 19 | def prLightPurple(prt): print("\033[94m {}\033[00m".format(prt)) 20 | 21 | 22 | def prPurple(prt): print("\033[95m {}\033[00m".format(prt)) 23 | 24 | 25 | def prCyan(prt): print("\033[96m {}\033[00m".format(prt)) 26 | 27 | 28 | def prLightGray(prt): print("\033[97m {}\033[00m".format(prt)) 29 | 30 | 31 | def prBlack(prt): print("\033[98m {}\033[00m".format(prt)) 32 | 33 | 34 | def mat_to_euler_2d(rot_mat): 35 | """ 36 | rot_mat has shape: 37 | [[c -s 0], 38 | [s c 0], 39 | [0 0 1]] 40 | """ 41 | 42 | theta = np.arcsin(rot_mat[1, 0]) 43 | return theta 44 | 45 | 46 | def euler_to_mat_2d(theta_batch): 47 | s = np.sin(theta_batch) 48 | c = np.cos(theta_batch) 49 | Rs = np.zeros((theta_batch.shape[0], 2, 2)) 50 | Rs[:, 0, 0] = c 51 | Rs[:, 0, 1] = -s 52 | Rs[:, 1, 0] = s 53 | Rs[:, 1, 1] = c 54 | return Rs 55 | 56 | def to_numpy(x): 57 | # convert torch tensor to numpy array 58 | return x.cpu().detach().double().numpy() 59 | 60 | def to_tensor(x, dtype, device, requires_grad=False): 61 | # convert numpy array to torch tensor 62 | return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) 63 | 64 | def scale_action(action, action_lb, action_ub, device=None): 65 | 66 | act_k = (action_ub - action_lb) / 2. 67 | act_b = (action_ub + action_lb) / 2. 
68 | return act_k * action + act_b 69 | 70 | 71 | def soft_update(target, source, tau): 72 | for target_param, param in zip(target.parameters(), source.parameters()): 73 | target_param.data.copy_( 74 | target_param.data * (1.0 - tau) + param.data * tau 75 | ) 76 | 77 | 78 | def hard_update(target, source): 79 | for target_param, param in zip(target.parameters(), source.parameters()): 80 | target_param.data.copy_(param.data) 81 | 82 | 83 | def create_log_gaussian(mean, log_std, t): 84 | quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2)) 85 | l = mean.shape 86 | log_z = log_std 87 | z = l[-1] * math.log(2 * math.pi) 88 | log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z 89 | return log_p 90 | 91 | 92 | def logsumexp(inputs, dim=None, keepdim=False): 93 | if dim is None: 94 | inputs = inputs.view(-1) 95 | dim = 0 96 | s, _ = torch.max(inputs, dim=dim, keepdim=True) 97 | outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log() 98 | if not keepdim: 99 | outputs = outputs.squeeze(dim) 100 | return outputs 101 | 102 | 103 | def get_output_folder(parent_dir, env_name): 104 | """Return save folder. 105 | 106 | Assumes folders in the parent_dir have suffix -run{run 107 | number}. Finds the highest run number and sets the output folder 108 | to that number + 1. This is just convenient so that if you run the 109 | same script multiple times tensorboard can plot all of the results 110 | on the same plots with different names. 111 | 112 | Parameters 113 | ---------- 114 | parent_dir: str 115 | Path of the directory containing all experiment runs. 116 | 117 | Returns 118 | ------- 119 | parent_dir/run_dir 120 | Path to this run's save directory. 121 | """ 122 | os.makedirs(parent_dir, exist_ok=True) 123 | experiment_id = 0 124 | for folder_name in os.listdir(parent_dir): 125 | if not os.path.isdir(os.path.join(parent_dir, folder_name)): 126 | continue 127 | try: 128 | folder_name = int(folder_name.split('-run')[-1]) 129 | if folder_name > experiment_id: 130 | experiment_id = folder_name 131 | except: 132 | pass 133 | experiment_id += 1 134 | 135 | parent_dir = os.path.join(parent_dir, env_name) 136 | parent_dir = parent_dir + '-run{}'.format(experiment_id) 137 | os.makedirs(parent_dir, exist_ok=True) 138 | return parent_dir 139 | 140 | 141 | def get_wrapped_policy(agent, cbf_wrapper, dynamics_model, compensator=None, warmup=False, action_space=None, 142 | policy_eval=False): 143 | 144 | def wrapped_policy(observation): 145 | 146 | if warmup and action_space: 147 | action = action_space.sample() # Sample random action 148 | else: 149 | action = agent.select_action(observation, evaluate=policy_eval) # Sample action from policy 150 | 151 | if compensator: 152 | action_comp = compensator(observation) 153 | else: 154 | action_comp = 0 155 | state = dynamics_model.get_state(observation) 156 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state) 157 | action_safe = cbf_wrapper.get_u_safe(action + action_comp, state, disturb_mean, disturb_std) 158 | # print('state = {}, action = {}, action_comp = {}, u_safe = {}'.format(state, action, action_comp, u_safe)) 159 | return action + action_comp + action_safe 160 | 161 | return wrapped_policy 162 | -------------------------------------------------------------------------------- /rcbf_sac/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | 6 | LOG_SIG_MAX = 2 7 | 
LOG_SIG_MIN = -20 8 | epsilon = 1e-6 9 | 10 | # Initialize Policy weights 11 | def weights_init_(m): 12 | if isinstance(m, nn.Linear): 13 | torch.nn.init.xavier_uniform_(m.weight, gain=1) 14 | torch.nn.init.constant_(m.bias, 0) 15 | 16 | 17 | class ValueNetwork(nn.Module): 18 | def __init__(self, num_inputs, hidden_dim): 19 | super(ValueNetwork, self).__init__() 20 | 21 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 22 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 23 | self.linear3 = nn.Linear(hidden_dim, 1) 24 | 25 | self.apply(weights_init_) 26 | 27 | def forward(self, state): 28 | x = F.relu(self.linear1(state)) 29 | x = F.relu(self.linear2(x)) 30 | x = self.linear3(x) 31 | return x 32 | 33 | 34 | class QNetwork(nn.Module): 35 | def __init__(self, num_inputs, num_actions, hidden_dim): 36 | super(QNetwork, self).__init__() 37 | 38 | # Q1 architecture 39 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim) 40 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 41 | self.linear3 = nn.Linear(hidden_dim, 1) 42 | 43 | # Q2 architecture 44 | self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim) 45 | self.linear5 = nn.Linear(hidden_dim, hidden_dim) 46 | self.linear6 = nn.Linear(hidden_dim, 1) 47 | 48 | self.apply(weights_init_) 49 | 50 | def forward(self, state, action): 51 | xu = torch.cat([state, action], 1) 52 | 53 | x1 = F.relu(self.linear1(xu)) 54 | x1 = F.relu(self.linear2(x1)) 55 | x1 = self.linear3(x1) 56 | 57 | x2 = F.relu(self.linear4(xu)) 58 | x2 = F.relu(self.linear5(x2)) 59 | x2 = self.linear6(x2) 60 | 61 | return x1, x2 62 | 63 | 64 | class GaussianPolicy(nn.Module): 65 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None): 66 | super(GaussianPolicy, self).__init__() 67 | 68 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 69 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 70 | 71 | self.mean_linear = nn.Linear(hidden_dim, num_actions) 72 | self.log_std_linear = nn.Linear(hidden_dim, num_actions) 73 | 74 | self.apply(weights_init_) 75 | 76 | # action rescaling 77 | if action_space is None: 78 | self.action_scale = torch.tensor(1.) 79 | self.action_bias = torch.tensor(0.) 80 | else: 81 | self.action_scale = torch.FloatTensor( 82 | (action_space.high - action_space.low) / 2.) 83 | self.action_bias = torch.FloatTensor( 84 | (action_space.high + action_space.low) / 2.) 
85 | 86 | def forward(self, state): 87 | x = F.relu(self.linear1(state)) 88 | x = F.relu(self.linear2(x)) 89 | mean = self.mean_linear(x) 90 | log_std = self.log_std_linear(x) 91 | log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX) 92 | return mean, log_std 93 | 94 | def sample(self, state): 95 | mean, log_std = self.forward(state) 96 | std = log_std.exp() 97 | normal = Normal(mean, std) 98 | x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) 99 | y_t = torch.tanh(x_t) 100 | action = y_t * self.action_scale + self.action_bias 101 | log_prob = normal.log_prob(x_t) 102 | # Enforcing Action Bound 103 | log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon) 104 | log_prob = log_prob.sum(1, keepdim=True) 105 | mean = torch.tanh(mean) * self.action_scale + self.action_bias 106 | return action, log_prob, mean 107 | 108 | def to(self, device): 109 | self.action_scale = self.action_scale.to(device) 110 | self.action_bias = self.action_bias.to(device) 111 | return super(GaussianPolicy, self).to(device) 112 | 113 | 114 | class DeterministicPolicy(nn.Module): 115 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None): 116 | super(DeterministicPolicy, self).__init__() 117 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 118 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 119 | 120 | self.mean = nn.Linear(hidden_dim, num_actions) 121 | self.noise = torch.Tensor(num_actions) 122 | 123 | self.apply(weights_init_) 124 | 125 | # action rescaling 126 | if action_space is None: 127 | self.action_scale = 1. 128 | self.action_bias = 0. 129 | else: 130 | self.action_scale = torch.FloatTensor( 131 | (action_space.high - action_space.low) / 2.) 132 | self.action_bias = torch.FloatTensor( 133 | (action_space.high + action_space.low) / 2.) 134 | 135 | def forward(self, state): 136 | x = F.relu(self.linear1(state)) 137 | x = F.relu(self.linear2(x)) 138 | mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias 139 | return mean 140 | 141 | def sample(self, state): 142 | mean = self.forward(state) 143 | noise = self.noise.normal_(0., std=0.1) 144 | noise = noise.clamp(-0.25, 0.25) 145 | action = mean + noise 146 | return action, torch.tensor(0.), mean 147 | 148 | def to(self, device): 149 | self.action_scale = self.action_scale.to(device) 150 | self.action_bias = self.action_bias.to(device) 151 | self.noise = self.noise.to(device) 152 | return super(DeterministicPolicy, self).to(device) 153 | -------------------------------------------------------------------------------- /rcbf_sac/gp_model.py: -------------------------------------------------------------------------------- 1 | """ Adapted almost directly from: 2 | https://docs.gpytorch.ai/en/stable/examples/02_Scalable_Exact_GPs/Simple_GP_Regression_CUDA.html 3 | 4 | Training is performed rapidly (and exactly) using GPUs and prediction is done very rapidly using LOVE. 
5 | """ 6 | 7 | import torch 8 | import gpytorch 9 | from rcbf_sac.utils import to_tensor, to_numpy 10 | 11 | 12 | class BaseGPy(gpytorch.models.ExactGP): 13 | 14 | def __init__(self, train_x, train_y, prior_std, likelihood): 15 | super().__init__(train_x, train_y, likelihood) 16 | self.mean_module = gpytorch.means.ZeroMean() 17 | self.covar_module = gpytorch.kernels.ScaleKernel( 18 | gpytorch.kernels.RBFKernel(lengthscale_prior=gpytorch.priors.NormalPrior(1e5, 1e-5)), 19 | outputscale_prior=gpytorch.priors.NormalPrior(prior_std + 1e-6, 1e-5)) 20 | # Initialize lengthscale and outputscale to mean of priors 21 | self.covar_module.base_kernel.lengthscale = 1e5 22 | self.covar_module.outputscale = prior_std + 1e-6 23 | 24 | def forward(self, x): 25 | mean = self.mean_module(x) 26 | covar = self.covar_module(x) 27 | return gpytorch.distributions.MultivariateNormal(mean, covar) 28 | 29 | 30 | class GPyDisturbanceEstimator: 31 | """ 32 | A wrapper around teh BaseGPy model above. 33 | """ 34 | 35 | def __init__(self, train_x, train_y, prior_std, likelihood=None, device=None): 36 | 37 | 38 | 39 | if device: 40 | self.device = device 41 | else: 42 | self.device = torch.device("cpu") 43 | 44 | if not torch.is_tensor(train_x): 45 | train_x = to_tensor(train_x, torch.FloatTensor, self.device) 46 | if not torch.is_tensor(train_y): 47 | train_y = to_tensor(train_y, torch.FloatTensor, self.device) 48 | self.train_x = train_x 49 | self.train_y = train_y 50 | 51 | if not likelihood: 52 | likelihood = gpytorch.likelihoods.GaussianLikelihood() 53 | self.likelihood = likelihood.to(self.device) 54 | 55 | self.model = BaseGPy(train_x, train_y, prior_std, likelihood) 56 | self.model = self.model.to(self.device) 57 | 58 | def train(self, training_iter, verbose=False): 59 | 60 | # Find optimal model hyperparameters 61 | self.model.train() 62 | self.likelihood.train() 63 | 64 | # Use the adam optimizer 65 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) # Includes GaussianLikelihood parameters 66 | 67 | # "Loss" for GPs - the marginal log likelihood 68 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model) 69 | 70 | for i in range(training_iter): 71 | # Zero gradients from previous iteration 72 | optimizer.zero_grad() 73 | # Output from model 74 | output = self.model(self.train_x) 75 | # Calc loss and backprop gradients 76 | loss = -mll(output, self.train_y) 77 | loss.backward() 78 | if verbose: 79 | print('\tIter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % ( 80 | i + 1, training_iter, loss.item(), 81 | self.model.covar_module.base_kernel.lengthscale.item(), 82 | self.model.likelihood.noise.item() 83 | )) 84 | optimizer.step() 85 | 86 | def predict(self, test_x): 87 | 88 | # Convert to torch tensor 89 | is_tensor = torch.is_tensor(test_x) 90 | if not is_tensor: 91 | test_x = to_tensor(test_x, torch.FloatTensor, self.device) 92 | 93 | # Get into evaluation (predictive posterior) mode 94 | self.model.eval() 95 | self.likelihood.eval() 96 | 97 | # Test points are regularly spaced along [0,1] 98 | # Make predictions by feeding model through likelihood 99 | with torch.no_grad(), gpytorch.settings.fast_pred_var(): 100 | observed_pred = self.likelihood(self.model(test_x)) 101 | pred_dict = dict() 102 | pred_dict['mean'] = observed_pred.mean.cpu() 103 | pred_dict['f_var'] = observed_pred.variance.cpu() 104 | pred_dict['f_covar'] = observed_pred.covariance_matrix.cpu() 105 | lower_ci, upper_ci = observed_pred.confidence_region() 106 | pred_dict['lower_ci'] = lower_ci.cpu() 107 | 
pred_dict['upper_ci'] = upper_ci.cpu() 108 | 109 | # If they gave us ndarray, we give back ndarray 110 | if not is_tensor: 111 | for key, val in pred_dict.items(): 112 | pred_dict[key] = to_numpy(val) 113 | 114 | return pred_dict 115 | 116 | 117 | if __name__ == '__main__': 118 | """ 119 | Simple code to test the GP model on a simple dataset. 120 | """ 121 | 122 | # this is for running the notebook in our testing framework 123 | import os 124 | import matplotlib.pyplot as plt 125 | import math 126 | 127 | smoke_test = ('CI' in os.environ) 128 | training_iter = 2 if smoke_test else 50 129 | # Training data is 11 points in [0,1] inclusive regularly spaced 130 | train_x = torch.linspace(0, 1, 100) 131 | # True function is sin(2*pi*x) with Gaussian noise 132 | train_y = torch.sin(train_x * (2 * math.pi)) + torch.randn(train_x.size()) * 0.2 133 | 134 | disturb_estimator = GPyDisturbanceEstimator(train_x, train_y, 0.0) 135 | disturb_estimator.train(training_iter) 136 | 137 | print('Testing model...') 138 | test_x = torch.linspace(0, 1, 51) 139 | prediction = disturb_estimator.predict(test_x) 140 | train_x = train_x.cpu() 141 | train_y = train_y.cpu() 142 | test_x = test_x.cpu() 143 | 144 | with torch.no_grad(): 145 | # Initialize plot 146 | f, ax = plt.subplots(1, 1, figsize=(4, 3)) 147 | # Plot training data as black stars 148 | ax.plot(train_x.numpy(), train_y.numpy(), 'k*') 149 | # Plot predictive means as blue line 150 | ax.plot(test_x.numpy(), prediction['mean'].numpy(), 'b') 151 | # Shade between the lower and upper confidence bounds 152 | ax.fill_between(test_x.numpy(), prediction['lower_ci'].numpy(), prediction['upper_ci'].numpy(), alpha=0.5) 153 | ax.set_ylim([-3, 3]) 154 | ax.legend(['Observed Data', 'Mean', 'Confidence']) 155 | plt.show() -------------------------------------------------------------------------------- /envs/simulated_cars_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SimulatedCarsEnv(gym.Env): 7 | """Simulated Car Following Env, almost identical to https://github.com/rcheng805/RL-CBF/blob/master/car/DDPG/car_simulator.py 8 | Front <- Car 1 <- Car 2 <- Car 3 <- Car 4 (controlled) <- Car 5 9 | """ 10 | 11 | metadata = {'render.modes': ['human']} 12 | 13 | def __init__(self): 14 | 15 | super(SimulatedCarsEnv, self).__init__() 16 | 17 | self.dynamics_mode = 'SimulatedCars' 18 | self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,)) 19 | self.safe_action_space = spaces.Box(low=-10.0, high=10.0, shape=(1,)) 20 | self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(10,)) 21 | self.max_episode_steps = 300 22 | self.dt = 0.02 23 | 24 | # Gains 25 | self.kp = 4.0 26 | self.k_brake = 20.0 27 | 28 | self.state = None # State [x_1 v_1 ... 
x_5 v_5] 29 | self.t = 0 # Time 30 | self.episode_step = 0 # Episode Step 31 | 32 | # Gaussian Noise Parameters on the accelerations of the other vehicles 33 | self.disturb_mean = np.zeros((1,)) 34 | self.disturb_covar = np.diag([0.2**2]) 35 | 36 | self.reset() 37 | 38 | def step(self, action): 39 | """Organize the observation to understand what's going on 40 | 41 | Parameters 42 | ---------- 43 | action : ndarray 44 | Action that the agent takes in the environment 45 | 46 | Returns 47 | ------- 48 | new_obs : ndarray 49 | The new observation with the following structure: 50 | [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal] 51 | 52 | """ 53 | 54 | # Current State 55 | pos = self.state[::2] 56 | vels = self.state[1::2] 57 | 58 | # Actions (accelerations of Cars 1 to 5) 59 | vels_des = 30.0 * np.ones(5) # Desired velocities 60 | vels_des[0] -= 10*np.sin(0.2*self.t) 61 | accels = self.kp * (vels_des - vels) 62 | accels[1] += -self.k_brake * (pos[0] - pos[1]) * ((pos[0] - pos[1]) < 6.0) 63 | accels[2] += -self.k_brake * (pos[1] - pos[2]) * ((pos[1] - pos[2]) < 6.0) 64 | accels[4] += -self.k_brake * (pos[2] - pos[4]) * ((pos[2] - pos[4]) < 13.0) 65 | 66 | # Add deterministic disturbance 67 | accels *= 1.1 68 | 69 | # Determine action of each car 70 | f_x = np.zeros(10) 71 | g_x = np.zeros(10) 72 | 73 | f_x[::2] = vels # Derivatives of positions are velocities 74 | f_x[1::2] = accels # Derivatives of velocities are acceleration 75 | g_x[7] = 50.0 # Car 4's acceleration (idx = 2*4 - 1) is the control input 76 | 77 | self.state += self.dt * (f_x + g_x * action) 78 | 79 | self.t = self.t + self.dt # time 80 | 81 | self.episode_step += 1 # steps in episode 82 | 83 | done = self.episode_step >= self.max_episode_steps # done? 84 | 85 | info = {'cost': self._get_cost(), 'goal_met': False} # Goal is never met since we're driving into the sunset 86 | 87 | return self._get_obs(), self._get_reward(action[0]), done, info 88 | 89 | def _get_reward(self, action): 90 | 91 | # car_4_vel = self.state[7] # car's 4 velocity 92 | # return -np.abs(car_4_vel) * np.abs(action) * (action > 0) / self.max_episode_steps 93 | return -5.0 * np.abs(action**2) / self.max_episode_steps 94 | 95 | def _get_cost(self): 96 | 97 | car_4_pos = self.state[6] # car's 4 position 98 | cost = 0 99 | 100 | if (self.state[4] - car_4_pos) < 2.99: # How far is car 3? 101 | cost -= 0.1 102 | 103 | if (car_4_pos - self.state[8]) < 2.99: # How far is car 4? 104 | cost -= 0.1 105 | 106 | return cost 107 | 108 | def reset(self): 109 | """ Reset the state of the environment to an initial state. 110 | 111 | Returns 112 | ------- 113 | observation : ndarray 114 | Next observation. 
115 | """ 116 | 117 | self.t = 0 118 | self.state = np.zeros(10) # first col is pos, 2nd is vel 119 | self.state[::2] = [34.0, 28.0, 22.0, 16.0, 10.0] # initial positions 120 | self.state[1::2] = 30.0 + np.random.normal(0, 0.5) # initial velocities 121 | self.state[7] = 35.0 # initial velocity of car 4 122 | 123 | self.episode_step = 0 124 | 125 | return self._get_obs() 126 | 127 | 128 | def render(self, mode='human', close=False): 129 | """Render the environment to the screen 130 | 131 | Parameters 132 | ---------- 133 | mode : str 134 | close : bool 135 | 136 | Returns 137 | ------- 138 | 139 | """ 140 | 141 | print('Ep_step = {}, \tState = {}'.format(self.episode_step, self.state)) 142 | 143 | def _get_obs(self): 144 | """Given the state, this function returns it to an observation akin to the one obtained by calling env.step 145 | 146 | Parameters 147 | ---------- 148 | 149 | Returns 150 | ------- 151 | observation : ndarray 152 | Observation: [car_1_x, car_1_v, car_1_a, ...] 153 | """ 154 | 155 | obs = np.copy(np.ravel(self.state)) 156 | obs[::2] /= 100.0 # scale positions 157 | obs[1::2] /= 30.0 # scale velocities 158 | return obs 159 | 160 | 161 | if __name__ == "__main__": 162 | 163 | import matplotlib.pyplot as plt 164 | from rcbf_sac.cbf_qp import CascadeCBFLayer 165 | from rcbf_sac.dynamics import DynamicsModel 166 | import argparse 167 | 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument('--gp_model_size', default=2000, type=int, help='gp') 170 | parser.add_argument('--k_d', default=3.0, type=float) 171 | parser.add_argument('--gamma_b', default=20, type=float) 172 | parser.add_argument('--cuda', action="store_true", help='run on CUDA (default: False)') 173 | args = parser.parse_args() 174 | 175 | env = SimulatedCarsEnv() 176 | dynamics_model = DynamicsModel(env, args) 177 | cbf_wrapper = CascadeCBFLayer(env, gamma_b=args.gamma_b, k_d=args.k_d) 178 | 179 | obs = env.reset() 180 | state = dynamics_model.get_state(obs) 181 | done = False 182 | episode_reward = 0 183 | episode_step = 0 184 | 185 | # Plot initial state 186 | car_patches = [] 187 | plt.figure(figsize=(20, 5), dpi=80) 188 | for i in range(5): 189 | car_patches.append(plt.Rectangle((state[2*i] - 1.5, -1.0), 3.0, 2.0, fc='blue', ec='blue', alpha=0.2)) 190 | plt.gca().add_patch(car_patches[i]) 191 | p_vel = plt.quiver(state[::2], np.zeros(5), state[1::2], np.zeros(5)) 192 | plt.ylim([-6.0, 6.0]) 193 | plt.grid() 194 | 195 | def controller(state): 196 | gain = 1.0 197 | action = np.array([gain * (state[4] - state[6] - 0.4) * (state[4] - state[6] - 0.4 < 0)]) 198 | action += np.array([gain * (state[8] - state[6] + 0.4) * (state[8] - state[6] + 0.4 > 0)]) 199 | return action 200 | 201 | while not done: 202 | # Plot current state 203 | state = dynamics_model.get_state(obs) 204 | print('obs = {}'.format(obs)) 205 | print('state = {}'.format(state)) 206 | pos = state[::2] 207 | for i in range(5): 208 | car_patches[i].set_x(state[2*i] - 1.5) 209 | p_vel.XY[:, 0] = state[::2] 210 | p_vel.set_UVC(state[1::2], np.zeros(5)) 211 | # Take Action and get next state 212 | # random_action = env.action_space.sample() 213 | random_action = controller(state) 214 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state) 215 | action_safe = cbf_wrapper.get_u_safe(random_action, state, disturb_mean, disturb_std) 216 | # Predict next state (testing for model-based rollouts) 217 | # Note that in this env, obs and state are the same but that's not always the case! 
218 | next_state, next_state_std, _ = dynamics_model.predict_next_state(obs, random_action + action_safe, t_batch=np.array([env.dt * episode_step]), use_gps=False) 219 | # Take Environment Action 220 | obs, reward, done, info = env.step(random_action + action_safe) 221 | # assert np.sum(np.abs(next_state - obs)) < 1e-6, 'Predicted and Actual Next States are not the same!\nPredicted: {} \nActual: {}'.format(next_state, obs) 222 | plt.xlim([pos[-1] - 5.0, pos[0] + 5.0]) 223 | plt.pause(0.01) 224 | episode_reward += reward 225 | episode_step += 1 226 | print('action = {},\taction_cbf = {},\tepisode_reward = {:.3f}'.format(random_action + action_safe, action_safe, episode_reward)) 227 | 228 | plt.show() 229 | -------------------------------------------------------------------------------- /rcbf_sac/sac_cbf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.optim import Adam 4 | from rcbf_sac.utils import soft_update, hard_update 5 | from rcbf_sac.model import GaussianPolicy, QNetwork, DeterministicPolicy 6 | import numpy as np 7 | from rcbf_sac.diff_cbf_qp import CBFQPLayer 8 | from rcbf_sac.utils import to_tensor 9 | from rcbf_sac.compensator import Compensator 10 | from copy import deepcopy 11 | 12 | class RCBF_SAC(object): 13 | 14 | def __init__(self, num_inputs, action_space, env, args): 15 | 16 | self.gamma = args.gamma 17 | self.tau = args.tau 18 | self.alpha = args.alpha 19 | 20 | self.policy_type = args.policy 21 | self.target_update_interval = args.target_update_interval 22 | self.automatic_entropy_tuning = args.automatic_entropy_tuning 23 | self.action_space = action_space 24 | self.device = torch.device("cuda" if args.cuda else "cpu") 25 | 26 | self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device) 27 | self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) 28 | 29 | self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device) 30 | hard_update(self.critic_target, self.critic) 31 | 32 | if self.policy_type == "Gaussian": 33 | # Target Entropy = −dim(A) (e.g. 
, -6 for HalfCheetah-v2) as given in the paper 34 | if self.automatic_entropy_tuning is True: 35 | self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item() 36 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 37 | self.alpha_optim = Adam([self.log_alpha], lr=args.lr) 38 | 39 | self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) 40 | self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) 41 | 42 | else: 43 | self.alpha = 0 44 | self.automatic_entropy_tuning = False 45 | self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) 46 | self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) 47 | 48 | # CBF layer 49 | self.env = env 50 | self.cbf_layer = CBFQPLayer(env, args, args.gamma_b, args.k_d, args.l_p) 51 | self.diff_qp = args.diff_qp 52 | 53 | # compensator 54 | if args.use_comp: 55 | self.compensator = Compensator(num_inputs, action_space.shape[0], action_space.low, action_space.high, args) 56 | else: 57 | self.compensator = None 58 | 59 | def select_action(self, state, dynamics_model, evaluate=False, warmup=False): 60 | 61 | state = to_tensor(state, torch.FloatTensor, self.device) 62 | expand_dim = len(state.shape) == 1 63 | if expand_dim: 64 | state = state.unsqueeze(0) 65 | 66 | if warmup: 67 | batch_size = state.shape[0] 68 | action = torch.zeros((batch_size, self.action_space.shape[0])).to(self.device) 69 | for i in range(batch_size): 70 | action[i] = torch.from_numpy(self.action_space.sample()).to(self.device) 71 | else: 72 | if evaluate is False: 73 | action, _, _ = self.policy.sample(state) 74 | else: 75 | _, _, action = self.policy.sample(state) 76 | 77 | if self.compensator: 78 | action_comp = self.compensator(state) 79 | action += action_comp 80 | 81 | safe_action = self.get_safe_action(state, action, dynamics_model) 82 | 83 | if not self.compensator: 84 | return safe_action.detach().cpu().numpy()[0] if expand_dim else safe_action.detach().cpu().numpy() 85 | else: 86 | action_cbf = safe_action - action 87 | action_comp = action_comp.detach().cpu().numpy()[0] if expand_dim else action_comp.detach().cpu().numpy() 88 | action_cbf = action_cbf.detach().cpu().numpy()[0] if expand_dim else action_cbf.detach().cpu().numpy() 89 | safe_action = safe_action.detach().cpu().numpy()[0] if expand_dim else safe_action.detach().cpu().numpy() 90 | return safe_action, action_comp, action_cbf 91 | 92 | def update_parameters(self, memory, batch_size, updates, dynamics_model, memory_model=None, real_ratio=None): 93 | """ 94 | 95 | Parameters 96 | ---------- 97 | memory : ReplayMemory 98 | batch_size : int 99 | updates : int 100 | dynamics_model : GP Dynamics' Disturbance model D(x) in x_dot = f(x) + g(x)u + D(x) 101 | memory_model : ReplayMemory, optional 102 | If not none, perform model-based RL. 103 | real_ratio : float, optional 104 | If performing model-based RL, then real_ratio*batch_size are sampled from the real buffer, and the rest 105 | is sampled from the model buffer. 
106 | Returns 107 | ------- 108 | 109 | """ 110 | 111 | # Model-based vs regular RL 112 | if memory_model and real_ratio: 113 | state_batch, action_batch, reward_batch, next_state_batch, mask_batch, t_batch, next_t_batch = memory.sample(batch_size=int(real_ratio*batch_size)) 114 | state_batch_m, action_batch_m, reward_batch_m, next_state_batch_m, mask_batch_m, t_batch_m, next_t_batch_m = memory_model.sample( 115 | batch_size=int((1-real_ratio) * batch_size)) 116 | state_batch = np.vstack((state_batch, state_batch_m)) 117 | action_batch = np.vstack((action_batch, action_batch_m)) 118 | reward_batch = np.hstack((reward_batch, reward_batch_m)) 119 | next_state_batch = np.vstack((next_state_batch, next_state_batch_m)) 120 | mask_batch = np.hstack((mask_batch, mask_batch_m)) 121 | else: 122 | state_batch, action_batch, reward_batch, next_state_batch, mask_batch, t_batch, next_t_batch = memory.sample(batch_size=batch_size) 123 | 124 | state_batch = torch.FloatTensor(state_batch).to(self.device) 125 | next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) 126 | action_batch = torch.FloatTensor(action_batch).to(self.device) 127 | reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1) 128 | mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1) 129 | 130 | with torch.no_grad(): 131 | next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch) 132 | if self.diff_qp: # Compute next safe actions using Differentiable CBF-QP 133 | next_state_action = self.get_safe_action(next_state_batch, next_state_action, dynamics_model) 134 | qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action) 135 | min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi 136 | next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target) 137 | qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step 138 | qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] 139 | qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] 140 | qf_loss = qf1_loss + qf2_loss 141 | 142 | self.critic_optim.zero_grad() 143 | qf_loss.backward() 144 | self.critic_optim.step() 145 | 146 | # Compute Actions and log probabilities 147 | pi, log_pi, _ = self.policy.sample(state_batch) 148 | if self.diff_qp: # Compute safe action using Differentiable CBF-QP 149 | pi = self.get_safe_action(state_batch, pi, dynamics_model) 150 | 151 | qf1_pi, qf2_pi = self.critic(state_batch, pi) 152 | min_qf_pi = torch.min(qf1_pi, qf2_pi) 153 | 154 | policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] 155 | 156 | self.policy_optim.zero_grad() 157 | policy_loss.backward() 158 | self.policy_optim.step() 159 | 160 | if self.automatic_entropy_tuning: 161 | alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() 162 | 163 | self.alpha_optim.zero_grad() 164 | alpha_loss.backward() 165 | self.alpha_optim.step() 166 | 167 | self.alpha = self.log_alpha.exp() 168 | alpha_tlogs = self.alpha.clone() # For Comet.ml logs 169 | else: 170 | alpha_loss = torch.tensor(0.).to(self.device) 171 | alpha_tlogs = torch.tensor(self.alpha) # For Comet.ml logs 172 | 173 | if updates % self.target_update_interval == 0: 174 | soft_update(self.critic_target, self.critic, self.tau) 175 | 
176 | return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item() 177 | 178 | def update_parameters_compensator(self, comp_rollouts): 179 | 180 | if self.compensator: 181 | self.compensator.train(comp_rollouts) 182 | 183 | 184 | # Save model parameters 185 | def save_model(self, output): 186 | print('Saving models in {}'.format(output)) 187 | torch.save( 188 | self.policy.state_dict(), 189 | '{}/actor.pkl'.format(output) 190 | ) 191 | torch.save( 192 | self.critic.state_dict(), 193 | '{}/critic.pkl'.format(output) 194 | ) 195 | if self.compensator: 196 | self.compensator.save_model(output) 197 | 198 | # Load model parameters 199 | def load_weights(self, output): 200 | if output is None: return 201 | print('Loading models from {}'.format(output)) 202 | 203 | self.policy.load_state_dict( 204 | torch.load('{}/actor.pkl'.format(output), map_location=torch.device(self.device)) 205 | ) 206 | self.critic.load_state_dict( 207 | torch.load('{}/critic.pkl'.format(output), map_location=torch.device(self.device)) 208 | ) 209 | if self.compensator: 210 | self.compensator.load_weights(output) 211 | 212 | def load_model(self, actor_path, critic_path): 213 | if actor_path is not None: 214 | self.policy.load_state_dict(torch.load(actor_path)) 215 | if critic_path is not None: 216 | self.critic.load_state_dict(torch.load(critic_path)) 217 | 218 | def get_safe_action(self, obs_batch, action_batch, dynamics_model): 219 | """Given a nominal action, returns a minimally-altered safe action to take. 220 | 221 | Parameters 222 | ---------- 223 | obs_batch : torch.tensor 224 | action_batch : torch.tensor 225 | dynamics_model : DynamicsModel 226 | 227 | Returns 228 | ------- 229 | safe_action_batch : torch.tensor 230 | Safe actions to be taken (cbf_action + action). 231 | """ 232 | 233 | state_batch = dynamics_model.get_state(obs_batch) 234 | mean_pred_batch, sigma_pred_batch = dynamics_model.predict_disturbance(state_batch) 235 | 236 | safe_action_batch = self.cbf_layer.get_safe_action(state_batch, action_batch, mean_pred_batch, sigma_pred_batch) 237 | 238 | return safe_action_batch 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /envs/pyglet_rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework (Obtained from: https://github.com/openai/gym/blob/95d649fdeca89bfed5e199829d4d905a52d2da2e/gym/utils/pyglet_rendering.py) 3 | """ 4 | import os 5 | import sys 6 | 7 | if "Apple" in sys.version: 8 | if "DYLD_FALLBACK_LIBRARY_PATH" in os.environ: 9 | os.environ["DYLD_FALLBACK_LIBRARY_PATH"] += ":/usr/lib" 10 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 11 | 12 | from gym import error 13 | 14 | try: 15 | import pyglet 16 | except ImportError as e: 17 | raise ImportError( 18 | """ 19 | Cannot import pyglet. 20 | HINT: you can install pyglet directly via 'pip install pyglet'. 21 | But if you really just want to install all Gym dependencies and not have to think about it, 22 | 'pip install -e .[all]' or 'pip install gym[all]' will do it. 23 | """ 24 | ) 25 | 26 | try: 27 | from pyglet.gl import * 28 | except ImportError as e: 29 | raise ImportError( 30 | """ 31 | Error occurred while running `from pyglet.gl import *` 32 | HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'. 
33 | If you're running on a server, you may need a virtual frame buffer; something like this should work: 34 | 'xvfb-run -s \"-screen 0 1400x900x24\" python ' 35 | """ 36 | ) 37 | 38 | import math 39 | import numpy as np 40 | 41 | RAD2DEG = 57.29577951308232 42 | 43 | 44 | def get_display(spec): 45 | """Convert a display specification (such as :0) into an actual Display 46 | object. 47 | Pyglet only supports multiple Displays on Linux. 48 | """ 49 | if spec is None: 50 | return pyglet.canvas.get_display() 51 | # returns already available pyglet_display, 52 | # if there is no pyglet display available then it creates one 53 | elif isinstance(spec, str): 54 | return pyglet.canvas.Display(spec) 55 | else: 56 | raise error.Error( 57 | f"Invalid display specification: {spec}. (Must be a string like :0 or None.)" 58 | ) 59 | 60 | 61 | def get_window(width, height, display, **kwargs): 62 | """ 63 | Will create a pyglet window from the display specification provided. 64 | """ 65 | screen = display.get_screens() # available screens 66 | config = screen[0].get_best_config() # selecting the first screen 67 | context = config.create_context(None) # create GL context 68 | 69 | return pyglet.window.Window( 70 | width=width, 71 | height=height, 72 | display=display, 73 | config=config, 74 | context=context, 75 | **kwargs, 76 | ) 77 | 78 | 79 | class Viewer: 80 | def __init__(self, width, height, display=None): 81 | display = get_display(display) 82 | 83 | self.width = width 84 | self.height = height 85 | self.window = get_window(width=width, height=height, display=display) 86 | self.window.on_close = self.window_closed_by_user 87 | self.isopen = True 88 | self.geoms = [] 89 | self.onetime_geoms = [] 90 | self.transform = Transform() 91 | 92 | glEnable(GL_BLEND) 93 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 94 | 95 | def close(self): 96 | if self.isopen and sys.meta_path: 97 | # ^^^ check sys.meta_path to avoid 'ImportError: sys.meta_path is None, Python is likely shutting down' 98 | self.window.close() 99 | self.isopen = False 100 | 101 | def window_closed_by_user(self): 102 | self.isopen = False 103 | 104 | def set_bounds(self, left, right, bottom, top): 105 | assert right > left and top > bottom 106 | scalex = self.width / (right - left) 107 | scaley = self.height / (top - bottom) 108 | self.transform = Transform( 109 | translation=(-left * scalex, -bottom * scaley), scale=(scalex, scaley) 110 | ) 111 | 112 | def add_geom(self, geom): 113 | self.geoms.append(geom) 114 | 115 | def add_onetime(self, geom): 116 | self.onetime_geoms.append(geom) 117 | 118 | def render(self, return_rgb_array=False): 119 | glClearColor(1, 1, 1, 1) 120 | self.window.clear() 121 | self.window.switch_to() 122 | self.window.dispatch_events() 123 | self.transform.enable() 124 | for geom in self.geoms: 125 | geom.render() 126 | for geom in self.onetime_geoms: 127 | geom.render() 128 | self.transform.disable() 129 | arr = None 130 | if return_rgb_array: 131 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 132 | image_data = buffer.get_image_data() 133 | arr = np.frombuffer(image_data.get_data(), dtype=np.uint8) 134 | # In https://github.com/openai/gym-http-api/issues/2, we 135 | # discovered that someone using Xmonad on Arch was having 136 | # a window of size 598 x 398, though a 600 x 400 window 137 | # was requested. (Guess Xmonad was preserving a pixel for 138 | # the boundary.) So we use the buffer height/width rather 139 | # than the requested one. 
140 | arr = arr.reshape(buffer.height, buffer.width, 4) 141 | arr = arr[::-1, :, 0:3] 142 | self.window.flip() 143 | self.onetime_geoms = [] 144 | return arr if return_rgb_array else self.isopen 145 | 146 | # Convenience 147 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 148 | geom = make_circle(radius=radius, res=res, filled=filled) 149 | _add_attrs(geom, attrs) 150 | self.add_onetime(geom) 151 | return geom 152 | 153 | def draw_polygon(self, v, filled=True, **attrs): 154 | geom = make_polygon(v=v, filled=filled) 155 | _add_attrs(geom, attrs) 156 | self.add_onetime(geom) 157 | return geom 158 | 159 | def draw_polyline(self, v, **attrs): 160 | geom = make_polyline(v=v) 161 | _add_attrs(geom, attrs) 162 | self.add_onetime(geom) 163 | return geom 164 | 165 | def draw_line(self, start, end, **attrs): 166 | geom = Line(start, end) 167 | _add_attrs(geom, attrs) 168 | self.add_onetime(geom) 169 | return geom 170 | 171 | def get_array(self): 172 | self.window.flip() 173 | image_data = ( 174 | pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 175 | ) 176 | self.window.flip() 177 | arr = np.fromstring(image_data.get_data(), dtype=np.uint8, sep="") 178 | arr = arr.reshape(self.height, self.width, 4) 179 | return arr[::-1, :, 0:3] 180 | 181 | def __del__(self): 182 | self.close() 183 | 184 | 185 | def _add_attrs(geom, attrs): 186 | if "color" in attrs: 187 | geom.set_color(*attrs["color"]) 188 | if "linewidth" in attrs: 189 | geom.set_linewidth(attrs["linewidth"]) 190 | 191 | 192 | class Geom: 193 | def __init__(self): 194 | self._color = Color((0, 0, 0, 1.0)) 195 | self.attrs = [self._color] 196 | 197 | def render(self): 198 | for attr in reversed(self.attrs): 199 | attr.enable() 200 | self.render1() 201 | for attr in self.attrs: 202 | attr.disable() 203 | 204 | def render1(self): 205 | raise NotImplementedError 206 | 207 | def add_attr(self, attr): 208 | self.attrs.append(attr) 209 | 210 | def set_color(self, r, g, b): 211 | self._color.vec4 = (r, g, b, 1) 212 | 213 | 214 | class Attr: 215 | def enable(self): 216 | raise NotImplementedError 217 | 218 | def disable(self): 219 | pass 220 | 221 | 222 | class Transform(Attr): 223 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1, 1)): 224 | self.set_translation(*translation) 225 | self.set_rotation(rotation) 226 | self.set_scale(*scale) 227 | 228 | def enable(self): 229 | glPushMatrix() 230 | glTranslatef( 231 | self.translation[0], self.translation[1], 0 232 | ) # translate to GL loc ppint 233 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 234 | glScalef(self.scale[0], self.scale[1], 1) 235 | 236 | def disable(self): 237 | glPopMatrix() 238 | 239 | def set_translation(self, newx, newy): 240 | self.translation = (float(newx), float(newy)) 241 | 242 | def set_rotation(self, new): 243 | self.rotation = float(new) 244 | 245 | def set_scale(self, newx, newy): 246 | self.scale = (float(newx), float(newy)) 247 | 248 | 249 | class Color(Attr): 250 | def __init__(self, vec4): 251 | self.vec4 = vec4 252 | 253 | def enable(self): 254 | glColor4f(*self.vec4) 255 | 256 | 257 | class LineStyle(Attr): 258 | def __init__(self, style): 259 | self.style = style 260 | 261 | def enable(self): 262 | glEnable(GL_LINE_STIPPLE) 263 | glLineStipple(1, self.style) 264 | 265 | def disable(self): 266 | glDisable(GL_LINE_STIPPLE) 267 | 268 | 269 | class LineWidth(Attr): 270 | def __init__(self, stroke): 271 | self.stroke = stroke 272 | 273 | def enable(self): 274 | glLineWidth(self.stroke) 275 | 276 | 277 | class 
Point(Geom): 278 | def __init__(self): 279 | Geom.__init__(self) 280 | 281 | def render1(self): 282 | glBegin(GL_POINTS) # draw point 283 | glVertex3f(0.0, 0.0, 0.0) 284 | glEnd() 285 | 286 | 287 | class FilledPolygon(Geom): 288 | def __init__(self, v): 289 | Geom.__init__(self) 290 | self.v = v 291 | 292 | def render1(self): 293 | if len(self.v) == 4: 294 | glBegin(GL_QUADS) 295 | elif len(self.v) > 4: 296 | glBegin(GL_POLYGON) 297 | else: 298 | glBegin(GL_TRIANGLES) 299 | for p in self.v: 300 | glVertex3f(p[0], p[1], 0) # draw each vertex 301 | glEnd() 302 | 303 | 304 | def make_circle(radius=10, res=30, filled=True): 305 | points = [] 306 | for i in range(res): 307 | ang = 2 * math.pi * i / res 308 | points.append((math.cos(ang) * radius, math.sin(ang) * radius)) 309 | if filled: 310 | return FilledPolygon(points) 311 | else: 312 | return PolyLine(points, True) 313 | 314 | 315 | def make_polygon(v, filled=True): 316 | if filled: 317 | return FilledPolygon(v) 318 | else: 319 | return PolyLine(v, True) 320 | 321 | 322 | def make_polyline(v): 323 | return PolyLine(v, False) 324 | 325 | 326 | def make_capsule(length, width): 327 | l, r, t, b = 0, length, width / 2, -width / 2 328 | box = make_polygon([(l, b), (l, t), (r, t), (r, b)]) 329 | circ0 = make_circle(width / 2) 330 | circ1 = make_circle(width / 2) 331 | circ1.add_attr(Transform(translation=(length, 0))) 332 | geom = Compound([box, circ0, circ1]) 333 | return geom 334 | 335 | 336 | class Compound(Geom): 337 | def __init__(self, gs): 338 | Geom.__init__(self) 339 | self.gs = gs 340 | for g in self.gs: 341 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 342 | 343 | def render1(self): 344 | for g in self.gs: 345 | g.render() 346 | 347 | 348 | class PolyLine(Geom): 349 | def __init__(self, v, close): 350 | Geom.__init__(self) 351 | self.v = v 352 | self.close = close 353 | self.linewidth = LineWidth(1) 354 | self.add_attr(self.linewidth) 355 | 356 | def render1(self): 357 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 358 | for p in self.v: 359 | glVertex3f(p[0], p[1], 0) # draw each vertex 360 | glEnd() 361 | 362 | def set_linewidth(self, x): 363 | self.linewidth.stroke = x 364 | 365 | 366 | class Line(Geom): 367 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 368 | Geom.__init__(self) 369 | self.start = start 370 | self.end = end 371 | self.linewidth = LineWidth(1) 372 | self.add_attr(self.linewidth) 373 | 374 | def render1(self): 375 | glBegin(GL_LINES) 376 | glVertex2f(*self.start) 377 | glVertex2f(*self.end) 378 | glEnd() 379 | 380 | 381 | class Image(Geom): 382 | def __init__(self, fname, width, height): 383 | Geom.__init__(self) 384 | self.set_color(1.0, 1.0, 1.0) 385 | self.width = width 386 | self.height = height 387 | img = pyglet.image.load(fname) 388 | self.img = img 389 | self.flip = False 390 | 391 | def render1(self): 392 | self.img.blit( 393 | -self.width / 2, -self.height / 2, width=self.width, height=self.height 394 | ) 395 | 396 | 397 | # ================================================================ 398 | 399 | 400 | class SimpleImageViewer: 401 | def __init__(self, display=None, maxwidth=500): 402 | self.window = None 403 | self.isopen = False 404 | self.display = get_display(display) 405 | self.maxwidth = maxwidth 406 | 407 | def imshow(self, arr): 408 | if self.window is None: 409 | height, width, _channels = arr.shape 410 | if width > self.maxwidth: 411 | scale = self.maxwidth / width 412 | width = int(scale * width) 413 | height = int(scale * height) 414 | self.window = 
get_window( 415 | width=width, 416 | height=height, 417 | display=self.display, 418 | vsync=False, 419 | resizable=True, 420 | ) 421 | self.width = width 422 | self.height = height 423 | self.isopen = True 424 | 425 | @self.window.event 426 | def on_resize(width, height): 427 | self.width = width 428 | self.height = height 429 | 430 | @self.window.event 431 | def on_close(): 432 | self.isopen = False 433 | 434 | assert len(arr.shape) == 3, "You passed in an image with the wrong number shape" 435 | image = pyglet.image.ImageData( 436 | arr.shape[1], arr.shape[0], "RGB", arr.tobytes(), pitch=arr.shape[1] * -3 437 | ) 438 | texture = image.get_texture() 439 | gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST) 440 | texture.width = self.width 441 | texture.height = self.height 442 | self.window.clear() 443 | self.window.switch_to() 444 | self.window.dispatch_events() 445 | texture.blit(0, 0) # draw 446 | self.window.flip() 447 | 448 | def close(self): 449 | if self.isopen and sys.meta_path: 450 | # ^^^ check sys.meta_path to avoid 'ImportError: sys.meta_path is None, Python is likely shutting down' 451 | self.window.close() 452 | self.isopen = False 453 | 454 | def __del__(self): 455 | self.close() 456 | -------------------------------------------------------------------------------- /rcbf_sac/cbf_qp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rcbf_sac.dynamics import DYNAMICS_MODE 3 | from quadprog import solve_qp 4 | 5 | class CascadeCBFLayer: 6 | 7 | def __init__(self, env, gamma_b=100, k_d=1.5, l_p=0.03): 8 | """Constructor of CBFLayer. 9 | 10 | Parameters 11 | ---------- 12 | env : gym.env 13 | Gym environment. 14 | gamma_b : float, optional 15 | gamma of control barrier certificate. 16 | k_d : float, optional 17 | confidence parameter desired (2.0 corresponds to ~95% for example). 18 | """ 19 | 20 | self.env = env 21 | self.u_min, self.u_max = self.get_control_bounds() 22 | self.gamma_b = gamma_b 23 | self.k_d = k_d 24 | self.l_p = l_p 25 | 26 | if self.env.dynamics_mode not in DYNAMICS_MODE: 27 | raise Exception('Dynamics mode not supported.') 28 | 29 | def get_u_safe(self, u_nom, s, mean_pred, sigma): 30 | """Given the current state of the system, this function computes the control input necessary to render the nominal 31 | control `u_nom` safe (i.e. u_safe + u_nom is safe). 32 | 33 | Parameters 34 | ---------- 35 | u_nom : ndarray 36 | Nominal control input. 37 | s : ndarray 38 | current state. 39 | mean_pred : ndarray 40 | mean prediction. 41 | sigma : ndarray 42 | standard deviation in additive disturbance. 43 | 44 | Returns 45 | ------- 46 | u_safe : ndarray 47 | Safe control input to be added to `u_nom` as such `env.step(u_nom + u_safe)` 48 | """ 49 | 50 | P, q, G, h = self.get_cbf_qp_constraints(u_nom, s, mean_pred, sigma) 51 | u_safe = self.solve_qp(P, q, G, h) 52 | 53 | return u_safe 54 | 55 | def get_cbf_qp_constraints(self, u_nom, state, mean_pred, sigma_pred): 56 | """Build up matrices required to solve qp 57 | Program specifically solves: 58 | minimize_{u,eps} 0.5 * u^T P u + q^T u 59 | subject to G[u,eps]^T <= h 60 | 61 | Each control barrier certificate is of the form: 62 | dh/dx^T (f_out + g_out u) >= -gamma^b h_out^3 where out here is an output of the state. 63 | 64 | In the case of SafetyGym_point dynamics: 65 | state = [x y θ v ω] 66 | state_d = [v*cos(θ) v*sin(θ) omega ω u^v u^ω] 67 | 68 | Parameters 69 | ---------- 70 | u_nom : ndarray 71 | Nominal control input. 
72 | state : ndarray 73 | current state (check dynamics.py for details on each dynamics' specifics) 74 | mean_pred : ndarray 75 | mean disturbance prediction state, dimensions (n_s, n_u) 76 | sigma_pred : ndarray 77 | standard deviation in additive disturbance after undergoing the output dynamics. 78 | 79 | Returns 80 | ------- 81 | P : ndarray 82 | Quadratic cost matrix in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) 83 | q : ndarray 84 | Linear cost vector in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) 85 | G : ndarray 86 | Inequality constraint matrix (G[u,eps] <= h) of size (num_constraints, n_u + 1) 87 | h : ndarray 88 | Inequality constraint vector (G[u,eps] <= h) of size (num_constraints,) 89 | """ 90 | 91 | if self.env.dynamics_mode == 'Unicycle': 92 | 93 | collision_radius = 1.2 * self.env.hazards_radius # add a little buffer 94 | 95 | # p(x): lookahead output 96 | p_x = np.array([state[0] + self.l_p * np.cos(state[2]), state[1] + self.l_p * np.sin(state[2])]) 97 | 98 | # p_dot = f_p + g_p u 99 | f_p = np.zeros(2) 100 | theta = state[2] 101 | c_theta = np.cos(theta) 102 | s_theta = np.sin(theta) 103 | R = np.array([[c_theta, -s_theta], 104 | [s_theta, c_theta]]) 105 | L = np.array([[1, 0], 106 | [0, self.l_p]]) 107 | g_p = R @ L 108 | 109 | # hs 110 | hs = 0.5 * (np.sum((p_x - self.env.hazards_locations) ** 2, 111 | axis=1) - collision_radius ** 2) # 1/2 * (||x - x_obs||^2 - r^2) 112 | 113 | dhdxs = (p_x - self.env.hazards_locations) # each row is dhdx_i for hazard i 114 | 115 | # since we're dealing with p(x), we need to get the disturbance on p_x, p_y 116 | # Mean 117 | mean_p = mean_pred[:2] + self.l_p * np.array([-np.sin(state[2]), np.cos(state[2])]) * mean_pred[2] 118 | # Sigma (output is 2-dims (p_x, p_y)) 119 | sigma_p = sigma_pred[:2] + self.l_p * np.array([-np.sin(state[2]), np.cos(state[2])]) * sigma_pred[2] 120 | 121 | n_u = u_nom.shape[0] # dimension of control inputs 122 | num_constraints = hs.shape[0] + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them) 123 | 124 | # Inequality constraints (G[u, eps] <= h) 125 | G = np.zeros((num_constraints, n_u + 1)) # the plus 1 is for epsilon (to make sure qp is always feasible) 126 | h = np.zeros(num_constraints) 127 | ineq_constraint_counter = 0 128 | 129 | # First let's add the cbf constraints 130 | for c in range(hs.shape[0]): 131 | # extract current affine cbf (h = h1^T x + h0) 132 | h_ = hs[c] 133 | dhdx_ = dhdxs[c] 134 | 135 | # Add inequality constraints 136 | G[ineq_constraint_counter, :n_u] = -dhdx_ @ g_p # h1^Tg(x) 137 | G[ineq_constraint_counter, n_u] = -1 # for slack 138 | 139 | h[ineq_constraint_counter] = self.gamma_b * (h_ ** 3) + np.dot(dhdx_, (f_p+mean_p)) + np.dot(dhdx_ @ g_p, 140 | u_nom) \ 141 | - self.k_d * np.dot(np.abs(dhdx_), sigma_p) 142 | 143 | ineq_constraint_counter += 1 144 | 145 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack 146 | P = np.diag([1.e1, 1.e-4, 1e7]) # in the original code, they use 1e24 instead of 1e7, but quadprog can't handle that... 
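# Reading of the QP assembled above (Unicycle case): the decision variable is z = [u_v, u_ω, eps],
# so P = diag([1e1, 1e-4, 1e7]) puts almost all of its weight on the slack eps. Each CBF row built in
# the loop, -dhdx^T g_p u - eps <= gamma_b h^3 + dhdx^T (f_p + mean_p) + dhdx^T g_p u_nom - k_d |dhdx|^T sigma_p,
# is the robustified CBF condition dhdx^T (f_p + mean_p + g_p (u_nom + u)) >= -gamma_b h^3 + k_d |dhdx|^T sigma_p - eps
# rearranged into the G z <= h form expected by the solver.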
147 | q = np.zeros(n_u + 1) 148 | 149 | elif self.env.dynamics_mode == 'SimulatedCars': 150 | 151 | n_u = u_nom.shape[0] # dimension of control inputs 152 | num_constraints = 4 # each cbf is a constraint, and we need to add actuator constraints (n_u of them) 153 | collision_radius = 3.5 154 | 155 | # Inequality constraints (G[u, eps] <= h) 156 | G = np.zeros((num_constraints, n_u + 1)) 157 | h = np.zeros((num_constraints)) 158 | ineq_constraint_counter = 0 159 | 160 | # Current State 161 | pos = state[::2] 162 | vels = state[1::2] 163 | 164 | # Action (acceleration) 165 | vels_des = 30.0 * np.ones(5) # Desired velocities 166 | # vels_des[0] -= 10 * np.sin(0.2 * t_batch) 167 | accels = self.env.kp * (vels_des - vels) 168 | accels[1] -= self.env.k_brake * (pos[0] - pos[1]) * ((pos[0] - pos[1]) < 6.0) 169 | accels[2] -= self.env.k_brake * (pos[1] - pos[2]) * ((pos[1] - pos[2]) < 6.0) 170 | accels[3] = 0.0 # Car 4's acceleration is controlled directly 171 | accels[4] -= self.env.k_brake * (pos[2] - pos[4]) * ((pos[2] - pos[4]) < 13.0) 172 | 173 | # f(x) 174 | f_x = np.zeros(state.shape[0]) 175 | f_x[::2] = vels 176 | f_x[1::2] = accels 177 | 178 | # g(x) 179 | g_x = np.zeros(state.shape[0]) 180 | g_x[7] = 50.0 # Car 4's acceleration 181 | 182 | # h1 183 | h13 = 0.5 * (((pos[2] - pos[3]) ** 2) - collision_radius ** 2) 184 | h15 = 0.5 * (((pos[4] - pos[3]) ** 2) - collision_radius ** 2) 185 | 186 | # dh1/dt = Lfh1 187 | h13_dot = (pos[3] - pos[2]) * (vels[3] - vels[2]) 188 | h15_dot = (pos[3] - pos[4]) * (vels[3] - vels[4]) 189 | 190 | # Lffh1 191 | dLfh13dx = np.zeros(10) 192 | dLfh13dx[4] = (vels[2] - vels[3]) # dLfh13/d(pos_car_3) 193 | dLfh13dx[5] = (pos[2] - pos[3]) # dLfh13/d(vel_car_3) 194 | dLfh13dx[6] = (vels[3] - vels[2]) 195 | dLfh13dx[7] = (pos[3] - pos[2]) # dLfh13/d(vel_car_4) 196 | Lffh13 = np.dot(dLfh13dx, f_x) 197 | 198 | dLfh15dx = np.zeros(10) 199 | dLfh15dx[8] = (vels[4] - vels[3]) # Car 5 pos 200 | dLfh15dx[9] = (pos[4] - pos[3]) # Car 5 vels 201 | dLfh15dx[6] = (vels[3] - vels[4]) 202 | dLfh15dx[7] = (pos[3] - pos[4]) 203 | Lffh15 = np.dot(dLfh15dx, f_x) 204 | 205 | # Lfgh1 206 | Lgfh13 = np.dot(dLfh13dx, g_x) 207 | Lgfh15 = np.dot(dLfh15dx, g_x) 208 | 209 | # Inequality constraints (G[u, eps] <= h) 210 | h[0] = Lffh13 + (self.gamma_b + self.gamma_b) * h13_dot + self.gamma_b * self.gamma_b * h13 + Lgfh13 * u_nom 211 | h[1] = Lffh15 + (self.gamma_b + self.gamma_b) * h15_dot + self.gamma_b * self.gamma_b * h15 + Lgfh15 * u_nom 212 | G[0, 0] = -Lgfh13 213 | G[1, 0] = -Lgfh15 214 | G[:2, n_u] = -2e2 # for slack 215 | ineq_constraint_counter += 2 216 | 217 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack 218 | P = np.diag([0.1, 1e1]) 219 | q = np.zeros(n_u + 1) 220 | 221 | else: 222 | raise Exception('Dynamics mode unknown!') 223 | 224 | # Second let's add actuator constraints 225 | n_u = u_nom.shape[0] # dimension of control inputs 226 | for c in range(n_u): 227 | 228 | # u_max >= u_nom + u ---> u <= u_max - u_nom 229 | if self.u_max is not None: 230 | G[ineq_constraint_counter, c] = 1 231 | h[ineq_constraint_counter] = self.u_max[c] - u_nom[c] 232 | ineq_constraint_counter += 1 233 | 234 | # u_min <= u_nom + u ---> -u <= u_min - u_nom 235 | if self.u_min is not None: 236 | G[ineq_constraint_counter, c] = -1 237 | h[ineq_constraint_counter] = -self.u_min[c] + u_nom[c] 238 | ineq_constraint_counter += 1 239 | 240 | return P, q, G, h 241 | 242 | def solve_qp(self, P, q, G, h): 243 | """Solves: 244 | minimize_{u,eps} 0.5 * u^T P u + q^T 
u 245 | subject to G[u,eps]^T <= h 246 | 247 | Parameters 248 | ---------- 249 | P : ndarray 250 | Quadratic cost matrix 251 | q : ndarray 252 | Linear cost vector 253 | G : ndarray 254 | Inequality Constraint Matrix 255 | h : ndarray 256 | Inequality Constraint Vector 257 | 258 | Returns 259 | ------- 260 | u_safe : ndarray 261 | The solution of the qp without the last dimension (the slack). 262 | """ 263 | 264 | # print('P =\t {}'.format(P)) 265 | # print('q =\t {}'.format(q)) 266 | # print('G =\t {}'.format(G)) 267 | # print('h =\t {}'.format(h)) 268 | 269 | # Here we normalize G and h to stay consistent with what we do in CVXPYLAYER which often crashes with big #s 270 | Gh = np.concatenate((G, np.expand_dims(h, 1)), 1) 271 | Gh_norm = np.expand_dims(np.max(np.abs(Gh), axis=1), axis=1) 272 | G /= Gh_norm 273 | h = h / Gh_norm.squeeze(-1) 274 | 275 | try: 276 | sol = solve_qp(P, q, -G.T, -h) 277 | u_safe = sol[0][:-1] 278 | print('quadprog = {} eps = {}'.format(u_safe, sol[0][-1])) 279 | except ValueError as e: 280 | print('P = {},\nq = {},\nG = {},\nh = {}.'.format(P, q, G, h)) 281 | raise e 282 | 283 | if np.abs(sol[0][-1]) > 1e-1: 284 | print('CBF indicates constraint violation might occur. epsilon = {}'.format(sol[0][-1])) 285 | 286 | return u_safe 287 | 288 | def get_cbfs(self, hazards_locations, hazards_radius): 289 | """Returns CBF function h(x) and its derivative dh/dx(x) for each hazard. Note that the CBF is defined with 290 | with respect to an output of the state. 291 | 292 | Parameters 293 | ---------- 294 | hazards_locations : list 295 | List of hazard-xy positions where each item is a list of length 2 296 | hazards_radius : float 297 | Radius of the hazards 298 | 299 | Returns 300 | ------- 301 | get_h : 302 | List of cbfs each corresponding to a constraint. Each cbf is affine (h(s) = h1^Ts + h0), as such each cbf is represented 303 | as a tuple of two entries: h_cbf = (h1, h0) where h1 and h0 are ndarrays. 304 | get_dhdx : 305 | """ 306 | 307 | hazards_locations = np.array(hazards_locations) 308 | collision_radius = hazards_radius + 0.07 # add a little buffer 309 | 310 | def get_h(state): 311 | # p(x): lookahead output 312 | if self.env.dynamics_mode in ('SafetyGym_point', 'Unicycle'): 313 | state = np.array([state[0] + self.l_p * np.cos(state[2]), state[1] + self.l_p * np.sin(state[2])]) 314 | return 0.5 * (np.sum((state - hazards_locations)**2, axis=1) - collision_radius**2) # 1/2 * (||x - x_obs||^2 - r^2) 315 | 316 | def get_dhdx(state): 317 | # p(x): lookahead output 318 | if self.env.dynamics_mode in ('SafetyGym_point', 'Unicycle'): 319 | state = np.array([state[0] + self.l_p * np.cos(state[2]), state[1] + self.l_p * np.sin(state[2])]) 320 | dhdx = (state - hazards_locations) # each row is dhdx_i for hazard i 321 | return dhdx 322 | 323 | return get_h, get_dhdx 324 | 325 | def get_control_bounds(self): 326 | """ 327 | 328 | Returns 329 | ------- 330 | u_min : ndarray 331 | min control input. 332 | u_max : ndarray 333 | max control input. 334 | """ 335 | 336 | u_min = self.env.safe_action_space.low 337 | u_max = self.env.safe_action_space.high 338 | 339 | return u_min, u_max 340 | 341 | def get_min_h_val(self, state): 342 | """ 343 | 344 | Parameters 345 | ---------- 346 | state : ndarray 347 | Current State 348 | 349 | Returns 350 | ------- 351 | min_h_val : float 352 | Minimum h(x) over all hs. If below 0, then at least 1 constraint is violated. 
353 | 354 | """ 355 | 356 | get_h, _ = self.get_cbfs(self.env.hazards_locations, self.env.hazards_radius) 357 | min_h_val = np.min(get_h(state)) 358 | return min_h_val 359 | 360 | -------------------------------------------------------------------------------- /envs/unicycle_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | from envs.utils import to_pixel 6 | 7 | 8 | class UnicycleEnv(gym.Env): 9 | """Custom Environment that follows SafetyGym interface""" 10 | 11 | metadata = {'render.modes': ['human']} 12 | 13 | def __init__(self): 14 | 15 | super(UnicycleEnv, self).__init__() 16 | 17 | self.dynamics_mode = 'Unicycle' 18 | # Define action and observation space 19 | # They must be gym.spaces objects 20 | # Example when using discrete actions: 21 | self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,)) 22 | self.safe_action_space = spaces.Box(low=-2.5, high=2.5, shape=(2,)) 23 | self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,)) 24 | self.bds = np.array([[-3., -3.], [3., 3.]]) 25 | self.hazards_radius = 0.6 26 | self.hazards_locations = np.array([[0., 0.], [-1., 1.], [-1., -1.], [1., -1.], [1., 1.]]) * 1.5 27 | self.dt = 0.02 28 | self.max_episode_steps = 1000 29 | self.reward_goal = 1.0 30 | self.goal_size = 0.3 31 | # Initialize Env 32 | self.state = None 33 | self.episode_step = 0 34 | self.goal_pos = np.array([2.5, 2.5]) 35 | 36 | self.reset() 37 | # Get Dynamics 38 | self.get_f, self.get_g = self._get_dynamics() 39 | # Disturbance 40 | self.disturb_mean = np.zeros((3,)) 41 | self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 42 | 43 | # Viewer 44 | self.viewer = None 45 | 46 | def step(self, action): 47 | """Organize the observation to understand what's going on 48 | 49 | Parameters 50 | ---------- 51 | action : ndarray 52 | Action that the agent takes in the environment 53 | 54 | Returns 55 | ------- 56 | new_obs : ndarray 57 | The new observation with the following structure: 58 | [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal] 59 | 60 | """ 61 | 62 | action = np.clip(action, -1.0, 1.0) 63 | state, reward, done, info = self._step(action) 64 | return self.get_obs(), reward, done, info 65 | 66 | def _step(self, action): 67 | """ 68 | 69 | Parameters 70 | ---------- 71 | action 72 | 73 | Returns 74 | ------- 75 | state : ndarray 76 | New internal state of the agent. 77 | reward : float 78 | Reward collected during this transition. 79 | done : bool 80 | Whether the episode terminated. 81 | info : dict 82 | Additional info relevant to the environment. 
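Keys set here: 'goal_met' (True once the goal circle is reached, which also ends the episode) and 'cost' (0.1 for any step spent inside a hazard).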
83 | """ 84 | 85 | # Start with our prior for continuous time system x' = f(x) + g(x)u 86 | self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) 87 | self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) #* np.random.multivariate_normal(self.disturb_mean, self.disturb_covar, 1).squeeze() 88 | 89 | self.episode_step += 1 90 | 91 | info = dict() 92 | 93 | dist_goal = self._goal_dist() 94 | reward = (self.last_goal_dist - dist_goal) 95 | self.last_goal_dist = dist_goal 96 | # Check if goal is met 97 | if self.goal_met(): 98 | info['goal_met'] = True 99 | reward += self.reward_goal 100 | done = True 101 | else: 102 | done = self.episode_step >= self.max_episode_steps 103 | 104 | 105 | # Include constraint cost in reward 106 | if np.any(np.sum((self.state[:2] - self.hazards_locations)**2, axis=1) < self.hazards_radius**2): 107 | if 'cost' in info: 108 | info['cost'] += 0.1 109 | else: 110 | info['cost'] = 0.1 111 | return self.state, reward, done, info 112 | 113 | def goal_met(self): 114 | """Return true if the current goal is met this step 115 | 116 | Returns 117 | ------- 118 | goal_met : bool 119 | True if the goal condition is met. 120 | 121 | """ 122 | 123 | return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size 124 | 125 | def reset(self): 126 | """ Reset the state of the environment to an initial state. 127 | 128 | Returns 129 | ------- 130 | observation : ndarray 131 | Next observation. 132 | """ 133 | 134 | self.episode_step = 0 135 | 136 | # Re-initialize state 137 | self.state = np.array([-2.5, -2.5, 0.]) 138 | 139 | # Re-initialize last goal dist 140 | self.last_goal_dist = self._goal_dist() 141 | 142 | # TODO: Randomize this 143 | return self.get_obs() 144 | 145 | def render(self, mode='human', close=False): 146 | """Render the environment to the screen 147 | 148 | Parameters 149 | ---------- 150 | mode : str 151 | close : bool 152 | 153 | Returns 154 | ------- 155 | 156 | """ 157 | 158 | if mode != 'human' and mode != 'rgb_array': 159 | rel_loc = self.goal_pos - self.state[:2] 160 | theta_error = np.arctan2(rel_loc[1], rel_loc[0]) - self.state[2] 161 | print('Ep_step = {}, \tState = {}, \tDist2Goal = {}, alignment_error = {}'.format(self.episode_step, 162 | self.state, 163 | self._goal_dist(), 164 | theta_error)) 165 | 166 | screen_width = 600 167 | screen_height = 400 168 | 169 | if self.viewer is None: 170 | from envs import pyglet_rendering 171 | self.viewer = pyglet_rendering.Viewer(screen_width, screen_height) 172 | # Draw obstacles 173 | obstacles = [] 174 | for i in range(len(self.hazards_locations)): 175 | obstacles.append( 176 | pyglet_rendering.make_circle(radius=to_pixel(self.hazards_radius, shift=0), filled=True)) 177 | obs_trans = pyglet_rendering.Transform(translation=( 178 | to_pixel(self.hazards_locations[i][0], shift=screen_width / 2), 179 | to_pixel(self.hazards_locations[i][1], shift=screen_height / 2))) 180 | obstacles[i].set_color(1.0, 0.0, 0.0) 181 | obstacles[i].add_attr(obs_trans) 182 | self.viewer.add_geom(obstacles[i]) 183 | 184 | # Make Goal 185 | goal = pyglet_rendering.make_circle(radius=to_pixel(0.1, shift=0), filled=True) 186 | goal_trans = pyglet_rendering.Transform(translation=( 187 | to_pixel(self.goal_pos[0], shift=screen_width / 2), to_pixel(self.goal_pos[1], shift=screen_height / 2))) 188 | goal.add_attr(goal_trans) 189 | goal.set_color(0.0, 0.5, 0.0) 190 | self.viewer.add_geom(goal) 191 | 192 | # Make Robot 193 | self.robot = 
pyglet_rendering.make_circle(radius=to_pixel(0.1), filled=True) 194 | self.robot_trans = pyglet_rendering.Transform(translation=( 195 | to_pixel(self.state[0], shift=screen_width / 2), to_pixel(self.state[1], shift=screen_height / 2))) 196 | self.robot_trans.set_rotation(self.state[2]) 197 | self.robot.add_attr(self.robot_trans) 198 | self.robot.set_color(0.5, 0.5, 0.8) 199 | self.viewer.add_geom(self.robot) 200 | self.robot_orientation = pyglet_rendering.Line(start=(0.0, 0.0), end=(15.0, 0.0)) 201 | self.robot_orientation.linewidth.stroke = 2 202 | self.robot_orientation.add_attr(self.robot_trans) 203 | self.robot_orientation.set_color(0, 0, 0) 204 | self.viewer.add_geom(self.robot_orientation) 205 | 206 | if self.state is None: 207 | return None 208 | 209 | self.robot_trans.set_translation(to_pixel(self.state[0], shift=screen_width / 2), 210 | to_pixel(self.state[1], shift=screen_height / 2)) 211 | self.robot_trans.set_rotation(self.state[2]) 212 | 213 | return self.viewer.render(return_rgb_array=mode == "rgb_array") 214 | 215 | def get_obs(self): 216 | """Given the state, this function returns it to an observation akin to the one obtained by calling env.step 217 | 218 | Parameters 219 | ---------- 220 | 221 | Returns 222 | ------- 223 | observation : ndarray 224 | Observation: [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, exp(-dist2goal)] 225 | """ 226 | 227 | rel_loc = self.goal_pos - self.state[:2] 228 | goal_dist = np.linalg.norm(rel_loc) 229 | goal_compass = self.obs_compass() # compass to the goal 230 | 231 | return np.array([self.state[0], self.state[1], np.cos(self.state[2]), np.sin(self.state[2]), goal_compass[0], goal_compass[1], np.exp(-goal_dist)]) 232 | 233 | def _get_dynamics(self): 234 | """Get affine CBFs for a given environment. 235 | 236 | Parameters 237 | ---------- 238 | 239 | Returns 240 | ------- 241 | get_f : callable 242 | Drift dynamics of the continuous system x' = f(x) + g(x)u 243 | get_g : callable 244 | Control dynamics of the continuous system x' = f(x) + g(x)u 245 | """ 246 | 247 | def get_f(state): 248 | f_x = np.zeros(state.shape) 249 | return f_x 250 | 251 | def get_g(state): 252 | theta = state[2] 253 | g_x = np.array([[np.cos(theta), 0], 254 | [np.sin(theta), 0], 255 | [ 0, 1.0]]) 256 | return g_x 257 | 258 | return get_f, get_g 259 | 260 | def obs_compass(self): 261 | """ 262 | Return a robot-centric compass observation of a list of positions. 263 | Compass is a normalized (unit-lenght) egocentric XY vector, 264 | from the agent to the object. 265 | This is equivalent to observing the egocentric XY angle to the target, 266 | projected into the sin/cos space we use for joints. 267 | (See comment on joint observation for why we do this.) 
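Concretely, the returned vector is R(θ)^T (goal_pos - p) / (||goal_pos - p|| + 0.001), where p is the robot's xy position and R(θ) the planar rotation by its heading.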
268 | """ 269 | 270 | # Get ego vector in world frame 271 | vec = self.goal_pos - self.state[:2] 272 | # Rotate into frame 273 | R = np.array([[np.cos(self.state[2]), -np.sin(self.state[2])], [np.sin(self.state[2]), np.cos(self.state[2])]]) 274 | vec = np.matmul(vec, R) 275 | # Normalize 276 | vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 277 | return vec 278 | 279 | def _goal_dist(self): 280 | return np.linalg.norm(self.goal_pos - self.state[:2]) 281 | 282 | def get_random_hazard_locations(n_hazards, hazard_radius, bds=None): 283 | """ 284 | 285 | Parameters 286 | ---------- 287 | n_hazards : int 288 | Number of hazards to create 289 | hazard_radius : float 290 | Radius of hazards 291 | bds : list, optional 292 | List of the form [[x_lb, x_ub], [y_lb, y_ub] denoting the bounds of the 2D arena 293 | 294 | Returns 295 | ------- 296 | hazards_locs : ndarray 297 | Numpy array of shape (n_hazards, 2) containing xy locations of hazards. 298 | """ 299 | 300 | if bds is None: 301 | bds = np.array([[-3., -3.], [3., 3.]]) 302 | 303 | # Create buffer with boundaries 304 | buffered_bds = bds 305 | buffered_bds[0] += hazard_radius 306 | buffered_bds[1] -= hazard_radius 307 | 308 | hazards_locs = np.zeros((n_hazards, 2)) 309 | 310 | for i in range(n_hazards): 311 | successfully_placed = False 312 | iter = 0 313 | while not successfully_placed and iter < 500: 314 | hazards_locs[i] = (bds[1] - bds[0]) * np.random.random(2) + bds[0] 315 | successfully_placed = np.all(np.linalg.norm(hazards_locs[:i] - hazards_locs[i], axis=1) > 3*hazard_radius) 316 | iter += 1 317 | 318 | if iter >= 500: 319 | raise Exception('Could not place hazards in arena.') 320 | 321 | return hazards_locs 322 | 323 | if __name__ == "__main__": 324 | 325 | import matplotlib.pyplot as plt 326 | from rcbf_sac.cbf_qp import CascadeCBFLayer 327 | from rcbf_sac.dynamics import DynamicsModel 328 | import argparse 329 | 330 | parser = argparse.ArgumentParser() 331 | parser.add_argument('--env-name', default="SafetyGym", help='Either SafetyGym or Unicycle.') 332 | parser.add_argument('--gp_model_size', default=2000, type=int, help='gp') 333 | parser.add_argument('--k_d', default=3.0, type=float) 334 | parser.add_argument('--gamma_b', default=40, type=float) 335 | parser.add_argument('--cuda', action="store_true", help='run on CUDA (default: False)') 336 | args = parser.parse_args() 337 | 338 | env = UnicycleEnv() 339 | dynamics_model = DynamicsModel(env, args) 340 | cbf_wrapper = CascadeCBFLayer(env, gamma_b=args.gamma_b, k_d=args.k_d) 341 | 342 | 343 | def simple_controller(env, state, goal): 344 | goal_xy = goal[:2] 345 | goal_dist = -np.log(goal[2]) # the observation is np.exp(-goal_dist) 346 | v = 4.0 * goal_dist 347 | relative_theta = 1.0 * np.arctan2(goal_xy[1], goal_xy[0]) 348 | omega = 5.0 * relative_theta 349 | return np.clip(np.array([v, omega]), env.action_space.low, env.action_space.high) 350 | 351 | obs = env.reset() 352 | done = False 353 | episode_reward = 0 354 | episode_step = 0 355 | 356 | # Plot initial state 357 | fig, ax = plt.subplots() # note we must use plt.subplots, not plt.subplot 358 | for i in range(len(env.hazards_locations)): 359 | ax.add_patch(plt.Circle(env.hazards_locations[i], env.hazards_radius, color='r')) 360 | ax.add_patch(plt.Circle(env.goal_pos, env.goal_size, color='g')) 361 | p_pos = ax.scatter(obs[0], obs[1], s=300) 362 | p_theta = plt.quiver(obs[0], obs[1], obs[0] + .2 * obs[2], .2 * obs[3]) 363 | plt.xlim([-3.0, 3.0]) 364 | plt.ylim([-3.0, 3.0]) 365 | ax.set_aspect('equal', 'box') 366 | 367 | while 
not done: 368 | # Plot current state 369 | p_pos.set_offsets([obs[0], obs[1]]) 370 | p_theta.XY[:, 0] = obs[0] 371 | p_theta.XY[:, 1] = obs[1] 372 | p_theta.set_UVC(.2 * obs[2], .2 * obs[3]) 373 | # Take Action and get next state 374 | # random_action = env.action_space.sample() 375 | state = dynamics_model.get_state(obs) 376 | random_action = simple_controller(env, state, obs[-3:]) 377 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state) 378 | action_safe = cbf_wrapper.get_u_safe(random_action, state, disturb_mean, disturb_std) 379 | obs, reward, done, info = env.step(random_action + action_safe) 380 | plt.pause(0.01) 381 | episode_reward += reward 382 | episode_step += 1 383 | print('step {} \tepisode_reward = {}'.format(episode_step, episode_reward)) 384 | plt.show() 385 | 386 | -------------------------------------------------------------------------------- /rcbf_sac/dynamics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from rcbf_sac.gp_model import GPyDisturbanceEstimator 4 | from rcbf_sac.utils import to_tensor, to_numpy 5 | 6 | """ 7 | This file contains the DynamicsModel class. Depending on the environment selected, it contains the dynamics 8 | priors and learns the remaining disturbance term using GPs. The system is described as follows: 9 | x_dot ∈ f(x) + g(x)u + D(x), 10 | where x is the state, f and g are the drift and control dynamics respectively, and D(x) is the learned disturbance set. 11 | 12 | A few things to note: 13 | - The prior depends on the dynamics. This is hard-coded as of now. In the future, one direction to take would be to 14 | get a few episodes in the env to determine the affine prior first using least-squares or something similar. 15 | - The state is not the same as the observation and typically requires some pre-processing. These functions are supplied 16 | here which are also, unfortunately, hard-coded as of now. 17 | - The functions here are sometimes used in batch-form or single point form, and sometimes the inputs are torch tensors 18 | and other times are numpy arrays. These are things to be mindful of, and output format is always same as input format. 19 | """ 20 | 21 | 22 | DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}, # state = [x y θ] 23 | 'SimulatedCars': {'n_s': 10, 'n_u': 1}} # state = [x y θ v ω] 24 | MAX_STD = {'Unicycle': [2e-1, 2e-1, 2e-1], 'SimulatedCars': [0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2]} 25 | 26 | 27 | class DynamicsModel: 28 | 29 | def __init__(self, env, args): 30 | """Constructor of DynamicsModel. 31 | 32 | Parameters 33 | ---------- 34 | env : gym.env 35 | Gym environment. 
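args : argparse.Namespace
    Parsed run arguments; this class uses args.gp_model_size (number of transitions kept for GP fitting), args.l_p (look-ahead distance, when present) and args.cuda (device selection).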
36 | """ 37 | 38 | self.env = env 39 | # Get Dynamics 40 | self.get_f, self.get_g = self.get_dynamics() 41 | self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] 42 | self.n_u = DYNAMICS_MODE[self.env.dynamics_mode]['n_u'] 43 | 44 | # Keep Disturbance History to estimate it using GPs 45 | self.disturb_estimators = None 46 | self.disturbance_history = dict() 47 | self.history_counter = 0 # keeping only 1000 points in the buffer 48 | self.max_history_count = args.gp_model_size # How many points we want to have in the GP 49 | self.disturbance_history['state'] = np.zeros((self.max_history_count, self.n_s)) 50 | self.disturbance_history['disturbance'] = np.zeros((self.max_history_count, self.n_s)) 51 | self.train_x = None # x-data used to fit the last GP models 52 | self.train_y = None # y-data used to fit the last GP models 53 | 54 | # Point Robot specific dynamics (approx using unicycle + look-ahead) 55 | if hasattr(args, 'l_p'): 56 | self.l_p = args.l_p 57 | 58 | self.device = torch.device("cuda" if args.cuda else "cpu") 59 | 60 | def predict_next_state(self, state_batch, u_batch, t_batch=None, use_gps=True): 61 | """Given the current state and action, this function predicts the next state. 62 | 63 | Parameters 64 | ---------- 65 | state_batch : ndarray 66 | State 67 | u_batch : ndarray 68 | Action 69 | t_batch: ndarray, optional 70 | Time batch for state dependant dynamics 71 | use_gps : bool, optional 72 | Use GPs to return mean and var 73 | 74 | Returns 75 | ------- 76 | next_state : ndarray 77 | Next state 78 | """ 79 | 80 | expand_dims = len(state_batch.shape) == 1 81 | if expand_dims: 82 | state_batch = np.expand_dims(state_batch, axis=0) 83 | 84 | # Start with our prior for continuous time system x' = f(x) + g(x)u 85 | if t_batch is not None: 86 | next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1)) 87 | else: 88 | next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch) + (self.get_g(state_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1)) 89 | 90 | if use_gps: # if we want estimate the disturbance, let's do it! 91 | pred_mean, pred_std = self.predict_disturbance(state_batch) 92 | next_state_batch += self.env.dt * pred_mean 93 | else: 94 | pred_std = np.zeros(state_batch.shape) 95 | 96 | if expand_dims: 97 | next_state_batch = next_state_batch.squeeze(0) 98 | if pred_std is not None: 99 | pred_std = pred_std.squeeze(0) 100 | 101 | if t_batch is not None: 102 | next_t_batch = t_batch + self.env.dt 103 | return next_state_batch, self.env.dt * pred_std, next_t_batch 104 | 105 | return next_state_batch, self.env.dt * pred_std, t_batch 106 | 107 | def predict_next_obs(self, state, u): 108 | """Predicts the next observation given the state and u. Note that this only predicts the mean next observation. 109 | 110 | Parameters 111 | ---------- 112 | state : ndarray 113 | u : ndarray 114 | 115 | Returns 116 | ------- 117 | next_obs : ndarray 118 | Next observation 119 | """ 120 | 121 | next_state, _, _ = self.predict_next_state(state, u) 122 | next_obs = self.get_obs(next_state) 123 | return next_obs 124 | 125 | def get_dynamics(self): 126 | """Get affine CBFs for a given environment. 
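More precisely, this returns the nominal affine prior f(x) and g(x) of the model x' = f(x) + g(x)u + D(x): for 'Unicycle' the state is [x, y, θ], while for 'SimulatedCars' it stacks the position and velocity of the five cars, [p_1, v_1, ..., p_5, v_5].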
127 | 128 | Parameters 129 | ---------- 130 | 131 | Returns 132 | ------- 133 | get_f : callable 134 | Drift dynamics of the continuous system x' = f(x) + g(x)u 135 | get_g : callable 136 | Control dynamics of the continuous system x' = f(x) + g(x)u 137 | """ 138 | 139 | if self.env.dynamics_mode == 'Unicycle': 140 | 141 | def get_f(state_batch, t_batch=None): 142 | f_x = np.zeros(state_batch.shape) 143 | return f_x 144 | 145 | def get_g(state_batch, t_batch=None): 146 | theta = state_batch[:, 2] 147 | g_x = np.zeros((state_batch.shape[0], 3, 2)) 148 | g_x[:, 0, 0] = np.cos(theta) 149 | g_x[:, 1, 0] = np.sin(theta) 150 | g_x[:, 2, 1] = 1.0 151 | return g_x 152 | 153 | elif self.env.dynamics_mode == 'SimulatedCars': 154 | 155 | kp = 4.0 156 | k_brake = 20.0 157 | 158 | def get_g(state_batch, t_batch): 159 | 160 | g_x = np.zeros((state_batch.shape[0], 10, 1)) 161 | g_x[:, 7, 0] = 50.0 # Car 4's acceleration 162 | return g_x 163 | 164 | def get_f(state_batch, t_batch): 165 | 166 | # Current State 167 | pos = state_batch[:, ::2] 168 | vels = state_batch[:, 1::2] 169 | 170 | # Action (acceleration) 171 | vels_des = 30.0 * np.ones((state_batch.shape[0], 5)) # Desired velocities 172 | vels_des[:, 0] -= 10 * np.sin(0.2 * t_batch) 173 | accels = kp * (vels_des - vels) 174 | accels[:, 1] -= k_brake * (pos[:, 0] - pos[:, 1]) * ((pos[:, 0] - pos[:, 1]) < 6.0) 175 | accels[:, 2] -= k_brake * (pos[:, 1] - pos[:, 2]) * ((pos[:, 1] - pos[:, 2]) < 6.0) 176 | accels[:, 3] = 0.0 # Car 4's acceleration is controlled directly 177 | accels[:, 4] -= k_brake * (pos[:, 2] - pos[:, 4]) * ((pos[:, 2] - pos[:, 4]) < 13.0) 178 | 179 | # f(x) 180 | f_x = np.zeros(state_batch.shape) 181 | f_x[:, ::2] = vels 182 | f_x[:, 1::2] = accels 183 | return f_x 184 | 185 | else: 186 | raise Exception('Unknown Dynamics mode.') 187 | 188 | return get_f, get_g 189 | 190 | def get_state(self, obs): 191 | """Given the observation, this function does the pre-processing necessary and returns the state. 192 | 193 | Parameters 194 | ---------- 195 | obs_batch : ndarray or torch.tensor 196 | Environment observation. 197 | 198 | Returns 199 | ------- 200 | state_batch : ndarray or torch.tensor 201 | State of the system. 
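For 'Unicycle', θ is recovered as arctan2(sin θ, cos θ) from the observation; for 'SimulatedCars', positions are rescaled by 100 and velocities by 30 to undo the observation scaling.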
202 | 203 | """ 204 | 205 | expand_dims = len(obs.shape) == 1 206 | is_tensor = torch.is_tensor(obs) 207 | 208 | if is_tensor: 209 | dtype = obs.dtype 210 | device = obs.device 211 | obs = to_numpy(obs) 212 | 213 | if expand_dims: 214 | obs = np.expand_dims(obs, 0) 215 | 216 | if self.env.dynamics_mode == 'Unicycle': 217 | theta = np.arctan2(obs[:, 3], obs[:, 2]) 218 | state_batch = np.zeros((obs.shape[0], 3)) 219 | state_batch[:, 0] = obs[:, 0] 220 | state_batch[:, 1] = obs[:, 1] 221 | state_batch[:, 2] = theta 222 | elif self.env.dynamics_mode == 'SimulatedCars': 223 | state_batch = np.copy(obs) 224 | state_batch[:, ::2] *= 100.0 # Scale Positions 225 | state_batch[:, 1::2] *= 30.0 # Scale Velocities 226 | else: 227 | raise Exception('Unknown dynamics') 228 | 229 | if expand_dims: 230 | state_batch = state_batch.squeeze(0) 231 | 232 | return to_tensor(state_batch, dtype, device) if is_tensor else state_batch 233 | 234 | def get_obs(self, state_batch): 235 | """Given the state, this function returns it to an observation akin to the one obtained by calling env.step 236 | 237 | Parameters 238 | ---------- 239 | state : ndarray 240 | Environment state batch of shape (batch_size, n_s) 241 | 242 | Returns 243 | ------- 244 | obs : ndarray 245 | Observation batch of shape (batch_size, n_o) 246 | 247 | """ 248 | 249 | if self.env.dynamics_mode == 'Unicycle': 250 | obs = np.zeros((state_batch.shape[0], 4)) 251 | obs[:, 0] = state_batch[:, 0] 252 | obs[:, 1] = state_batch[:, 1] 253 | obs[:, 2] = np.cos(state_batch[:, 2]) 254 | obs[:, 3] = np.sin(state_batch[:, 2]) 255 | elif self.env.dynamics_mode == 'SimulatedCars': 256 | obs = np.copy(state_batch) 257 | obs[:, ::2] /= 100.0 # Scale Positions 258 | obs[:, 1::2] /= 30.0 # Scale Velocities 259 | else: 260 | raise Exception('Unknown dynamics') 261 | return obs 262 | 263 | def append_transition(self, state_batch, u_batch, next_state_batch, t_batch=None): 264 | """Estimates the disturbance from the current dynamics transition and adds it to buffer. 
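The disturbance sample is the one-step finite difference D_hat(x_t) = (x_{t+1} - x_t) / dt - f(x_t) - g(x_t) u_t, stored together with x_t in a fixed-size circular buffer; the GP models are re-fit every max_history_count / 10 new points.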
265 | 266 | Parameters 267 | ---------- 268 | state_batch : ndarray 269 | shape (n_s,) or (batch_size, n_s) 270 | u_batch : ndarray 271 | shape (n_u,) or (batch_size, n_u) 272 | next_state_batch : ndarray 273 | shape (n_s,) or (batch_size, n_s) 274 | t_batch : ndarray, optional 275 | shape (1,) or (batch_size, 1) 276 | 277 | Returns 278 | ------- 279 | 280 | """ 281 | 282 | expand_dims = len(state_batch.shape) == 1 283 | 284 | if expand_dims: 285 | state_batch = np.expand_dims(state_batch, 0) 286 | next_state_batch = np.expand_dims(next_state_batch, 0) 287 | u_batch = np.expand_dims(u_batch, 0) 288 | 289 | u_batch = np.expand_dims(u_batch, -1) # for broadcasting batch matrix multiplication purposes 290 | 291 | disturbance_batch = (next_state_batch - state_batch - self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ u_batch).squeeze(-1))) / self.env.dt 292 | 293 | # Append new data point (state, disturbance) to our dataset 294 | for i in range(state_batch.shape[0]): 295 | 296 | self.disturbance_history['state'][self.history_counter % self.max_history_count] = state_batch[i] 297 | self.disturbance_history['disturbance'][self.history_counter % self.max_history_count] = disturbance_batch[i] 298 | 299 | # Increment how many data points we have 300 | self.history_counter += 1 301 | 302 | # Update GP models every max_history_count data points 303 | if self.history_counter % (self.max_history_count/10) == 0: 304 | self.fit_gp_model() 305 | 306 | def fit_gp_model(self, training_iter=70): 307 | """ 308 | 309 | Parameters 310 | ---------- 311 | training_iter : int 312 | Number of training iterations for GP model. 313 | 314 | Returns 315 | ------- 316 | 317 | """ 318 | 319 | if self.history_counter < self.max_history_count: # didn't fill the buffer yet 320 | train_x = self.disturbance_history['state'][:self.history_counter] 321 | train_y = self.disturbance_history['disturbance'][:self.history_counter] 322 | else: # buffer filled, use all the data points 323 | train_x = self.disturbance_history['state'] 324 | train_y = self.disturbance_history['disturbance'] 325 | 326 | # Normalize Data 327 | train_x_std = np.std(train_x, axis=0) 328 | train_x_normalized = train_x / (train_x_std + 1e-8) 329 | train_y_std = np.std(train_y, axis=0) 330 | train_y_normalized = train_y / (train_y_std + 1e-8) 331 | 332 | self.disturb_estimators = [] 333 | for i in range(self.n_s): 334 | # self.disturb_estimators.append(GPyDisturbanceEstimator(train_x, train_y[:, i])) 335 | self.disturb_estimators.append(GPyDisturbanceEstimator(train_x_normalized, train_y_normalized[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device)) 336 | self.disturb_estimators[i].train(training_iter) 337 | 338 | # track the data I last used to fit the GPs for saving purposes (need it to initialize before loading weights) 339 | self.train_x = train_x 340 | self.train_y = train_y 341 | 342 | def predict_disturbance(self, test_x): 343 | """Predict the disturbance at the queried states using the GP models. 
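Test inputs are normalized by the standard deviation of the GP training data and predictions are scaled back accordingly; if no GPs have been fit yet, a zero-mean prior with per-dimension standard deviation MAX_STD is returned.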
344 | 345 | Parameters 346 | ---------- 347 | test_x : ndarray or torch.tensor 348 | shape(n_test, n_s) 349 | Returns 350 | ------- 351 | means: ndarray or torch.tensor 352 | Prediction means -- shape(n_test, n_s) 353 | vars: ndarray or torch.tensor 354 | Prediction variances -- shape(n_test, n_s) 355 | """ 356 | 357 | is_tensor = torch.is_tensor(test_x) 358 | 359 | if is_tensor: 360 | dtype = test_x.dtype 361 | device = test_x.device 362 | test_x = to_numpy(test_x) 363 | 364 | expand_dims = len(test_x.shape) == 1 365 | if expand_dims: 366 | test_x = np.expand_dims(test_x, axis=0) 367 | 368 | means = np.zeros(test_x.shape) 369 | f_std = np.zeros(test_x.shape) # standard deviation 370 | 371 | if self.disturb_estimators: 372 | # Normalize 373 | train_x_std = np.std(self.train_x, axis=0) 374 | train_y_std = np.std(self.train_y, axis=0) 375 | test_x = test_x / train_x_std 376 | for i in range(self.n_s): 377 | prediction_ = self.disturb_estimators[i].predict(test_x) 378 | means[:, i] = prediction_['mean'] * (train_y_std[i] + 1e-8) 379 | f_std[:, i] = np.sqrt(prediction_['f_var']) * (train_y_std[i] + 1e-8) 380 | 381 | else: # zero-mean, max_sigma prior 382 | f_std = np.ones(test_x.shape) 383 | for i in range(self.n_s): 384 | f_std[:, i] *= MAX_STD[self.env.dynamics_mode][i] 385 | 386 | if expand_dims: 387 | means = means.squeeze(0) 388 | f_std = f_std.squeeze(0) 389 | 390 | return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) if is_tensor else (means, f_std) 391 | 392 | def load_disturbance_models(self, output): 393 | 394 | if output is None: 395 | return 396 | 397 | self.disturb_estimators = [] 398 | 399 | try: 400 | weights = torch.load('{}/gp_models.pkl'.format(output), map_location=torch.device(self.device)) 401 | self.train_x = torch.load('{}/gp_models_train_x.pkl'.format(output)) 402 | self.train_y = torch.load('{}/gp_models_train_y.pkl'.format(output)) 403 | for i in range(self.n_s): 404 | self.disturb_estimators.append(GPyDisturbanceEstimator(self.train_x, self.train_y[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device)) 405 | self.disturb_estimators[i].model.load_state_dict(weights[i]) 406 | except: 407 | raise Exception('Could not load GP models from {}'.format(output)) 408 | 409 | def save_disturbance_models(self, output): 410 | 411 | if not self.disturb_estimators or self.train_x is None or self.train_y is None: 412 | return 413 | weights = [] 414 | for i in range(len(self.disturb_estimators)): 415 | weights.append(self.disturb_estimators[i].model.state_dict()) 416 | torch.save(weights, '{}/gp_models.pkl'.format(output)) 417 | # Also save data used to fit model (needed for initializing the model before loading weights) 418 | torch.save(self.train_x, '{}/gp_models_train_x.pkl'.format(output)) 419 | torch.save(self.train_y, '{}/gp_models_train_y.pkl'.format(output)) 420 | 421 | def seed(self, s): 422 | torch.manual_seed(s) 423 | if torch.cuda.is_available(): 424 | torch.cuda.manual_seed(s) 425 | 426 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # import comet_ml at the top of your file 2 | from comet_ml import Experiment 3 | 4 | import argparse 5 | import torch 6 | import numpy as np 7 | 8 | from rcbf_sac.sac_cbf import RCBF_SAC 9 | from rcbf_sac.replay_memory import ReplayMemory 10 | from rcbf_sac.dynamics import DynamicsModel 11 | from build_env import * 12 | import os 13 | 14 | from rcbf_sac.utils import prGreen, 
get_output_folder, prYellow 15 | from rcbf_sac.evaluator import Evaluator 16 | from rcbf_sac.generate_rollouts import generate_model_rollouts 17 | 18 | 19 | def train(agent, env, dynamics_model, args, experiment=None): 20 | 21 | # Memory 22 | memory = ReplayMemory(args.replay_size, args.seed) 23 | memory_model = ReplayMemory(args.replay_size, args.seed) 24 | 25 | # Training Loop 26 | total_numsteps = 0 27 | updates = 0 28 | 29 | if args.use_comp: 30 | compensator_rollouts = [] 31 | comp_buffer_idx = 0 32 | 33 | for i_episode in range(args.max_episodes): 34 | episode_reward = 0 35 | episode_cost = 0 36 | episode_steps = 0 37 | done = False 38 | obs = env.reset() 39 | 40 | # Saving rollout here to train compensator 41 | if args.use_comp: 42 | episode_rollout = dict() 43 | episode_rollout['obs'] = np.zeros((0, env.observation_space.shape[0])) 44 | episode_rollout['u_safe'] = np.zeros((0, env.action_space.shape[0])) 45 | episode_rollout['u_comp'] = np.zeros((0, env.action_space.shape[0])) 46 | 47 | while not done: 48 | if episode_steps % 10 == 0: 49 | prYellow('Episode {} - step {} - eps_rew {} - eps_cost {}'.format(i_episode, episode_steps, episode_reward, episode_cost)) 50 | state = dynamics_model.get_state(obs) 51 | 52 | # Generate Model rollouts 53 | if args.model_based and episode_steps % 5 == 0 and len(memory) > dynamics_model.max_history_count / 3: 54 | memory_model = generate_model_rollouts(env, memory_model, memory, agent, dynamics_model, 55 | k_horizon=args.k_horizon, 56 | batch_size=min(len(memory), 5 * args.rollout_batch_size), 57 | warmup=args.start_steps > total_numsteps) 58 | 59 | # If using model-based RL then we only need to have enough data for the real portion of the replay buffer 60 | if len(memory) + len(memory_model) * args.model_based > args.batch_size: 61 | 62 | # Number of updates per step in environment 63 | for i in range(args.updates_per_step): 64 | 65 | # Update parameters of all the networks 66 | if args.model_based: 67 | # Pick the ratio of data to be sampled from the real vs model buffers 68 | real_ratio = max(min(args.real_ratio, len(memory) / args.batch_size), 1 - len(memory_model) / args.batch_size) 69 | # Update parameters of all the networks 70 | critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, 71 | args.batch_size, 72 | updates, 73 | dynamics_model, 74 | memory_model, 75 | real_ratio) 76 | else: 77 | critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, 78 | args.batch_size, 79 | updates, 80 | dynamics_model) 81 | 82 | if experiment: 83 | experiment.log_metric('loss/critic_1', critic_1_loss, updates) 84 | experiment.log_metric('loss/critic_2', critic_2_loss, step=updates) 85 | experiment.log_metric('loss/policy', policy_loss, step=updates) 86 | experiment.log_metric('loss/entropy_loss', ent_loss, step=updates) 87 | experiment.log_metric('entropy_temperature/alpha', alpha, step=updates) 88 | updates += 1 89 | 90 | if args.use_comp: 91 | action, action_comp, action_cbf = agent.select_action(obs, dynamics_model, warmup=args.start_steps > total_numsteps) 92 | else: 93 | action = agent.select_action(obs, dynamics_model, warmup=args.start_steps > total_numsteps) # Sample action from policy 94 | 95 | next_obs, reward, done, info = env.step(action) # Step 96 | if 'cost_exception' in info: 97 | prYellow('Cost exception occured.') 98 | episode_steps += 1 99 | total_numsteps += 1 100 | episode_reward += reward 101 | episode_cost += info.get('cost', 0) 102 | 103 | # Ignore the "done" 
signal if it comes from hitting the time horizon. 104 | # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py) 105 | mask = 1 if episode_steps == env.max_episode_steps else float(not done) 106 | 107 | memory.push(obs, action, reward, next_obs, mask, t=episode_steps * env.dt, next_t=(episode_steps+1) * env.dt) # Append transition to memory 108 | 109 | # Update state and store transition for GP model learning 110 | next_state = dynamics_model.get_state(next_obs) 111 | if episode_steps % 2 == 0 and i_episode < args.gp_max_episodes: # Stop learning the dynamics after a while to stabilize learning 112 | # TODO: Clean up line below, specifically (t_batch) 113 | dynamics_model.append_transition(state, action, next_state, t_batch=np.array([episode_steps*env.dt])) 114 | 115 | # append comp rollout with step before updating 116 | if args.use_comp: 117 | episode_rollout['obs'] = np.vstack((episode_rollout['obs'], obs)) 118 | episode_rollout['u_safe'] = np.vstack((episode_rollout['u_safe'], action_cbf)) 119 | episode_rollout['u_comp'] = np.vstack((episode_rollout['u_comp'], action_comp)) 120 | 121 | obs = next_obs 122 | 123 | # Train compensator 124 | if args.use_comp and i_episode < args.comp_train_episodes: 125 | if len(compensator_rollouts) < 50: # TODO: Turn the 50 into an arg 126 | compensator_rollouts.append(episode_rollout) 127 | else: 128 | compensator_rollouts[comp_buffer_idx] = episode_rollout # overwrite the oldest rollout in the fixed-size buffer 129 | comp_buffer_idx = (comp_buffer_idx + 1) % 50 130 | if i_episode % args.comp_update_episode == 0: 131 | agent.update_parameters_compensator(compensator_rollouts) 132 | 133 | # [optional] save intermediate model 134 | if i_episode % int(args.max_episodes / 10) == 0: 135 | agent.save_model(args.output) 136 | dynamics_model.save_disturbance_models(args.output) 137 | 138 | if experiment: 139 | # Comet.ml logging 140 | experiment.log_metric('reward/train', episode_reward, step=i_episode) 141 | experiment.log_metric('cost/train', episode_cost, step=i_episode) 142 | prGreen("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}, cost: {}".format(i_episode, total_numsteps, 143 | episode_steps, 144 | round(episode_reward, 2), round(episode_cost, 2))) 145 | 146 | # Evaluation 147 | if i_episode % 5 == 0 and args.eval is True: 148 | print('Size of replay buffers: real : {}, \t\t model : {}'.format(len(memory), len(memory_model))) 149 | avg_reward = 0. 150 | avg_cost = 0. 151 | episodes = 5 152 | for _ in range(episodes): 153 | obs = env.reset() 154 | episode_reward = 0 155 | episode_cost = 0 156 | done = False 157 | while not done: 158 | if args.use_comp: 159 | action, _, _ = agent.select_action(obs, dynamics_model, evaluate=True) 160 | else: 161 | action = agent.select_action(obs, dynamics_model, evaluate=True) # Sample action from policy 162 | next_obs, reward, done, info = env.step(action) 163 | episode_reward += reward 164 | episode_cost += info.get('cost', 0) 165 | obs = next_obs 166 | 167 | avg_reward += episode_reward 168 | avg_cost += episode_cost 169 | avg_reward /= episodes 170 | avg_cost /= episodes 171 | if experiment: 172 | experiment.log_metric('avg_reward/test', avg_reward, step=i_episode) 173 | experiment.log_metric('avg_cost/test', avg_cost, step=i_episode) 174 | 175 | print("----------------------------------------") 176 | print("Test Episodes: {}, Avg. Reward: {}, Avg. 
Cost: {}".format(episodes, round(avg_reward, 2), round(avg_cost, 2))) 177 | print("----------------------------------------") 178 | 179 | 180 | def test(agent, env, dynamics_model, evaluate, model_path, visualize=True, debug=False): 181 | 182 | agent.load_weights(model_path) 183 | dynamics_model.load_disturbance_models(model_path) 184 | 185 | def policy(observation): 186 | if args.use_comp: 187 | action, action_comp, action_cbf = agent.select_action(observation, dynamics_model, evaluate=True) 188 | else: 189 | action = agent.select_action(observation, dynamics_model,evaluate=True) 190 | return action 191 | 192 | evaluate(env, policy, dynamics_model=dynamics_model, debug=debug, visualize=visualize, save=False) 193 | 194 | 195 | if __name__ == "__main__": 196 | 197 | parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args') 198 | # Environment Args 199 | parser.add_argument('--env-name', default="SimulatedCars", help='Options are Unicycle or SimulatedCars.') 200 | # Comet ML 201 | parser.add_argument('--log_comet', action='store_true', dest='log_comet', help="Whether to log data") 202 | parser.add_argument('--comet_key', default='', help='Comet API key') 203 | parser.add_argument('--comet_workspace', default='', help='Comet workspace') 204 | # SAC Args 205 | parser.add_argument('--mode', default='train', type=str, help='support option: train/test') 206 | parser.add_argument('--visualize', action='store_true', dest='visualize', help='visualize env -only in available test mode') 207 | parser.add_argument('--output', default='output', type=str, help='') 208 | parser.add_argument('--policy', default="Gaussian", 209 | help='Policy Type: Gaussian | Deterministic (default: Gaussian)') 210 | parser.add_argument('--eval', type=bool, default=True, 211 | help='Evaluates a policy a policy every 5 episode (default: True)') 212 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G', 213 | help='discount factor for reward (default: 0.99)') 214 | parser.add_argument('--tau', type=float, default=0.005, metavar='G', 215 | help='target smoothing coefficient(τ) (default: 0.005)') 216 | parser.add_argument('--lr', type=float, default=0.0003, metavar='G', 217 | help='learning rate (default: 0.0003)') 218 | parser.add_argument('--alpha', type=float, default=0.2, metavar='G', 219 | help='Temperature parameter α determines the relative importance of the entropy\ 220 | term against the reward (default: 0.2)') 221 | parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G', 222 | help='Automatically adjust α (default: False)') 223 | parser.add_argument('--seed', type=int, default=12345, metavar='N', 224 | help='random seed (default: 12345)') 225 | parser.add_argument('--batch_size', type=int, default=256, metavar='N', 226 | help='batch size (default: 256)') 227 | parser.add_argument('--max_episodes', type=int, default=400, metavar='N', 228 | help='maximum number of episodes (default: 400)') 229 | parser.add_argument('--hidden_size', type=int, default=256, metavar='N', 230 | help='hidden size (default: 256)') 231 | parser.add_argument('--updates_per_step', type=int, default=1, metavar='N', 232 | help='model updates per simulator step (default: 1)') 233 | parser.add_argument('--start_steps', type=int, default=5000, metavar='N', 234 | help='Steps sampling random actions (default: 10000)') 235 | parser.add_argument('--target_update_interval', type=int, default=1, metavar='N', 236 | help='Value target update per no. 
of updates per step (default: 1)') 237 | parser.add_argument('--replay_size', type=int, default=10000000, metavar='N', 238 | help='size of replay buffer (default: 10000000)') 239 | parser.add_argument('--cuda', action="store_true", 240 | help='run on CUDA (default: False)') 241 | parser.add_argument('--device_num', type=int, default=0, help='Select GPU number for CUDA (default: 0)') 242 | parser.add_argument('--resume', default='default', type=str, help='Resuming model path for testing') 243 | parser.add_argument('--validate_episodes', default=5, type=int, help='how many episode to perform during validate experiment') 244 | parser.add_argument('--validate_steps', default=1000, type=int, help='how many steps to perform a validate experiment') 245 | # CBF, Dynamics, Env Args 246 | parser.add_argument('--no_diff_qp', action='store_false', dest='diff_qp', help='Should the agent diff through the CBF?') 247 | parser.add_argument('--gp_model_size', default=3000, type=int, help='gp') 248 | parser.add_argument('--gp_max_episodes', default=100, type=int, help='gp max train episodes.') 249 | parser.add_argument('--k_d', default=3.0, type=float) 250 | parser.add_argument('--gamma_b', default=20, type=float) 251 | parser.add_argument('--l_p', default=0.03, type=float, 252 | help="Look-ahead distance for unicycle dynamics output.") 253 | # Model Based Learning 254 | parser.add_argument('--model_based', action='store_true', dest='model_based', help='If selected, will use data from the model to train the RL agent.') 255 | parser.add_argument('--real_ratio', default=0.3, type=float, help='Portion of data obtained from real replay buffer for training.') 256 | parser.add_argument('--k_horizon', default=1, type=int, help='horizon of model-based rollouts') 257 | parser.add_argument('--rollout_batch_size', default=5, type=int, help='Size of initial states batch to rollout from.') 258 | # Compensator 259 | parser.add_argument('--comp_rate', default=0.005, type=float, help='Compensator learning rate') 260 | parser.add_argument('--comp_train_episodes', default=200, type=int, help='Number of initial episodes to train compensator for.') 261 | parser.add_argument('--comp_update_episode', default=50, type=int, help='Modulo for compensator updates') 262 | parser.add_argument('--use_comp', type=bool, default=False, help='Should the compensator be used.') 263 | args = parser.parse_args() 264 | 265 | if args.mode == 'train': 266 | args.output = get_output_folder(args.output, args.env_name) 267 | if args.resume == 'default': 268 | args.resume = os.getcwd() + '/output/{}-run0'.format(args.env_name) 269 | elif args.resume.isnumeric(): 270 | args.resume = os.getcwd() + '/output/{}-run{}'.format(args.env_name, args.resume) 271 | 272 | if args.cuda: 273 | torch.cuda.set_device(args.device_num) 274 | 275 | if args.mode == 'train' and args.log_comet: 276 | project_name = 'sac-rcbf-unicycle-environment' if args.env_name == 'Unicycle' else 'sac-rcbf-sim-cars-env' 277 | prYellow('Logging experiment on comet.ml!') 278 | # Create an experiment with your api key 279 | experiment = Experiment( 280 | api_key=args.comet_key, 281 | project_name=project_name, 282 | workspace=args.comet_workspace, 283 | ) 284 | # Log args on comet.ml 285 | experiment.log_parameters(vars(args)) 286 | experiment_tags = ['MB' if args.model_based else 'MF', 287 | str(args.batch_size) + '_batch', 288 | str(args.updates_per_step) + '_step_updates', 289 | 'diff_qp' if args.diff_qp else 'reg_qp'] 290 | if args.use_comp: 291 | experiment_tags.append('use_comp') 292 | 
print(experiment_tags) 293 | experiment.add_tags(experiment_tags) 294 | else: 295 | experiment = None 296 | 297 | if args.use_comp and (args.model_based or args.diff_qp): 298 | raise Exception('Compensator can only be used with model free RL and regular CBF.') 299 | 300 | # Environment 301 | env = build_env(args) 302 | 303 | # Agent 304 | agent = RCBF_SAC(env.observation_space.shape[0], env.action_space, env, args) 305 | dynamics_model = DynamicsModel(env, args) 306 | 307 | # Random Seed 308 | if args.seed > 0: 309 | env.seed(args.seed) 310 | env.action_space.seed(args.seed) 311 | torch.manual_seed(args.seed) 312 | np.random.seed(args.seed) 313 | dynamics_model.seed(args.seed) 314 | 315 | # If model based, we warm up in the model too 316 | if args.model_based: 317 | args.start_steps /= (1 + args.rollout_batch_size) 318 | 319 | if args.mode == 'train': 320 | train(agent, env, dynamics_model, args, experiment) 321 | elif args.mode == 'test': 322 | evaluate = Evaluator(args.validate_episodes, args.validate_steps, args.output) 323 | test(agent, env, dynamics_model, evaluate, args.resume, visualize=args.visualize, debug=True) 324 | 325 | env.close() 326 | 327 | -------------------------------------------------------------------------------- /rcbf_sac/diff_cbf_qp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import torch 4 | from rcbf_sac.dynamics import DYNAMICS_MODE 5 | from rcbf_sac.utils import to_tensor, prRed 6 | from time import time 7 | from qpth.qp import QPFunction 8 | 9 | 10 | class CBFQPLayer: 11 | 12 | def __init__(self, env, args, gamma_b=100, k_d=1.5, l_p=0.03): 13 | """Constructor of CBFLayer. 14 | 15 | Parameters 16 | ---------- 17 | env : gym.env 18 | Gym environment. 19 | gamma_b : float, optional 20 | gamma of control barrier certificate. 21 | k_d : float, optional 22 | confidence parameter desired (2.0 corresponds to ~95% for example). 23 | """ 24 | 25 | self.device = torch.device("cuda" if args.cuda else "cpu") 26 | 27 | self.env = env 28 | self.u_min, self.u_max = self.get_control_bounds() 29 | self.gamma_b = gamma_b 30 | 31 | if self.env.dynamics_mode not in DYNAMICS_MODE: 32 | raise Exception('Dynamics mode not supported.') 33 | 34 | if self.env.dynamics_mode == 'Unicycle': 35 | self.num_cbfs = len(env.hazards_locations) 36 | self.k_d = k_d 37 | self.l_p = l_p 38 | elif self.env.dynamics_mode == 'SimulatedCars': 39 | self.num_cbfs = 2 40 | 41 | self.action_dim = env.action_space.shape[0] 42 | self.num_ineq_constraints = self.num_cbfs + 2 * self.action_dim 43 | 44 | def get_safe_action(self, state_batch, action_batch, mean_pred_batch, sigma_batch): 45 | """ 46 | 47 | Parameters 48 | ---------- 49 | state_batch : torch.tensor or ndarray 50 | action_batch : torch.tensor or ndarray 51 | State batch 52 | mean_pred_batch : torch.tensor or ndarray 53 | Mean of disturbance 54 | sigma_batch : torch.tensor or ndarray 55 | Standard deviation of disturbance 56 | 57 | Returns 58 | ------- 59 | final_action_batch : torch.tensor 60 | Safe actions to take in the environment. 
61 | """ 62 | 63 | # batch form if only a single data point is passed 64 | expand_dims = len(state_batch.shape) == 1 65 | if expand_dims: 66 | action_batch = action_batch.unsqueeze(0) 67 | state_batch = state_batch.unsqueeze(0) 68 | mean_pred_batch = mean_pred_batch.unsqueeze(0) 69 | sigma_batch = sigma_batch.unsqueeze(0) 70 | 71 | start_time = time() 72 | Ps, qs, Gs, hs = self.get_cbf_qp_constraints(state_batch, action_batch, mean_pred_batch, sigma_batch) 73 | build_qp_time = time() 74 | safe_action_batch = self.solve_qp(Ps, qs, Gs, hs) 75 | # prCyan('Time to get constraints = {} - Time to solve QP = {} - time per qp = {} - batch_size = {} - device = {}'.format(build_qp_time - start_time, time() - build_qp_time, (time() - build_qp_time) / safe_action_batch.shape[0], Ps.shape[0], Ps.device)) 76 | # The actual safe action is the cbf action + the nominal action 77 | final_action = torch.clamp(action_batch + safe_action_batch, self.u_min.repeat(action_batch.shape[0], 1), self.u_max.repeat(action_batch.shape[0], 1)) 78 | 79 | return final_action if not expand_dims else final_action.squeeze(0) 80 | 81 | def solve_qp(self, Ps: torch.Tensor, qs: torch.Tensor, Gs: torch.Tensor, hs: torch.Tensor): 82 | """Solves: 83 | minimize_{u,eps} 0.5 * u^T P u + q^T u 84 | subject to G[u,eps]^T <= h 85 | 86 | Parameters 87 | ---------- 88 | Ps : torch.Tensor 89 | (batch_size, n_u+1, n_u+1) 90 | qs : torch.Tensor 91 | (batch_size, n_u+1) 92 | Gs : torch.Tensor 93 | (batch_size, num_ineq_constraints, n_u+1) 94 | hs : torch.Tensor 95 | (batch_size, num_ineq_constraints) 96 | Returns 97 | ------- 98 | safe_action_batch : torch.tensor 99 | The solution of the qp without the last dimension (the slack). 100 | """ 101 | 102 | 103 | Ghs = torch.cat((Gs, hs.unsqueeze(2)), -1) 104 | Ghs_norm = torch.max(torch.abs(Ghs), dim=2, keepdim=True)[0] 105 | Gs /= Ghs_norm 106 | hs = hs / Ghs_norm.squeeze(-1) 107 | sol = self.cbf_layer(Ps, qs, Gs, hs, solver_args={"check_Q_spd": False, "maxIter": 100000, "notImprovedLim": 10, "eps": 1e-4}) 108 | safe_action_batch = sol[:, :-1] 109 | return safe_action_batch 110 | 111 | def cbf_layer(self, Qs, ps, Gs, hs, As=None, bs=None, solver_args=None): 112 | """ 113 | 114 | Parameters 115 | ---------- 116 | Qs : torch.Tensor 117 | ps : torch.Tensor 118 | Gs : torch.Tensor 119 | shape (batch_size, num_ineq_constraints, num_vars) 120 | hs : torch.Tensor 121 | shape (batch_size, num_ineq_constraints) 122 | As : torch.Tensor, optional 123 | bs : torch.Tensor, optional 124 | solver_args : dict, optional 125 | 126 | Returns 127 | ------- 128 | result : torch.Tensor 129 | Result of QP 130 | """ 131 | 132 | if solver_args is None: 133 | solver_args = {} 134 | 135 | if As is None or bs is None: 136 | As = torch.Tensor().to(self.device).double() 137 | bs = torch.Tensor().to(self.device).double() 138 | 139 | result = QPFunction(verbose=0, **solver_args)(Qs.double(), ps.double(), Gs.double(), hs.double(), As, bs).float() 140 | 141 | if torch.any(torch.isnan(result)): 142 | prRed('QP Failed to solve - result is nan == {}!'.format(torch.any(torch.isnan(result)))) 143 | raise Exception('QP Failed to solve') 144 | return result 145 | 146 | def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sigma_pred_batch): 147 | """Build up matrices required to solve qp 148 | Program specifically solves: 149 | minimize_{u,eps} 0.5 * u^T P u + q^T u 150 | subject to G[u,eps]^T <= h 151 | 152 | Each control barrier certificate is of the form: 153 | dh/dx^T (f_out + g_out u) >= -gamma^b h_out^3 where 
out here is an output of the state. 154 | 155 | In the case of SafetyGym_point dynamics: 156 | state = [x y θ v ω] 157 | state_d = [v*cos(θ) v*sin(θ) omega ω u^v u^ω] 158 | 159 | Quick Note on batch matrix multiplication for matrices A and B: 160 | - Batch size should be first dim 161 | - Everything needs to be 3-dimensional 162 | - E.g. if B is a vec, i.e. shape (batch_size, vec_length) --> .view(batch_size, vec_length, 1) 163 | 164 | Parameters 165 | ---------- 166 | state_batch : torch.tensor 167 | current state (check dynamics.py for details on each dynamics' specifics) 168 | action_batch : torch.tensor 169 | Nominal control input. 170 | mean_pred_batch : torch.tensor 171 | mean disturbance prediction state, dimensions (n_s, n_u) 172 | sigma_pred_batch : torch.tensor 173 | standard deviation in additive disturbance after undergoing the output dynamics. 174 | gamma_b : float, optional 175 | CBF parameter for the class-Kappa function 176 | 177 | Returns 178 | ------- 179 | P : torch.tensor 180 | Quadratic cost matrix in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) 181 | q : torch.tensor 182 | Linear cost vector in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) 183 | G : torch.tensor 184 | Inequality constraint matrix (G[u,eps] <= h) of size (num_constraints, n_u + 1) 185 | h : torch.tensor 186 | Inequality constraint vector (G[u,eps] <= h) of size (num_constraints,) 187 | """ 188 | 189 | assert len(state_batch.shape) == 2 and len(action_batch.shape) == 2 and len(mean_pred_batch.shape) == 2 and len( 190 | sigma_pred_batch.shape) == 2, print(state_batch.shape, action_batch.shape, mean_pred_batch.shape, 191 | sigma_pred_batch.shape) 192 | 193 | batch_size = state_batch.shape[0] 194 | gamma_b = self.gamma_b 195 | 196 | # Expand dims 197 | state_batch = torch.unsqueeze(state_batch, -1) 198 | action_batch = torch.unsqueeze(action_batch, -1) 199 | mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1) 200 | sigma_pred_batch = torch.unsqueeze(sigma_pred_batch, -1) 201 | 202 | if self.env.dynamics_mode == 'Unicycle': 203 | 204 | num_cbfs = self.num_cbfs 205 | hazards_radius = self.env.hazards_radius 206 | hazards_locations = to_tensor(self.env.hazards_locations, torch.FloatTensor, self.device) 207 | collision_radius = 1.2 * hazards_radius # add a little buffer 208 | l_p = self.l_p 209 | 210 | thetas = state_batch[:, 2, :].squeeze(-1) 211 | c_thetas = torch.cos(thetas) 212 | s_thetas = torch.sin(thetas) 213 | 214 | # p(x): lookahead output (batch_size, 2) 215 | ps = torch.zeros((batch_size, 2)).to(self.device) 216 | ps[:, 0] = state_batch[:, 0, :].squeeze(-1) + l_p * c_thetas 217 | ps[:, 1] = state_batch[:, 1, :].squeeze(-1) + l_p * s_thetas 218 | 219 | # p_dot(x) = f_p(x) + g_p(x)u + D_p where f_p(x) = 0, g_p(x) = RL and D_p is the disturbance 220 | 221 | # f_p(x) = [0,...,0]^T 222 | f_ps = torch.zeros((batch_size, 2, 1)).to(self.device) 223 | 224 | # g_p(x) = RL where L = diag([1, l_p]) 225 | Rs = torch.zeros((batch_size, 2, 2)).to(self.device) 226 | Rs[:, 0, 0] = c_thetas 227 | Rs[:, 0, 1] = -s_thetas 228 | Rs[:, 1, 0] = s_thetas 229 | Rs[:, 1, 1] = c_thetas 230 | Ls = torch.zeros((batch_size, 2, 2)).to(self.device) 231 | Ls[:, 0, 0] = 1 232 | Ls[:, 1, 1] = l_p 233 | g_ps = torch.bmm(Rs, Ls) # (batch_size, 2, 2) 234 | 235 | # D_p(x) = g_p [0 D_θ]^T + [D_x1 D_x2]^T 236 | mu_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device) 237 | mu_theta_aug[:, 1, :] = mean_pred_batch[:, 2, :] 238 | mu_ps = torch.bmm(g_ps, mu_theta_aug) + mean_pred_batch[:, :2, :] 239 | sigma_theta_aug = 
torch.zeros([batch_size, 2, 1]).to(self.device) 240 | sigma_theta_aug[:, 1, :] = sigma_pred_batch[:, 2, :] 241 | sigma_ps = torch.bmm(torch.abs(g_ps), sigma_theta_aug) + sigma_pred_batch[:, :2, :] 242 | 243 | # hs (batch_size, hazards_locations) 244 | ps_hzds = ps.repeat((1, num_cbfs)).reshape((batch_size, num_cbfs, 2)) 245 | 246 | hs = 0.5 * (torch.sum((ps_hzds - hazards_locations.view(1, num_cbfs, -1)) ** 2, axis=2) - collision_radius ** 2) # 1/2 * (||x - x_obs||^2 - r^2) 247 | 248 | dhdps = (ps_hzds - hazards_locations.view(1, num_cbfs, -1)) # (batch_size, n_cbfs, 2) 249 | # (batch_size, 2, 1) 250 | n_u = action_batch.shape[1] # dimension of control inputs 251 | num_constraints = num_cbfs + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them) 252 | 253 | # Inequality constraints (G[u, eps] <= h) 254 | G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible) 255 | h = torch.zeros((batch_size, num_constraints)).to(self.device) 256 | ineq_constraint_counter = 0 257 | 258 | # Add inequality constraints 259 | G[:, :num_cbfs, :n_u] = -torch.bmm(dhdps, g_ps) # h1^Tg(x) 260 | G[:, :num_cbfs, n_u] = -1 # for slack 261 | h[:, :num_cbfs] = gamma_b * (hs ** 3) + (torch.bmm(dhdps, f_ps + mu_ps) - torch.bmm(torch.abs(dhdps), sigma_ps) + torch.bmm(torch.bmm(dhdps, g_ps), action_batch)).squeeze(-1) 262 | ineq_constraint_counter += num_cbfs 263 | 264 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack 265 | P = torch.diag(torch.tensor([1.e0, 1.e-2, 1e5])).repeat(batch_size, 1, 1).to(self.device) 266 | q = torch.zeros((batch_size, n_u + 1)).to(self.device) 267 | 268 | elif self.env.dynamics_mode == 'SimulatedCars': 269 | 270 | n_u = action_batch.shape[1] # dimension of control inputs 271 | num_constraints = self.num_cbfs + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them) 272 | collision_radius = 3.5 273 | 274 | # Inequality constraints (G[u, eps] <= h) 275 | G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible) 276 | h = torch.zeros((batch_size, num_constraints)).to(self.device) 277 | ineq_constraint_counter = 0 278 | 279 | # Current State 280 | pos = state_batch[:, ::2, 0] 281 | vels = state_batch[:, 1::2, 0] 282 | 283 | # Action (acceleration) 284 | vels_des = 30.0 * torch.ones((state_batch.shape[0], 5)).to(self.device) # Desired velocities 285 | # vels_des[:, 0] -= 10 * torch.sin(0.2 * t_batch) 286 | accels = self.env.kp * (vels_des - vels) 287 | accels[:, 1] -= self.env.k_brake * (pos[:, 0] - pos[:, 1]) * ((pos[:, 0] - pos[:, 1]) < 6.0) 288 | accels[:, 2] -= self.env.k_brake * (pos[:, 1] - pos[:, 2]) * ((pos[:, 1] - pos[:, 2]) < 6.0) 289 | accels[:, 3] = 0.0 # Car 4's acceleration is controlled directly 290 | accels[:, 4] -= self.env.k_brake * (pos[:, 2] - pos[:, 4]) * ((pos[:, 2] - pos[:, 4]) < 13.0) 291 | 292 | # f(x) 293 | f_x = torch.zeros((state_batch.shape[0], state_batch.shape[1])).to(self.device) 294 | f_x[:, ::2] = vels 295 | f_x[:, 1::2] = accels 296 | 297 | # f_D(x) - disturbance in the drift dynamics 298 | fD_x = torch.zeros((state_batch.shape[0], state_batch.shape[1])).to(self.device) 299 | fD_x[:, 1::2] = sigma_pred_batch[:, 1::2, 0].squeeze(-1) 300 | 301 | # g(x) 302 | g_x = torch.zeros((state_batch.shape[0], state_batch.shape[1], 1)).to(self.device) 303 | g_x[:, 7, 0] = 50.0 # Car 4's 
acceleration 304 | 305 | # h1 306 | h13 = 0.5 * (((pos[:, 2] - pos[:, 3]) ** 2) - collision_radius ** 2) 307 | h15 = 0.5 * (((pos[:, 4] - pos[:, 3]) ** 2) - collision_radius ** 2) 308 | 309 | # dh1/dt = Lfh1 310 | h13_dot = (pos[:, 3] - pos[:, 2]) * (vels[:, 3] - vels[:, 2]) 311 | h15_dot = (pos[:, 3] - pos[:, 4]) * (vels[:, 3] - vels[:, 4]) 312 | 313 | # Lffh1 314 | dLfh13dx = torch.zeros((batch_size, 10)).to(self.device) 315 | dLfh13dx[:, 4] = (vels[:, 2] - vels[:, 3]) # Car 3 pos 316 | dLfh13dx[:, 5] = (pos[:, 2] - pos[:, 3]) # Car 3 vel 317 | dLfh13dx[:, 6] = (vels[:, 3] - vels[:, 2]) 318 | dLfh13dx[:, 7] = (pos[:, 3] - pos[:, 2]) 319 | Lffh13 = torch.bmm(dLfh13dx.view(batch_size, 1, -1), f_x.view(batch_size, -1, 1)).squeeze() 320 | LfDfh13 = torch.bmm(torch.abs(dLfh13dx.view(batch_size, 1, -1)), fD_x.view(batch_size, -1, 1)).squeeze() 321 | 322 | dLfh15dx = torch.zeros((batch_size, 10)).to(self.device) 323 | dLfh15dx[:, 8] = (vels[:, 4] - vels[:, 3]) # Car 5 pos 324 | dLfh15dx[:, 9] = (pos[:, 4] - pos[:, 3]) # Car 5 vels 325 | dLfh15dx[:, 6] = (vels[:, 3] - vels[:, 4]) 326 | dLfh15dx[:, 7] = (pos[:, 3] - pos[:, 4]) 327 | Lffh15 = torch.bmm(dLfh15dx.view(batch_size, 1, -1), f_x.view(batch_size, -1, 1)).squeeze() 328 | LfDfh15 = torch.bmm(torch.abs(dLfh15dx.view(batch_size, 1, -1)), fD_x.view(batch_size, -1, 1)).squeeze() 329 | 330 | # Lfgh1 331 | Lgfh13 = torch.bmm(dLfh13dx.view(batch_size, 1, -1), g_x) 332 | Lgfh15 = torch.bmm(dLfh15dx.view(batch_size, 1, -1), g_x) 333 | 334 | # print('state = {}'.format(state_batch)) 335 | # print('f_x = {}'.format(f_x)) 336 | # print('g_x = {}'.format(g_x)) 337 | # print('h13 = {}'.format(h13)) 338 | # print('h15 = {}'.format(h15)) 339 | # print('gamma_b = {}'.format(self.gamma_b)) 340 | # print('h13_dot = {}'.format(h13_dot)) 341 | # print('h15_dot = {}'.format(h15_dot)) 342 | # print('Lffh13 = {}'.format(Lffh13)) 343 | # print('Lffh15 = {}'.format(Lffh15)) 344 | # print('Lgfh13 = {}'.format(Lgfh13)) 345 | # print('Lgfh15 = {}'.format(Lgfh15)) 346 | 347 | # Inequality constraints (G[u, eps] <= h) 348 | h[:, 0] = Lffh13 - LfDfh13 + (gamma_b + gamma_b) * h13_dot + gamma_b * gamma_b * h13 + torch.bmm(Lgfh13, action_batch).squeeze() 349 | h[:, 1] = Lffh15 - LfDfh15 + (gamma_b + gamma_b) * h15_dot + gamma_b * gamma_b * h15 + torch.bmm(Lgfh15, action_batch).squeeze() 350 | G[:, 0, 0] = -Lgfh13.squeeze() 351 | G[:, 1, 0] = -Lgfh15.squeeze() 352 | G[:, :self.num_cbfs, n_u] = -2e2 # for slack 353 | ineq_constraint_counter += self.num_cbfs 354 | 355 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack 356 | P = torch.diag(torch.tensor([0.1, 1e1])).repeat(batch_size, 1, 1).to(self.device) 357 | q = torch.zeros((batch_size, n_u + 1)).to(self.device) 358 | 359 | else: 360 | raise Exception('Dynamics mode unknown!') 361 | 362 | # Second let's add actuator constraints 363 | n_u = action_batch.shape[1] # dimension of control inputs 364 | 365 | for c in range(n_u): 366 | 367 | # u_max >= u_nom + u ---> u <= u_max - u_nom 368 | if self.u_max is not None: 369 | G[:, ineq_constraint_counter, c] = 1 370 | h[:, ineq_constraint_counter] = self.u_max[c] - action_batch[:, c].squeeze(-1) 371 | ineq_constraint_counter += 1 372 | 373 | # u_min <= u_nom + u ---> -u <= u_min - u_nom 374 | if self.u_min is not None: 375 | G[:, ineq_constraint_counter, c] = -1 376 | h[:, ineq_constraint_counter] = -self.u_min[c] + action_batch[:, c].squeeze(-1) 377 | ineq_constraint_counter += 1 378 | 379 | return P, q, G, h 380 | 381 | def 
get_control_bounds(self): 382 | """ 383 | 384 | Returns 385 | ------- 386 | u_min : torch.tensor 387 | min control input. 388 | u_max : torch.tensor 389 | max control input. 390 | """ 391 | 392 | u_min = torch.tensor(self.env.safe_action_space.low).to(self.device) 393 | u_max = torch.tensor(self.env.safe_action_space.high).to(self.device) 394 | 395 | return u_min, u_max 396 | 397 | 398 | if __name__ == "__main__": 399 | 400 | from build_env import build_env 401 | from rcbf_sac.dynamics import DynamicsModel 402 | from copy import deepcopy 403 | from rcbf_sac.utils import to_numpy, prGreen 404 | 405 | 406 | def simple_controller(env, state, goal): 407 | goal_xy = goal[:2] 408 | goal_dist = -np.log(goal[2]) # the observation is np.exp(-goal_dist) 409 | v = 0.02 * goal_dist 410 | relative_theta = 1.0 * np.arctan2(goal_xy[1], goal_xy[0]) 411 | omega = 1.0 * relative_theta 412 | 413 | return np.clip(np.array([v, omega]), env.action_space.low, env.action_space.high) 414 | 415 | 416 | parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args') 417 | # Environment Args 418 | parser.add_argument('--env-name', default="SafetyGym", help='Options are Unicycle or SafetyGym') 419 | parser.add_argument('--robot_xml', default='xmls/point.xml', 420 | help="SafetyGym Currently only supporting xmls/point.xml") 421 | parser.add_argument('--k_d', default=3.0, type=float) 422 | parser.add_argument('--gamma_b', default=100, type=float) 423 | parser.add_argument('--l_p', default=0.03, type=float) 424 | parser.add_argument('--gp_model_size', default=2000, type=int, help='gp') 425 | parser.add_argument('--cuda', action='store_true', help='run on CUDA (default: False)') 426 | args = parser.parse_args() 427 | # Environment 428 | env = build_env(args) 429 | 430 | device = torch.device('cuda' if args.cuda else 'cpu') 431 | 432 | 433 | def to_def_tensor(ndarray): 434 | 435 | return to_tensor(ndarray, torch.FloatTensor, device) 436 | 437 | 438 | diff_cbf_layer = CBFQPLayer(env, args, args.gamma_b, args.k_d, args.l_p) 439 | dynamics_model = DynamicsModel(env, args) 440 | 441 | obs = env.reset() 442 | done = False 443 | 444 | ep_ret = 0 445 | ep_cost = 0 446 | ep_step = 0 447 | 448 | for i_step in range(3000): 449 | 450 | if done: 451 | prGreen('Episode Return: %.3f \t Episode Cost: %.3f' % (ep_ret, ep_cost)) 452 | ep_ret, ep_cost, ep_step = 0, 0, 0 453 | obs = env.reset() 454 | 455 | state = dynamics_model.get_state(obs) 456 | 457 | print('state = {}, dist2hazards = {}'.format(state[:2], 458 | np.sqrt(np.sum((env.hazards_locations - state[:2]) ** 2, 1)))) 459 | 460 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state) 461 | 462 | action = simple_controller(env, state, obs[-3:]) # TODO: observations last 3 indicated 463 | # action = 2*np.random.rand(2) - 1.0 464 | assert env.action_space.contains(action) 465 | final_action = diff_cbf_layer.get_safe_action(to_def_tensor(state), to_def_tensor(action), 466 | to_def_tensor(disturb_mean), to_def_tensor(disturb_std)) 467 | final_action = to_numpy(final_action) 468 | 469 | # Env Step 470 | observation2, reward, done, info = env.step(final_action) 471 | observation2 = deepcopy(observation2) 472 | 473 | # Update state and store transition for GP model learning 474 | next_state = dynamics_model.get_state(observation2) 475 | if ep_step % 2 == 0: 476 | dynamics_model.append_transition(state, final_action, next_state) 477 | 478 | # print('reward', reward) 479 | ep_ret += reward 480 | ep_cost += info.get('cost', 0) 481 | ep_step += 1 482 | # 
env.render() 483 | 484 | obs = observation2 485 | state = next_state --------------------------------------------------------------------------------
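For reference, the optimization problem assembled by get_cbf_qp_constraints and solved by solve_qp/cbf_layer above can be written compactly as follows (reconstructed from the code; u is the correction added to the nominal action u_nom and eps is the feasibility slack):

\begin{aligned}
\min_{u,\,\varepsilon}\quad & \tfrac{1}{2}\,[u;\varepsilon]^\top P\,[u;\varepsilon] + q^\top [u;\varepsilon] \\
\text{s.t.}\quad & G\,[u;\varepsilon] \le h .
\end{aligned}

For the Unicycle mode, with look-ahead output p(x) = (x + l_p\cos\theta,\; y + l_p\sin\theta), output input matrix g_p(\theta) = R(\theta)\,\mathrm{diag}(1, l_p), and barrier h_i(p) = \tfrac{1}{2}\big(\lVert p - p_{\mathrm{obs},i}\rVert^2 - r_{\mathrm{col}}^2\big), each hazard i contributes one row equivalent to

\nabla h_i^\top \big( g_p(\theta)\,(u_{\mathrm{nom}} + u) + \mu_p \big) - \lvert \nabla h_i \rvert^\top \sigma_p + \varepsilon \;\ge\; -\gamma_b\, h_i^3 ,

where \mu_p and \sigma_p are the GP disturbance mean and standard deviation mapped through the output dynamics. For the SimulatedCars mode the barriers have relative degree two in position, and the two rows (car pairs 3–4 and 5–4) are equivalent to

L_f^2 h + L_g L_f h\,(u_{\mathrm{nom}} + u) - \lvert \nabla L_f h \rvert^\top \sigma_f + 2\gamma_b\,\dot h + \gamma_b^2\, h + 200\,\varepsilon \;\ge\; 0 ,

with \sigma_f the disturbance standard deviation on the drift dynamics and the slack weighted by 2e2 exactly as in the code.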
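A minimal numerical sketch of the Unicycle case (single state, single hazard, zero disturbance) may also help when modifying get_cbf_qp_constraints. It rebuilds the same P, G, h in NumPy (q is zero here) and solves the small QP with cvxpy, which is not a dependency of this repository and is used purely for illustration; the hazard location, radius, gains, and action bounds below are made-up values.

import numpy as np
import cvxpy as cp

# Made-up problem data (illustrative only)
l_p, gamma_b = 0.03, 20.0
hazard, radius = np.array([1.0, 0.0]), 0.6
collision_radius = 1.2 * radius                     # same buffer as in CBFQPLayer
x, y, theta = 0.0, 0.0, 0.0                         # unicycle state [x, y, theta]
u_nom = np.array([1.0, 0.0])                        # nominal [v, omega], heading straight at the hazard

# Look-ahead output p(x) and its input matrix g_p = R(theta) @ diag(1, l_p)
p = np.array([x + l_p * np.cos(theta), y + l_p * np.sin(theta)])
R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
g_p = R @ np.diag([1.0, l_p])

# Barrier h = 0.5 * (||p - hazard||^2 - r^2) and its gradient w.r.t. p
h_val = 0.5 * (np.sum((p - hazard) ** 2) - collision_radius ** 2)
dhdp = p - hazard

# One CBF row plus box constraints on the correction, decision variable z = [u_v, u_w, eps]
u_min, u_max = np.array([-2.0, -3.0]), np.array([2.0, 3.0])   # made-up safe-action bounds
G = np.zeros((5, 3))
b = np.zeros(5)
G[0, :2] = -dhdp @ g_p                              # -dh/dp^T g_p
G[0, 2] = -1.0                                      # slack column
b[0] = gamma_b * h_val ** 3 + dhdp @ (g_p @ u_nom)  # zero disturbance, f_p = 0
G[1:3, :2] = np.eye(2);  b[1:3] = u_max - u_nom     #  u <= u_max - u_nom
G[3:5, :2] = -np.eye(2); b[3:5] = u_nom - u_min     # -u <= u_nom - u_min

P = np.diag([1.0, 1e-2, 1e5])                       # same cost weights as the Unicycle branch
z = cp.Variable(3)
cp.Problem(cp.Minimize(0.5 * cp.quad_form(z, P)), [G @ z <= b]).solve()
print("correction:", z.value[:2], "safe action:", u_nom + z.value[:2])

In this toy instance the solver returns a negative velocity correction (the safe action brakes before the hazard), and the large 1e5 weight on eps mirrors the slack penalty in the Unicycle branch: the slack is only exercised when the hard CBF constraint and the actuator limits cannot be satisfied simultaneously.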