├── figures
│   ├── qp.png
│   ├── diff_inc.png
│   ├── policy_diagram.pdf
│   ├── policy_diagram.png
│   ├── cars_env_reward.png
│   ├── framework_diagram.png
│   ├── unicycle_env_reward.png
│   └── unicycle_snapshot.png
├── .gitignore
├── envs
│   ├── utils.py
│   ├── simulated_cars_env.py
│   ├── pyglet_rendering.py
│   └── unicycle_env.py
├── rcbf_sac
│   ├── replay_memory.py
│   ├── evaluator.py
│   ├── generate_rollouts.py
│   ├── compensator.py
│   ├── utils.py
│   ├── model.py
│   ├── gp_model.py
│   ├── sac_cbf.py
│   ├── cbf_qp.py
│   ├── dynamics.py
│   └── diff_cbf_qp.py
├── README.md
└── main.py
/figures/qp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/qp.png
--------------------------------------------------------------------------------
/figures/diff_inc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/diff_inc.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | */runs/*
3 | */output/*
4 | *__pycache__/
5 | */.DS_Store
6 | output/*
7 |
--------------------------------------------------------------------------------
/envs/utils.py:
--------------------------------------------------------------------------------
1 |
  2 | def to_pixel(meas_cm, shift=0):
  3 |     """Convert a measurement in cm to pixels (37.795 px/cm, i.e. ~96 dpi, scaled by 1.5 for rendering), plus an optional pixel shift."""
  4 |     return 1.5 * 37.795 * meas_cm + shift
--------------------------------------------------------------------------------
/figures/policy_diagram.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/policy_diagram.pdf
--------------------------------------------------------------------------------
/figures/policy_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/policy_diagram.png
--------------------------------------------------------------------------------
/figures/cars_env_reward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/cars_env_reward.png
--------------------------------------------------------------------------------
/figures/framework_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/framework_diagram.png
--------------------------------------------------------------------------------
/figures/unicycle_env_reward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/unicycle_env_reward.png
--------------------------------------------------------------------------------
/figures/unicycle_snapshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yemam3/SAC-RCBF/HEAD/figures/unicycle_snapshot.png
--------------------------------------------------------------------------------
/rcbf_sac/replay_memory.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 | class ReplayMemory:
5 |
6 | def __init__(self, capacity, seed):
7 | random.seed(seed)
8 | self.capacity = capacity
9 | self.buffer = []
10 | self.position = 0
11 |
12 | def push(self, state, action, reward, next_state, mask, t=None, next_t=None):
13 |
14 | if len(self.buffer) < self.capacity:
15 | self.buffer.append(None)
16 |
17 | self.buffer[self.position] = (state, action, reward, next_state, mask, t, next_t)
18 | self.position = (self.position + 1) % self.capacity
19 |
20 | def batch_push(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch, t_batch=None, next_t_batch=None):
21 |
22 | for i in range(state_batch.shape[0]): # TODO: Optimize This
23 | if t_batch is not None and next_t_batch is not None:
24 | self.push(state_batch[i], action_batch[i], reward_batch[i], next_state_batch[i], mask_batch[i], t_batch[i], next_t_batch[i]) # Append transition to memory
25 | else:
26 | self.push(state_batch[i], action_batch[i], reward_batch[i], next_state_batch[i], mask_batch[i]) # Append transition to memory
27 |
28 | def sample(self, batch_size):
29 |
30 | batch = random.sample(self.buffer, batch_size)
31 | state, action, reward, next_state, mask, t, next_t = map(np.stack, zip(*batch))
32 | return state, action, reward, next_state, mask, t, next_t
33 |
34 | def __len__(self):
35 | return len(self.buffer)
36 |
--------------------------------------------------------------------------------
/rcbf_sac/evaluator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from scipy.io import savemat
4 | from rcbf_sac.utils import *
5 |
6 |
7 | class Evaluator(object):
8 |
9 | def __init__(self, num_episodes, interval, save_path=''):
10 | self.num_episodes = num_episodes
11 | self.interval = interval
12 | self.save_path = save_path
13 | self.results = np.array([]).reshape(num_episodes, 0)
14 |
15 | def __call__(self, env, policy, cbf_wrapper=None, dynamics_model=None, debug=False, visualize=False, save=True):
16 |
17 | self.is_training = False
18 | result = []
19 |
20 | for episode in range(self.num_episodes):
21 |
22 | # reset at the start of episode
23 | observation = env.reset()
24 |
25 | # Make sure to start from a safe state
26 | if cbf_wrapper and dynamics_model:
27 | out = None
28 | while out is None or cbf_wrapper.get_min_h_val(out) < 1e-4:
29 | observation = env.reset()
30 | state = dynamics_model.get_state(observation)
31 | out = dynamics_model.get_output(state)
32 |
33 | episode_steps = 0
34 | episode_reward = 0.
35 |
36 | assert observation is not None
37 |
38 | # start episode
39 | done = False
40 | while not done:
 41 |                 # Query the policy for an action and step the environment
42 | action = policy(observation)
43 |
44 | observation, reward, done, info = env.step(action)
45 |
46 | if visualize:
47 | env.render(mode='human')
48 |
49 | # update
50 | episode_reward += reward
51 | episode_steps += 1
52 |
53 | if debug: prYellow('[Evaluate] #Episode{}: episode_reward:{}'.format(episode, episode_reward))
54 | result.append(episode_reward)
55 |
56 | result = np.array(result).reshape(-1, 1)
57 | self.results = np.hstack([self.results, result])
58 |
59 | if save:
60 | self.save_results('{}/validate_reward'.format(self.save_path))
61 | return np.mean(result)
62 |
63 | def save_results(self, fn):
64 |
65 | y = np.mean(self.results, axis=0)
66 | error = np.std(self.results, axis=0)
67 |
68 | x = range(0, self.results.shape[1] * self.interval, self.interval)
69 | fig, ax = plt.subplots(1, 1, figsize=(6, 5))
70 | plt.xlabel('Timestep')
71 | plt.ylabel('Average Reward')
72 | ax.errorbar(x, y, yerr=error, fmt='-o')
73 | plt.savefig(fn + '.png')
74 | savemat(fn + '.mat', {'reward': self.results})
75 | plt.close()
--------------------------------------------------------------------------------
/rcbf_sac/generate_rollouts.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from copy import deepcopy
3 | from rcbf_sac.utils import euler_to_mat_2d, prCyan, prRed
4 |
5 |
6 | def generate_model_rollouts(env, memory_model, memory, agent, dynamics_model, k_horizon=1, batch_size=20, warmup=False):
7 |
8 | def policy(observation):
9 |
 10 |         if warmup and env.action_space:
 11 |             action = agent.select_action(observation, dynamics_model, warmup=True)  # Sample a random action (still passed through the safety layer)
 12 |         else:
 13 |             action = agent.select_action(observation, dynamics_model, evaluate=False)  # Sample an action from the policy
14 |
15 | return action
16 |
17 | # Sample a batch from memory
18 | obs_batch, action_batch, reward_batch, next_obs_batch, mask_batch, t_batch, next_t_batch = memory.sample(batch_size=batch_size)
19 |
20 | obs_batch_ = deepcopy(obs_batch)
21 | done_batch_ = [False for _ in range(batch_size)]
22 | t_batch_ = deepcopy(t_batch)
23 |
24 | for k in range(k_horizon):
25 |
26 | batch_size_ = obs_batch_.shape[0] # that's because we remove steps where done = True so batch_size shrinks
27 |
28 | action_batch_ = policy(obs_batch_)
29 | state_batch_ = dynamics_model.get_state(obs_batch_)
 30 |         next_state_mu_, next_state_std_, next_t_batch_ = dynamics_model.predict_next_state(state_batch_, action_batch_, t_batch=t_batch_)  # use the per-horizon time batch
31 | next_state_batch_ = np.random.normal(next_state_mu_, next_state_std_)
32 | next_obs_batch_ = dynamics_model.get_obs(next_state_batch_)
33 |
34 | if env.dynamics_mode == 'Unicycle':
35 |
36 | # Construct Next Observation from State
37 | dist2goal_prev = -np.log(obs_batch_[:, -1])
38 | goal_rel = env.unwrapped.goal_pos[:2] - next_obs_batch_[:, :2]
39 | dist2goal = np.linalg.norm(goal_rel, axis=1)
40 | assert dist2goal.shape == (batch_size_,), 'dist2goal should be a vector of size (batch_size,), got {} instead'.format(dist2goal.shape)
41 | # generate compass
42 | compass = np.matmul(np.expand_dims(goal_rel, 1), euler_to_mat_2d(next_state_batch_[:, 2])).squeeze(1)
43 | compass /= np.sqrt(np.sum(np.square(compass), axis=1, keepdims=True)) + 0.001
44 | next_obs_batch_ = np.hstack((next_obs_batch_, compass, np.expand_dims(np.exp(-dist2goal), axis=-1)))
45 |
46 | # Compute Reward
47 | goal_size = 0.3
48 | reward_goal = 1.0
49 | reward_distance = 1.0
50 | reward_batch_ = (dist2goal_prev - dist2goal) * reward_distance + (dist2goal <= goal_size) * reward_goal
51 | # Compute Done
52 | reached_goal = dist2goal <= goal_size
53 | reward_batch_ += reward_goal * reached_goal
54 | done_batch_ = reached_goal
55 | mask_batch_ = np.invert(done_batch_)
56 |
57 | elif env.dynamics_mode == 'SimulatedCars':
58 |
59 | # Compute Reward
60 | # car_4_vel = next_state_batch_[:, 7] # car's 4 velocity
61 | # reward_batch_ = -np.abs(car_4_vel) * np.abs(action_batch_.squeeze()) * (action_batch_.squeeze() > 0) / env.max_episode_steps
62 | reward_batch_ = -5.0 * np.abs(action_batch_.squeeze() ** 2) / env.max_episode_steps
63 |
64 | # Compute Done
65 | done_batch_ = next_t_batch_ >= env.max_episode_steps * env.dt # done?
66 | mask_batch_ = np.invert(done_batch_)
67 |
68 | else:
69 | raise Exception('Environment/Dynamics mode {} not Recognized!'.format(env.dynamics_mode))
70 |
71 | memory_model.batch_push(obs_batch_, action_batch_, reward_batch_, next_obs_batch_, mask_batch_, t_batch_, next_t_batch_) # Append transition to memory
72 | t_batch_ = deepcopy(next_t_batch_)
73 |
74 | # Update Current Observation Batch
75 | obs_batch_ = deepcopy(next_obs_batch_)
76 |
77 | # Delete Done Trajectories
78 | if np.sum(done_batch_) > 0:
 79 |             obs_batch_ = np.delete(obs_batch_, done_batch_ > 0, axis=0)
 80 |             t_batch_ = np.delete(t_batch_, done_batch_ > 0, axis=0)
81 | return memory_model
82 |
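# Illustrative usage sketch (not part of the original file): roughly how this function could be
# called from the training loop in main.py. main.py is not shown in this section, so the names
# and hyperparameters below are assumptions for illustration only.
#
#     memory = ReplayMemory(capacity=1000000, seed=args.seed)        # real transitions
#     memory_model = ReplayMemory(capacity=1000000, seed=args.seed)  # synthetic (model) transitions
#     ...
#     if args.model_based and len(memory) > batch_size:
#         memory_model = generate_model_rollouts(env, memory_model, memory, agent, dynamics_model,
#                                                k_horizon=1, batch_size=20, warmup=warmup)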
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SAC-RCBF (DEPRECATED; the repository for the updated paper is [here](https://github.com/yemam3/Mod-RL-RCBF))
2 |
3 | Repository containing the code for the paper "Safe Model-Based Reinforcement Learning using Robust Control Barrier Functions". Specifically, an implementation of SAC + Robust Control Barrier Functions (RCBFs) for safe reinforcement learning in two custom environments.
4 |
  5 | While exploring, an RL agent can take actions that lead the system to unsafe states. Here, we use a differentiable RCBF safety layer that minimally alters (in the least-squares sense) the actions taken by the RL agent to ensure the safety of the agent.
6 |
  7 | 
  8 | ![Framework diagram](figures/framework_diagram.png)
  9 | 
 10 | 
 11 | 
 12 | 
13 | ## Robust Control Barrier Functions (RCBFs)
14 |
15 | In this work, we focus on RCBFs that are formulated with respect to differential inclusions of the following form:
16 |
 17 | 
 18 | ![Differential inclusion with unknown disturbance set D(x)](figures/diff_inc.png)
 19 | 
 20 | 
 21 | Here `D(x)` is a disturbance set unknown a priori to the robot, which we learn online during training via Gaussian Processes (GPs). The underlying GP library is GPyTorch.
22 |
23 | The QP used to ensure the system's safety is given by:
24 |
 25 | 
 26 | ![RCBF quadratic program](figures/qp.png)
 27 | 
 28 | 
29 | where `h(x)` is the RCBF, and `u_RL` is the action outputted by the RL policy. As such, the final (safe) action taken in the environment is given by `u = u_RL + u_RCBF` as shown in the following diagram:
30 |
 31 | 
 32 | ![Policy diagram: u = u_RL + u_RCBF](figures/policy_diagram.png)
 33 | 
 34 | 
 35 | 
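In code, this composition looks roughly as follows (a simplified sketch adapted from `get_wrapped_policy` in `rcbf_sac/utils.py`; `agent`, `cbf_wrapper` and `dynamics_model` are assumed to be constructed as in `main.py`, and the optional compensator is omitted):

```python
def wrapped_policy(observation):
    action = agent.select_action(observation, evaluate=False)                       # u_RL from the SAC policy
    state = dynamics_model.get_state(observation)
    disturb_mean, disturb_std = dynamics_model.predict_disturbance(state)           # GP estimate of the disturbance D(x)
    action_safe = cbf_wrapper.get_u_safe(action, state, disturb_mean, disturb_std)  # u_RCBF from the RCBF-QP
    return action + action_safe                                                     # u = u_RL + u_RCBF
```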
36 | ## Coupling RL & RCBFs to Improve Training Performance
37 |
 38 | The above is sufficient to ensure the safety of the system; however, we would also like to improve learning performance by letting the RCBF layer guide the training. This is achieved via:
 39 | * Using a differentiable version of the safety layer that allows us to backpropagate through the RCBF-based Quadratic Program (QP), resulting in an end-to-end policy (see the excerpt after this list).
40 | * Using the GPs and the dynamics prior to generate synthetic data (model-based RL).
41 |
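For reference, this is (in simplified form) how the actor loss is computed through the safety layer when the differentiable QP is enabled, excerpted from `update_parameters` in `rcbf_sac/sac_cbf.py`:

```python
pi, log_pi, _ = self.policy.sample(state_batch)
if self.diff_qp:
    # Differentiable CBF-QP: gradients flow from the Q-values back through the QP into the policy
    pi = self.get_safe_action(state_batch, pi, dynamics_model)
qf1_pi, qf2_pi = self.critic(state_batch, pi)
min_qf_pi = torch.min(qf1_pi, qf2_pi)
policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
```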
42 | ## Other Approaches
43 |
44 | In addition, the approach is compared against two other frameworks (implemented here) in the experiments:
 45 | * A vanilla baseline that uses SAC with RCBFs without generating synthetic data or backpropagating through the QP (the RL loss is computed w.r.t. the output of the RL policy).
 46 | * A modified approach from ["End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks"](https://ojs.aaai.org/index.php/AAAI/article/view/4213) that replaces their discrete-time CBF formulation with RCBFs, but makes use of the supervised learning component to speed up the learning.
47 |
48 | ## Running the Experiments
49 |
 50 | The two environments are `Unicycle` and `SimulatedCars`. `Unicycle` involves a unicycle robot tasked with reaching a desired location while avoiding obstacles, and `SimulatedCars` involves a chain of five cars driving in a lane; the RL agent controls the 4th car and must minimize control effort while avoiding collisions with the other cars.
51 |
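Both environments expose the standard Gym interface (`reset`, `step`, `render`). As a minimal sketch (assuming it is run from the repository root), a random-action rollout of `SimulatedCars` without the agent or the safety layer looks like this:

```python
from envs.simulated_cars_env import SimulatedCarsEnv

env = SimulatedCarsEnv()
obs = env.reset()
done, episode_reward = False, 0.0
while not done:
    action = env.action_space.sample()          # random acceleration command for car 4
    obs, reward, done, info = env.step(action)  # info['cost'] penalizes getting too close to cars 3 and 5
    episode_reward += reward
print('episode reward:', episode_reward)
```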
52 | ### `Unicycle` Env:
53 |
54 | * Training the proposed approach:
55 | `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 2 --batch_size 512 --seed 12345 --model_based`
56 |
57 | * Training the baseline:
58 | `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp`
59 |
60 | * Training the modified approach from "End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks":
61 | `python main.py --env Unicycle --gamma_b 20 --max_episodes 400 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp --use_comp True`
62 |
63 | ### `SimulatedCars` Env:
64 |
65 | * Training the proposed approach:
66 | `python main.py --env SimulatedCars --gamma_b 20 --max_episodes 200 --cuda --updates_per_step 2 --batch_size 512 --seed 12345 --model_based`
67 |
68 | * Training the baseline:
69 | `python main.py --env SimulatedCars --gamma_b 20 --max_episodes 200 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp`
70 |
71 | * Training the modified approach from "End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks":
72 | `python main.py --env SimulatedCars --gamma_b 20 --max_episodes 200 --cuda --updates_per_step 1 --batch_size 256 --seed 12345 --no_diff_qp --use_comp True`
73 |
74 | ### Testing
75 |
76 | * To test, add `--mode test` and `--resume /path/to/output/{1}-run-{2}`, where `{1}` is the env name and `{2}` is the experiment number, to any of the commands above.
77 |
--------------------------------------------------------------------------------
/rcbf_sac/compensator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.optim import Adam
5 | from rcbf_sac.utils import scale_action, to_tensor, to_numpy
6 |
7 | criterion = nn.MSELoss()
8 |
9 |
10 | def fanin_init(size, fanin=None):
11 | fanin = fanin or size[0]
12 | v = 1. / np.sqrt(fanin)
13 | return torch.Tensor(size).uniform_(-v, v)
14 |
15 |
16 | class CompensatorModel(nn.Module):
17 |
18 | def __init__(self, state_dim, action_dim, hidden1=30, hidden2=20, init_w=3e-3):
19 | super(CompensatorModel, self).__init__()
20 | self.fc1 = nn.Linear(state_dim, hidden1)
21 | self.fc2 = nn.Linear(hidden1, hidden2)
22 | self.fc3 = nn.Linear(hidden2, action_dim)
23 | self.init_weights(init_w)
24 |
25 | def init_weights(self, init_w):
26 | self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
27 | self.fc2.weight.data = fanin_init(self.fc2.weight.data.size())
28 | self.fc3.weight.data.uniform_(-init_w, init_w)
29 |
30 | def forward(self, state):
31 | state = state
32 | out = self.fc1(state)
33 | out = torch.relu(out)
34 | out = self.fc2(out)
35 | out = torch.relu(out)
36 | action = torch.tanh(self.fc3(out)) # TODO:Not sure if they have the tanh here in Cheng's paper
37 | return action
38 |
39 |
40 | class Compensator:
41 |
42 | def __init__(self, state_dim, action_dim, action_lb, action_ub, args):
43 |
44 | if args.seed > 0:
45 | self.seed(args.seed)
46 |
47 | self.state_dim = state_dim
48 | self.action_dim = action_dim
49 |
50 | # # Create Actor and Critic Network
51 | # net_cfg = {
52 | # 'hidden1': args.hidden1,
53 | # 'hidden2': args.hidden2,
54 | # 'init_w': args.init_w
55 | # }
56 | # self.actor = CompensatorModel(state_dim, action_dim, **net_cfg)
57 | self.comp_actor = CompensatorModel(state_dim, action_dim)
58 | self.comp_actor_optim = Adam(self.comp_actor.parameters(), lr=args.comp_rate)
59 |
60 | self.device = torch.device("cuda" if args.cuda else "cpu")
61 | self.comp_actor.to(self.device)
62 |
63 | # Action Lower and Upper Bounds
64 | self.action_lb = to_tensor(action_lb, torch.FloatTensor, self.device)
65 | self.action_ub = to_tensor(action_ub, torch.FloatTensor, self.device)
66 |
 67 |         # If it's never been trained, we don't want to use it (useful for eval only)
68 | self.is_trained = False
69 |
70 |
71 | def __call__(self, observation):
72 | action = self.comp_actor(observation) * self.is_trained
73 | return scale_action(action, self.action_lb, self.action_ub)
74 |
75 | def train(self, rollouts, epochs=1):
76 | """
77 |
78 | Parameters
79 | ----------
 80 |         rollouts : list
81 | List of dicts with each dict containing the keys ['obs'], ['u_nom'],
82 | ['u_safe'], ['u_comp']. Each of those keys has a ndarray value where the rows are steps.
83 | Note here state refers to observation not internal state of the dynamics.
84 |
85 | Returns
86 | -------
87 |
88 | """
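        # Illustrative example (an assumption, not from the original file) of the expected
        # structure, for one episode of T steps with a 2-D action space:
        #   rollouts = [{'obs':    np.zeros((T, state_dim)),
        #                'u_nom':  np.zeros((T, 2)),
        #                'u_safe': np.zeros((T, 2)),
        #                'u_comp': np.zeros((T, 2))}]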
89 |
 90 |         self.is_trained = True # only want to use it if it's been trained
91 |
92 | for epoch in range(epochs):
93 |
94 | # Compensator Actor update
95 | self.comp_actor.zero_grad()
96 |
97 | # Stack everything appropriately
98 | all_obs = np.zeros((0, self.state_dim))
99 | all_u_comp = np.zeros((0, self.action_dim))
100 | all_u_safe = np.zeros((0, self.action_dim))
101 |
102 | for rollout in rollouts:
103 | all_obs = np.vstack((all_obs, rollout['obs']))
104 | all_u_comp = np.vstack((all_u_comp, rollout['u_comp']))
105 | all_u_safe = np.vstack((all_u_safe, rollout['u_safe']))
106 |
107 | all_obs = to_tensor(all_obs, torch.FloatTensor, self.device)
108 | all_u_comp = to_tensor(all_u_comp, torch.FloatTensor, self.device)
109 | all_u_safe = to_tensor(all_u_safe, torch.FloatTensor, self.device)
110 | target = scale_action(all_u_comp + all_u_safe, self.action_lb, self.action_ub)
111 | comp_actor_loss = criterion(self.comp_actor(all_obs), target)
112 |
113 | comp_actor_loss.backward()
114 | self.comp_actor_optim.step()
115 |
116 | def load_weights(self, output):
117 | if output is None: return
118 |
119 | self.comp_actor.load_state_dict(
120 | torch.load('{}/comp_actor.pkl'.format(output))
121 | )
122 |
123 | def save_model(self, output):
124 | torch.save(
125 | self.comp_actor.state_dict(),
126 | '{}/comp_actor.pkl'.format(output)
127 | )
128 |
129 | def seed(self, s):
130 |
131 | torch.manual_seed(s)
132 | if torch.cuda.is_available():
133 | torch.cuda.manual_seed(s)
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/rcbf_sac/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import os
4 | import torch
5 | from torch.autograd import Variable
6 |
7 | USE_CUDA = torch.cuda.is_available()
8 |
9 |
10 | def prRed(prt): print("\033[91m {}\033[00m".format(prt))
11 |
12 |
13 | def prGreen(prt): print("\033[92m {}\033[00m".format(prt))
14 |
15 |
16 | def prYellow(prt): print("\033[93m {}\033[00m".format(prt))
17 |
18 |
19 | def prLightPurple(prt): print("\033[94m {}\033[00m".format(prt))
20 |
21 |
22 | def prPurple(prt): print("\033[95m {}\033[00m".format(prt))
23 |
24 |
25 | def prCyan(prt): print("\033[96m {}\033[00m".format(prt))
26 |
27 |
28 | def prLightGray(prt): print("\033[97m {}\033[00m".format(prt))
29 |
30 |
31 | def prBlack(prt): print("\033[98m {}\033[00m".format(prt))
32 |
33 |
34 | def mat_to_euler_2d(rot_mat):
35 | """
36 | rot_mat has shape:
37 | [[c -s 0],
38 | [s c 0],
39 | [0 0 1]]
40 | """
41 |
42 | theta = np.arcsin(rot_mat[1, 0])
43 | return theta
44 |
45 |
46 | def euler_to_mat_2d(theta_batch):
47 | s = np.sin(theta_batch)
48 | c = np.cos(theta_batch)
49 | Rs = np.zeros((theta_batch.shape[0], 2, 2))
50 | Rs[:, 0, 0] = c
51 | Rs[:, 0, 1] = -s
52 | Rs[:, 1, 0] = s
53 | Rs[:, 1, 1] = c
54 | return Rs
55 |
56 | def to_numpy(x):
57 | # convert torch tensor to numpy array
58 | return x.cpu().detach().double().numpy()
59 |
60 | def to_tensor(x, dtype, device, requires_grad=False):
61 | # convert numpy array to torch tensor
62 | return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad)
63 |
64 | def scale_action(action, action_lb, action_ub, device=None):
65 |
66 | act_k = (action_ub - action_lb) / 2.
67 | act_b = (action_ub + action_lb) / 2.
68 | return act_k * action + act_b
69 |
70 |
71 | def soft_update(target, source, tau):
72 | for target_param, param in zip(target.parameters(), source.parameters()):
73 | target_param.data.copy_(
74 | target_param.data * (1.0 - tau) + param.data * tau
75 | )
76 |
77 |
78 | def hard_update(target, source):
79 | for target_param, param in zip(target.parameters(), source.parameters()):
80 | target_param.data.copy_(param.data)
81 |
82 |
83 | def create_log_gaussian(mean, log_std, t):
84 | quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2))
85 | l = mean.shape
86 | log_z = log_std
87 | z = l[-1] * math.log(2 * math.pi)
88 | log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z
89 | return log_p
90 |
91 |
92 | def logsumexp(inputs, dim=None, keepdim=False):
93 | if dim is None:
94 | inputs = inputs.view(-1)
95 | dim = 0
96 | s, _ = torch.max(inputs, dim=dim, keepdim=True)
97 | outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log()
98 | if not keepdim:
99 | outputs = outputs.squeeze(dim)
100 | return outputs
101 |
102 |
103 | def get_output_folder(parent_dir, env_name):
104 | """Return save folder.
105 |
106 | Assumes folders in the parent_dir have suffix -run{run
107 | number}. Finds the highest run number and sets the output folder
108 | to that number + 1. This is just convenient so that if you run the
109 | same script multiple times tensorboard can plot all of the results
110 | on the same plots with different names.
111 |
112 | Parameters
113 | ----------
114 | parent_dir: str
115 | Path of the directory containing all experiment runs.
116 |
117 | Returns
118 | -------
119 | parent_dir/run_dir
120 | Path to this run's save directory.
121 | """
122 | os.makedirs(parent_dir, exist_ok=True)
123 | experiment_id = 0
124 | for folder_name in os.listdir(parent_dir):
125 | if not os.path.isdir(os.path.join(parent_dir, folder_name)):
126 | continue
127 | try:
128 | folder_name = int(folder_name.split('-run')[-1])
129 | if folder_name > experiment_id:
130 | experiment_id = folder_name
131 | except:
132 | pass
133 | experiment_id += 1
134 |
135 | parent_dir = os.path.join(parent_dir, env_name)
136 | parent_dir = parent_dir + '-run{}'.format(experiment_id)
137 | os.makedirs(parent_dir, exist_ok=True)
138 | return parent_dir
139 |
140 |
141 | def get_wrapped_policy(agent, cbf_wrapper, dynamics_model, compensator=None, warmup=False, action_space=None,
142 | policy_eval=False):
143 |
144 | def wrapped_policy(observation):
145 |
146 | if warmup and action_space:
147 | action = action_space.sample() # Sample random action
148 | else:
149 | action = agent.select_action(observation, evaluate=policy_eval) # Sample action from policy
150 |
151 | if compensator:
152 | action_comp = compensator(observation)
153 | else:
154 | action_comp = 0
155 | state = dynamics_model.get_state(observation)
156 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state)
157 | action_safe = cbf_wrapper.get_u_safe(action + action_comp, state, disturb_mean, disturb_std)
158 | # print('state = {}, action = {}, action_comp = {}, u_safe = {}'.format(state, action, action_comp, u_safe))
159 | return action + action_comp + action_safe
160 |
161 | return wrapped_policy
162 |
--------------------------------------------------------------------------------
/rcbf_sac/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.distributions import Normal
5 |
6 | LOG_SIG_MAX = 2
7 | LOG_SIG_MIN = -20
8 | epsilon = 1e-6
9 |
10 | # Initialize Policy weights
11 | def weights_init_(m):
12 | if isinstance(m, nn.Linear):
13 | torch.nn.init.xavier_uniform_(m.weight, gain=1)
14 | torch.nn.init.constant_(m.bias, 0)
15 |
16 |
17 | class ValueNetwork(nn.Module):
18 | def __init__(self, num_inputs, hidden_dim):
19 | super(ValueNetwork, self).__init__()
20 |
21 | self.linear1 = nn.Linear(num_inputs, hidden_dim)
22 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
23 | self.linear3 = nn.Linear(hidden_dim, 1)
24 |
25 | self.apply(weights_init_)
26 |
27 | def forward(self, state):
28 | x = F.relu(self.linear1(state))
29 | x = F.relu(self.linear2(x))
30 | x = self.linear3(x)
31 | return x
32 |
33 |
34 | class QNetwork(nn.Module):
35 | def __init__(self, num_inputs, num_actions, hidden_dim):
36 | super(QNetwork, self).__init__()
37 |
38 | # Q1 architecture
39 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim)
40 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
41 | self.linear3 = nn.Linear(hidden_dim, 1)
42 |
43 | # Q2 architecture
44 | self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
45 | self.linear5 = nn.Linear(hidden_dim, hidden_dim)
46 | self.linear6 = nn.Linear(hidden_dim, 1)
47 |
48 | self.apply(weights_init_)
49 |
50 | def forward(self, state, action):
51 | xu = torch.cat([state, action], 1)
52 |
53 | x1 = F.relu(self.linear1(xu))
54 | x1 = F.relu(self.linear2(x1))
55 | x1 = self.linear3(x1)
56 |
57 | x2 = F.relu(self.linear4(xu))
58 | x2 = F.relu(self.linear5(x2))
59 | x2 = self.linear6(x2)
60 |
61 | return x1, x2
62 |
63 |
64 | class GaussianPolicy(nn.Module):
65 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
66 | super(GaussianPolicy, self).__init__()
67 |
68 | self.linear1 = nn.Linear(num_inputs, hidden_dim)
69 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
70 |
71 | self.mean_linear = nn.Linear(hidden_dim, num_actions)
72 | self.log_std_linear = nn.Linear(hidden_dim, num_actions)
73 |
74 | self.apply(weights_init_)
75 |
76 | # action rescaling
77 | if action_space is None:
78 | self.action_scale = torch.tensor(1.)
79 | self.action_bias = torch.tensor(0.)
80 | else:
81 | self.action_scale = torch.FloatTensor(
82 | (action_space.high - action_space.low) / 2.)
83 | self.action_bias = torch.FloatTensor(
84 | (action_space.high + action_space.low) / 2.)
85 |
86 | def forward(self, state):
87 | x = F.relu(self.linear1(state))
88 | x = F.relu(self.linear2(x))
89 | mean = self.mean_linear(x)
90 | log_std = self.log_std_linear(x)
91 | log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
92 | return mean, log_std
93 |
94 | def sample(self, state):
95 | mean, log_std = self.forward(state)
96 | std = log_std.exp()
97 | normal = Normal(mean, std)
98 | x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1))
99 | y_t = torch.tanh(x_t)
100 | action = y_t * self.action_scale + self.action_bias
101 | log_prob = normal.log_prob(x_t)
102 | # Enforcing Action Bound
103 | log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
104 | log_prob = log_prob.sum(1, keepdim=True)
105 | mean = torch.tanh(mean) * self.action_scale + self.action_bias
106 | return action, log_prob, mean
107 |
108 | def to(self, device):
109 | self.action_scale = self.action_scale.to(device)
110 | self.action_bias = self.action_bias.to(device)
111 | return super(GaussianPolicy, self).to(device)
112 |
113 |
114 | class DeterministicPolicy(nn.Module):
115 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
116 | super(DeterministicPolicy, self).__init__()
117 | self.linear1 = nn.Linear(num_inputs, hidden_dim)
118 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
119 |
120 | self.mean = nn.Linear(hidden_dim, num_actions)
121 | self.noise = torch.Tensor(num_actions)
122 |
123 | self.apply(weights_init_)
124 |
125 | # action rescaling
126 | if action_space is None:
127 |             self.action_scale = torch.tensor(1.)
128 |             self.action_bias = torch.tensor(0.)
129 | else:
130 | self.action_scale = torch.FloatTensor(
131 | (action_space.high - action_space.low) / 2.)
132 | self.action_bias = torch.FloatTensor(
133 | (action_space.high + action_space.low) / 2.)
134 |
135 | def forward(self, state):
136 | x = F.relu(self.linear1(state))
137 | x = F.relu(self.linear2(x))
138 | mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias
139 | return mean
140 |
141 | def sample(self, state):
142 | mean = self.forward(state)
143 | noise = self.noise.normal_(0., std=0.1)
144 | noise = noise.clamp(-0.25, 0.25)
145 | action = mean + noise
146 | return action, torch.tensor(0.), mean
147 |
148 | def to(self, device):
149 | self.action_scale = self.action_scale.to(device)
150 | self.action_bias = self.action_bias.to(device)
151 | self.noise = self.noise.to(device)
152 | return super(DeterministicPolicy, self).to(device)
153 |
--------------------------------------------------------------------------------
/rcbf_sac/gp_model.py:
--------------------------------------------------------------------------------
1 | """ Adapted almost directly from:
2 | https://docs.gpytorch.ai/en/stable/examples/02_Scalable_Exact_GPs/Simple_GP_Regression_CUDA.html
3 |
4 | Training is performed rapidly (and exactly) using GPUs and prediction is done very rapidly using LOVE.
5 | """
6 |
7 | import torch
8 | import gpytorch
9 | from rcbf_sac.utils import to_tensor, to_numpy
10 |
11 |
12 | class BaseGPy(gpytorch.models.ExactGP):
13 |
14 | def __init__(self, train_x, train_y, prior_std, likelihood):
15 | super().__init__(train_x, train_y, likelihood)
16 | self.mean_module = gpytorch.means.ZeroMean()
17 | self.covar_module = gpytorch.kernels.ScaleKernel(
18 | gpytorch.kernels.RBFKernel(lengthscale_prior=gpytorch.priors.NormalPrior(1e5, 1e-5)),
19 | outputscale_prior=gpytorch.priors.NormalPrior(prior_std + 1e-6, 1e-5))
20 | # Initialize lengthscale and outputscale to mean of priors
21 | self.covar_module.base_kernel.lengthscale = 1e5
22 | self.covar_module.outputscale = prior_std + 1e-6
23 |
24 | def forward(self, x):
25 | mean = self.mean_module(x)
26 | covar = self.covar_module(x)
27 | return gpytorch.distributions.MultivariateNormal(mean, covar)
28 |
29 |
30 | class GPyDisturbanceEstimator:
31 | """
 32 |     A wrapper around the BaseGPy model above.
33 | """
34 |
35 | def __init__(self, train_x, train_y, prior_std, likelihood=None, device=None):
36 |
37 |
38 |
39 | if device:
40 | self.device = device
41 | else:
42 | self.device = torch.device("cpu")
43 |
44 | if not torch.is_tensor(train_x):
45 | train_x = to_tensor(train_x, torch.FloatTensor, self.device)
46 | if not torch.is_tensor(train_y):
47 | train_y = to_tensor(train_y, torch.FloatTensor, self.device)
48 | self.train_x = train_x
49 | self.train_y = train_y
50 |
51 | if not likelihood:
52 | likelihood = gpytorch.likelihoods.GaussianLikelihood()
53 | self.likelihood = likelihood.to(self.device)
54 |
55 | self.model = BaseGPy(train_x, train_y, prior_std, likelihood)
56 | self.model = self.model.to(self.device)
57 |
58 | def train(self, training_iter, verbose=False):
59 |
60 | # Find optimal model hyperparameters
61 | self.model.train()
62 | self.likelihood.train()
63 |
64 | # Use the adam optimizer
65 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) # Includes GaussianLikelihood parameters
66 |
67 | # "Loss" for GPs - the marginal log likelihood
68 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)
69 |
70 | for i in range(training_iter):
71 | # Zero gradients from previous iteration
72 | optimizer.zero_grad()
73 | # Output from model
74 | output = self.model(self.train_x)
75 | # Calc loss and backprop gradients
76 | loss = -mll(output, self.train_y)
77 | loss.backward()
78 | if verbose:
79 | print('\tIter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % (
80 | i + 1, training_iter, loss.item(),
81 | self.model.covar_module.base_kernel.lengthscale.item(),
82 | self.model.likelihood.noise.item()
83 | ))
84 | optimizer.step()
85 |
86 | def predict(self, test_x):
87 |
88 | # Convert to torch tensor
89 | is_tensor = torch.is_tensor(test_x)
90 | if not is_tensor:
91 | test_x = to_tensor(test_x, torch.FloatTensor, self.device)
92 |
93 | # Get into evaluation (predictive posterior) mode
94 | self.model.eval()
95 | self.likelihood.eval()
96 |
97 | # Test points are regularly spaced along [0,1]
98 | # Make predictions by feeding model through likelihood
99 | with torch.no_grad(), gpytorch.settings.fast_pred_var():
100 | observed_pred = self.likelihood(self.model(test_x))
101 | pred_dict = dict()
102 | pred_dict['mean'] = observed_pred.mean.cpu()
103 | pred_dict['f_var'] = observed_pred.variance.cpu()
104 | pred_dict['f_covar'] = observed_pred.covariance_matrix.cpu()
105 | lower_ci, upper_ci = observed_pred.confidence_region()
106 | pred_dict['lower_ci'] = lower_ci.cpu()
107 | pred_dict['upper_ci'] = upper_ci.cpu()
108 |
109 | # If they gave us ndarray, we give back ndarray
110 | if not is_tensor:
111 | for key, val in pred_dict.items():
112 | pred_dict[key] = to_numpy(val)
113 |
114 | return pred_dict
115 |
116 |
117 | if __name__ == '__main__':
118 | """
119 | Simple code to test the GP model on a simple dataset.
120 | """
121 |
122 |     # smoke_test (set under CI) shortens training; carried over from the GPyTorch tutorial this is adapted from
123 | import os
124 | import matplotlib.pyplot as plt
125 | import math
126 |
127 | smoke_test = ('CI' in os.environ)
128 | training_iter = 2 if smoke_test else 50
129 |     # Training data is 100 points in [0,1] regularly spaced
130 | train_x = torch.linspace(0, 1, 100)
131 | # True function is sin(2*pi*x) with Gaussian noise
132 | train_y = torch.sin(train_x * (2 * math.pi)) + torch.randn(train_x.size()) * 0.2
133 |
134 | disturb_estimator = GPyDisturbanceEstimator(train_x, train_y, 0.0)
135 | disturb_estimator.train(training_iter)
136 |
137 | print('Testing model...')
138 | test_x = torch.linspace(0, 1, 51)
139 | prediction = disturb_estimator.predict(test_x)
140 | train_x = train_x.cpu()
141 | train_y = train_y.cpu()
142 | test_x = test_x.cpu()
143 |
144 | with torch.no_grad():
145 | # Initialize plot
146 | f, ax = plt.subplots(1, 1, figsize=(4, 3))
147 | # Plot training data as black stars
148 | ax.plot(train_x.numpy(), train_y.numpy(), 'k*')
149 | # Plot predictive means as blue line
150 | ax.plot(test_x.numpy(), prediction['mean'].numpy(), 'b')
151 | # Shade between the lower and upper confidence bounds
152 | ax.fill_between(test_x.numpy(), prediction['lower_ci'].numpy(), prediction['upper_ci'].numpy(), alpha=0.5)
153 | ax.set_ylim([-3, 3])
154 | ax.legend(['Observed Data', 'Mean', 'Confidence'])
155 | plt.show()
--------------------------------------------------------------------------------
/envs/simulated_cars_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym import spaces
4 |
5 |
6 | class SimulatedCarsEnv(gym.Env):
7 | """Simulated Car Following Env, almost identical to https://github.com/rcheng805/RL-CBF/blob/master/car/DDPG/car_simulator.py
8 | Front <- Car 1 <- Car 2 <- Car 3 <- Car 4 (controlled) <- Car 5
9 | """
10 |
11 | metadata = {'render.modes': ['human']}
12 |
13 | def __init__(self):
14 |
15 | super(SimulatedCarsEnv, self).__init__()
16 |
17 | self.dynamics_mode = 'SimulatedCars'
18 | self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))
19 | self.safe_action_space = spaces.Box(low=-10.0, high=10.0, shape=(1,))
20 | self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(10,))
21 | self.max_episode_steps = 300
22 | self.dt = 0.02
23 |
24 | # Gains
25 | self.kp = 4.0
26 | self.k_brake = 20.0
27 |
28 | self.state = None # State [x_1 v_1 ... x_5 v_5]
29 | self.t = 0 # Time
30 | self.episode_step = 0 # Episode Step
31 |
32 | # Gaussian Noise Parameters on the accelerations of the other vehicles
33 | self.disturb_mean = np.zeros((1,))
34 | self.disturb_covar = np.diag([0.2**2])
35 |
36 | self.reset()
37 |
38 | def step(self, action):
 39 |         """Advance the simulation by one time step.
40 |
41 | Parameters
42 | ----------
43 | action : ndarray
 44 |             Action that the agent takes in the environment (acceleration command for car 4)
45 |
46 | Returns
47 | -------
48 | new_obs : ndarray
 49 |             The new observation (scaled positions and velocities of the 5 cars):
 50 |             [car_1_x, car_1_v, car_2_x, car_2_v, ..., car_5_x, car_5_v]
51 |
52 | """
53 |
54 | # Current State
55 | pos = self.state[::2]
56 | vels = self.state[1::2]
57 |
58 | # Actions (accelerations of Cars 1 to 5)
59 | vels_des = 30.0 * np.ones(5) # Desired velocities
60 | vels_des[0] -= 10*np.sin(0.2*self.t)
61 | accels = self.kp * (vels_des - vels)
62 | accels[1] += -self.k_brake * (pos[0] - pos[1]) * ((pos[0] - pos[1]) < 6.0)
63 | accels[2] += -self.k_brake * (pos[1] - pos[2]) * ((pos[1] - pos[2]) < 6.0)
64 | accels[4] += -self.k_brake * (pos[2] - pos[4]) * ((pos[2] - pos[4]) < 13.0)
65 |
66 | # Add deterministic disturbance
67 | accels *= 1.1
68 |
69 | # Determine action of each car
70 | f_x = np.zeros(10)
71 | g_x = np.zeros(10)
72 |
73 | f_x[::2] = vels # Derivatives of positions are velocities
74 | f_x[1::2] = accels # Derivatives of velocities are acceleration
75 | g_x[7] = 50.0 # Car 4's acceleration (idx = 2*4 - 1) is the control input
76 |
77 | self.state += self.dt * (f_x + g_x * action)
78 |
79 | self.t = self.t + self.dt # time
80 |
81 | self.episode_step += 1 # steps in episode
82 |
83 | done = self.episode_step >= self.max_episode_steps # done?
84 |
85 | info = {'cost': self._get_cost(), 'goal_met': False} # Goal is never met since we're driving into the sunset
86 |
87 | return self._get_obs(), self._get_reward(action[0]), done, info
88 |
89 | def _get_reward(self, action):
90 |
91 | # car_4_vel = self.state[7] # car's 4 velocity
92 | # return -np.abs(car_4_vel) * np.abs(action) * (action > 0) / self.max_episode_steps
93 | return -5.0 * np.abs(action**2) / self.max_episode_steps
94 |
95 | def _get_cost(self):
96 |
97 | car_4_pos = self.state[6] # car's 4 position
98 | cost = 0
99 |
100 | if (self.state[4] - car_4_pos) < 2.99: # How far is car 3?
101 | cost -= 0.1
102 |
103 |         if (car_4_pos - self.state[8]) < 2.99:  # How far is car 5?
104 | cost -= 0.1
105 |
106 | return cost
107 |
108 | def reset(self):
109 | """ Reset the state of the environment to an initial state.
110 |
111 | Returns
112 | -------
113 | observation : ndarray
114 | Next observation.
115 | """
116 |
117 | self.t = 0
118 |         self.state = np.zeros(10)  # [x_1, v_1, x_2, v_2, ..., x_5, v_5] (positions and velocities interleaved)
119 | self.state[::2] = [34.0, 28.0, 22.0, 16.0, 10.0] # initial positions
120 | self.state[1::2] = 30.0 + np.random.normal(0, 0.5) # initial velocities
121 | self.state[7] = 35.0 # initial velocity of car 4
122 |
123 | self.episode_step = 0
124 |
125 | return self._get_obs()
126 |
127 |
128 | def render(self, mode='human', close=False):
129 | """Render the environment to the screen
130 |
131 | Parameters
132 | ----------
133 | mode : str
134 | close : bool
135 |
136 | Returns
137 | -------
138 |
139 | """
140 |
141 | print('Ep_step = {}, \tState = {}'.format(self.episode_step, self.state))
142 |
143 | def _get_obs(self):
144 |         """Return the current observation (the scaled state), matching the observation returned by env.step.
145 |
146 | Parameters
147 | ----------
148 |
149 | Returns
150 | -------
151 | observation : ndarray
152 |             Observation: [car_1_x, car_1_v, car_2_x, car_2_v, ...] (positions scaled by 1/100, velocities by 1/30)
153 | """
154 |
155 | obs = np.copy(np.ravel(self.state))
156 | obs[::2] /= 100.0 # scale positions
157 | obs[1::2] /= 30.0 # scale velocities
158 | return obs
159 |
160 |
161 | if __name__ == "__main__":
162 |
163 | import matplotlib.pyplot as plt
164 | from rcbf_sac.cbf_qp import CascadeCBFLayer
165 | from rcbf_sac.dynamics import DynamicsModel
166 | import argparse
167 |
168 | parser = argparse.ArgumentParser()
169 | parser.add_argument('--gp_model_size', default=2000, type=int, help='gp')
170 | parser.add_argument('--k_d', default=3.0, type=float)
171 | parser.add_argument('--gamma_b', default=20, type=float)
172 | parser.add_argument('--cuda', action="store_true", help='run on CUDA (default: False)')
173 | args = parser.parse_args()
174 |
175 | env = SimulatedCarsEnv()
176 | dynamics_model = DynamicsModel(env, args)
177 | cbf_wrapper = CascadeCBFLayer(env, gamma_b=args.gamma_b, k_d=args.k_d)
178 |
179 | obs = env.reset()
180 | state = dynamics_model.get_state(obs)
181 | done = False
182 | episode_reward = 0
183 | episode_step = 0
184 |
185 | # Plot initial state
186 | car_patches = []
187 | plt.figure(figsize=(20, 5), dpi=80)
188 | for i in range(5):
189 | car_patches.append(plt.Rectangle((state[2*i] - 1.5, -1.0), 3.0, 2.0, fc='blue', ec='blue', alpha=0.2))
190 | plt.gca().add_patch(car_patches[i])
191 | p_vel = plt.quiver(state[::2], np.zeros(5), state[1::2], np.zeros(5))
192 | plt.ylim([-6.0, 6.0])
193 | plt.grid()
194 |
195 | def controller(state):
196 | gain = 1.0
197 | action = np.array([gain * (state[4] - state[6] - 0.4) * (state[4] - state[6] - 0.4 < 0)])
198 | action += np.array([gain * (state[8] - state[6] + 0.4) * (state[8] - state[6] + 0.4 > 0)])
199 | return action
200 |
201 | while not done:
202 | # Plot current state
203 | state = dynamics_model.get_state(obs)
204 | print('obs = {}'.format(obs))
205 | print('state = {}'.format(state))
206 | pos = state[::2]
207 | for i in range(5):
208 | car_patches[i].set_x(state[2*i] - 1.5)
209 | p_vel.XY[:, 0] = state[::2]
210 | p_vel.set_UVC(state[1::2], np.zeros(5))
211 | # Take Action and get next state
212 | # random_action = env.action_space.sample()
213 | random_action = controller(state)
214 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state)
215 | action_safe = cbf_wrapper.get_u_safe(random_action, state, disturb_mean, disturb_std)
216 | # Predict next state (testing for model-based rollouts)
217 | # Note that in this env, obs and state are the same but that's not always the case!
218 | next_state, next_state_std, _ = dynamics_model.predict_next_state(obs, random_action + action_safe, t_batch=np.array([env.dt * episode_step]), use_gps=False)
219 | # Take Environment Action
220 | obs, reward, done, info = env.step(random_action + action_safe)
221 | # assert np.sum(np.abs(next_state - obs)) < 1e-6, 'Predicted and Actual Next States are not the same!\nPredicted: {} \nActual: {}'.format(next_state, obs)
222 | plt.xlim([pos[-1] - 5.0, pos[0] + 5.0])
223 | plt.pause(0.01)
224 | episode_reward += reward
225 | episode_step += 1
226 | print('action = {},\taction_cbf = {},\tepisode_reward = {:.3f}'.format(random_action + action_safe, action_safe, episode_reward))
227 |
228 | plt.show()
229 |
--------------------------------------------------------------------------------
/rcbf_sac/sac_cbf.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch.optim import Adam
4 | from rcbf_sac.utils import soft_update, hard_update
5 | from rcbf_sac.model import GaussianPolicy, QNetwork, DeterministicPolicy
6 | import numpy as np
7 | from rcbf_sac.diff_cbf_qp import CBFQPLayer
8 | from rcbf_sac.utils import to_tensor
9 | from rcbf_sac.compensator import Compensator
10 | from copy import deepcopy
11 |
12 | class RCBF_SAC(object):
13 |
14 | def __init__(self, num_inputs, action_space, env, args):
15 |
16 | self.gamma = args.gamma
17 | self.tau = args.tau
18 | self.alpha = args.alpha
19 |
20 | self.policy_type = args.policy
21 | self.target_update_interval = args.target_update_interval
22 | self.automatic_entropy_tuning = args.automatic_entropy_tuning
23 | self.action_space = action_space
24 | self.device = torch.device("cuda" if args.cuda else "cpu")
25 |
26 | self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
27 | self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
28 |
29 | self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
30 | hard_update(self.critic_target, self.critic)
31 |
32 | if self.policy_type == "Gaussian":
33 | # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
34 | if self.automatic_entropy_tuning is True:
35 | self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
36 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
37 | self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
38 |
39 | self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
40 | self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
41 |
42 | else:
43 | self.alpha = 0
44 | self.automatic_entropy_tuning = False
45 | self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
46 | self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
47 |
48 | # CBF layer
49 | self.env = env
50 | self.cbf_layer = CBFQPLayer(env, args, args.gamma_b, args.k_d, args.l_p)
51 | self.diff_qp = args.diff_qp
52 |
53 | # compensator
54 | if args.use_comp:
55 | self.compensator = Compensator(num_inputs, action_space.shape[0], action_space.low, action_space.high, args)
56 | else:
57 | self.compensator = None
58 |
59 | def select_action(self, state, dynamics_model, evaluate=False, warmup=False):
60 |
61 | state = to_tensor(state, torch.FloatTensor, self.device)
62 | expand_dim = len(state.shape) == 1
63 | if expand_dim:
64 | state = state.unsqueeze(0)
65 |
66 | if warmup:
67 | batch_size = state.shape[0]
68 | action = torch.zeros((batch_size, self.action_space.shape[0])).to(self.device)
69 | for i in range(batch_size):
70 | action[i] = torch.from_numpy(self.action_space.sample()).to(self.device)
71 | else:
72 | if evaluate is False:
73 | action, _, _ = self.policy.sample(state)
74 | else:
75 | _, _, action = self.policy.sample(state)
76 |
77 | if self.compensator:
78 | action_comp = self.compensator(state)
79 | action += action_comp
80 |
81 | safe_action = self.get_safe_action(state, action, dynamics_model)
82 |
83 | if not self.compensator:
84 | return safe_action.detach().cpu().numpy()[0] if expand_dim else safe_action.detach().cpu().numpy()
85 | else:
86 | action_cbf = safe_action - action
87 | action_comp = action_comp.detach().cpu().numpy()[0] if expand_dim else action_comp.detach().cpu().numpy()
88 | action_cbf = action_cbf.detach().cpu().numpy()[0] if expand_dim else action_cbf.detach().cpu().numpy()
89 | safe_action = safe_action.detach().cpu().numpy()[0] if expand_dim else safe_action.detach().cpu().numpy()
90 | return safe_action, action_comp, action_cbf
91 |
92 | def update_parameters(self, memory, batch_size, updates, dynamics_model, memory_model=None, real_ratio=None):
93 | """
94 |
95 | Parameters
96 | ----------
97 | memory : ReplayMemory
98 | batch_size : int
99 | updates : int
100 |         dynamics_model : DynamicsModel providing the GP disturbance estimate D(x) in x_dot = f(x) + g(x)u + D(x)
101 | memory_model : ReplayMemory, optional
102 | If not none, perform model-based RL.
103 | real_ratio : float, optional
104 | If performing model-based RL, then real_ratio*batch_size are sampled from the real buffer, and the rest
105 | is sampled from the model buffer.
106 | Returns
107 | -------
108 |
109 | """
110 |
111 | # Model-based vs regular RL
112 | if memory_model and real_ratio:
113 | state_batch, action_batch, reward_batch, next_state_batch, mask_batch, t_batch, next_t_batch = memory.sample(batch_size=int(real_ratio*batch_size))
114 | state_batch_m, action_batch_m, reward_batch_m, next_state_batch_m, mask_batch_m, t_batch_m, next_t_batch_m = memory_model.sample(
115 | batch_size=int((1-real_ratio) * batch_size))
116 | state_batch = np.vstack((state_batch, state_batch_m))
117 | action_batch = np.vstack((action_batch, action_batch_m))
118 | reward_batch = np.hstack((reward_batch, reward_batch_m))
119 | next_state_batch = np.vstack((next_state_batch, next_state_batch_m))
120 | mask_batch = np.hstack((mask_batch, mask_batch_m))
121 | else:
122 | state_batch, action_batch, reward_batch, next_state_batch, mask_batch, t_batch, next_t_batch = memory.sample(batch_size=batch_size)
123 |
124 | state_batch = torch.FloatTensor(state_batch).to(self.device)
125 | next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
126 | action_batch = torch.FloatTensor(action_batch).to(self.device)
127 | reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
128 | mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
129 |
130 | with torch.no_grad():
131 | next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
132 | if self.diff_qp: # Compute next safe actions using Differentiable CBF-QP
133 | next_state_action = self.get_safe_action(next_state_batch, next_state_action, dynamics_model)
134 | qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
135 | min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
136 | next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)
137 | qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step
138 | qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
139 | qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
140 | qf_loss = qf1_loss + qf2_loss
141 |
142 | self.critic_optim.zero_grad()
143 | qf_loss.backward()
144 | self.critic_optim.step()
145 |
146 | # Compute Actions and log probabilities
147 | pi, log_pi, _ = self.policy.sample(state_batch)
148 | if self.diff_qp: # Compute safe action using Differentiable CBF-QP
149 | pi = self.get_safe_action(state_batch, pi, dynamics_model)
150 |
151 | qf1_pi, qf2_pi = self.critic(state_batch, pi)
152 | min_qf_pi = torch.min(qf1_pi, qf2_pi)
153 |
154 | policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
155 |
156 | self.policy_optim.zero_grad()
157 | policy_loss.backward()
158 | self.policy_optim.step()
159 |
160 | if self.automatic_entropy_tuning:
161 | alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
162 |
163 | self.alpha_optim.zero_grad()
164 | alpha_loss.backward()
165 | self.alpha_optim.step()
166 |
167 | self.alpha = self.log_alpha.exp()
168 | alpha_tlogs = self.alpha.clone() # For Comet.ml logs
169 | else:
170 | alpha_loss = torch.tensor(0.).to(self.device)
171 | alpha_tlogs = torch.tensor(self.alpha) # For Comet.ml logs
172 |
173 | if updates % self.target_update_interval == 0:
174 | soft_update(self.critic_target, self.critic, self.tau)
175 |
176 | return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()
177 |
178 | def update_parameters_compensator(self, comp_rollouts):
179 |
180 | if self.compensator:
181 | self.compensator.train(comp_rollouts)
182 |
183 |
184 | # Save model parameters
185 | def save_model(self, output):
186 | print('Saving models in {}'.format(output))
187 | torch.save(
188 | self.policy.state_dict(),
189 | '{}/actor.pkl'.format(output)
190 | )
191 | torch.save(
192 | self.critic.state_dict(),
193 | '{}/critic.pkl'.format(output)
194 | )
195 | if self.compensator:
196 | self.compensator.save_model(output)
197 |
198 | # Load model parameters
199 | def load_weights(self, output):
200 | if output is None: return
201 | print('Loading models from {}'.format(output))
202 |
203 | self.policy.load_state_dict(
204 | torch.load('{}/actor.pkl'.format(output), map_location=torch.device(self.device))
205 | )
206 | self.critic.load_state_dict(
207 | torch.load('{}/critic.pkl'.format(output), map_location=torch.device(self.device))
208 | )
209 | if self.compensator:
210 | self.compensator.load_weights(output)
211 |
212 | def load_model(self, actor_path, critic_path):
213 | if actor_path is not None:
214 | self.policy.load_state_dict(torch.load(actor_path))
215 | if critic_path is not None:
216 | self.critic.load_state_dict(torch.load(critic_path))
217 |
218 | def get_safe_action(self, obs_batch, action_batch, dynamics_model):
219 | """Given a nominal action, returns a minimally-altered safe action to take.
220 |
221 | Parameters
222 | ----------
223 | obs_batch : torch.tensor
224 | action_batch : torch.tensor
225 | dynamics_model : DynamicsModel
226 |
227 | Returns
228 | -------
229 | safe_action_batch : torch.tensor
230 | Safe actions to be taken (cbf_action + action).
231 | """
232 |
233 | state_batch = dynamics_model.get_state(obs_batch)
234 | mean_pred_batch, sigma_pred_batch = dynamics_model.predict_disturbance(state_batch)
235 |
236 | safe_action_batch = self.cbf_layer.get_safe_action(state_batch, action_batch, mean_pred_batch, sigma_pred_batch)
237 |
238 | return safe_action_batch
239 |
240 |
241 |
242 |
--------------------------------------------------------------------------------
/envs/pyglet_rendering.py:
--------------------------------------------------------------------------------
1 | """
2 | 2D rendering framework (Obtained from: https://github.com/openai/gym/blob/95d649fdeca89bfed5e199829d4d905a52d2da2e/gym/utils/pyglet_rendering.py)
3 | """
4 | import os
5 | import sys
6 |
7 | if "Apple" in sys.version:
8 | if "DYLD_FALLBACK_LIBRARY_PATH" in os.environ:
9 | os.environ["DYLD_FALLBACK_LIBRARY_PATH"] += ":/usr/lib"
10 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
11 |
12 | from gym import error
13 |
14 | try:
15 | import pyglet
16 | except ImportError as e:
17 | raise ImportError(
18 | """
19 | Cannot import pyglet.
20 | HINT: you can install pyglet directly via 'pip install pyglet'.
21 | But if you really just want to install all Gym dependencies and not have to think about it,
22 | 'pip install -e .[all]' or 'pip install gym[all]' will do it.
23 | """
24 | )
25 |
26 | try:
27 | from pyglet.gl import *
28 | except ImportError as e:
29 | raise ImportError(
30 | """
31 | Error occurred while running `from pyglet.gl import *`
32 | HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'.
33 | If you're running on a server, you may need a virtual frame buffer; something like this should work:
34 | 'xvfb-run -s \"-screen 0 1400x900x24\" python '
35 | """
36 | )
37 |
38 | import math
39 | import numpy as np
40 |
41 | RAD2DEG = 57.29577951308232
42 |
43 |
44 | def get_display(spec):
45 | """Convert a display specification (such as :0) into an actual Display
46 | object.
47 | Pyglet only supports multiple Displays on Linux.
48 | """
49 | if spec is None:
50 | return pyglet.canvas.get_display()
51 | # returns already available pyglet_display,
52 | # if there is no pyglet display available then it creates one
53 | elif isinstance(spec, str):
54 | return pyglet.canvas.Display(spec)
55 | else:
56 | raise error.Error(
57 | f"Invalid display specification: {spec}. (Must be a string like :0 or None.)"
58 | )
59 |
60 |
61 | def get_window(width, height, display, **kwargs):
62 | """
63 | Will create a pyglet window from the display specification provided.
64 | """
65 | screen = display.get_screens() # available screens
66 | config = screen[0].get_best_config() # selecting the first screen
67 | context = config.create_context(None) # create GL context
68 |
69 | return pyglet.window.Window(
70 | width=width,
71 | height=height,
72 | display=display,
73 | config=config,
74 | context=context,
75 | **kwargs,
76 | )
77 |
78 |
79 | class Viewer:
80 | def __init__(self, width, height, display=None):
81 | display = get_display(display)
82 |
83 | self.width = width
84 | self.height = height
85 | self.window = get_window(width=width, height=height, display=display)
86 | self.window.on_close = self.window_closed_by_user
87 | self.isopen = True
88 | self.geoms = []
89 | self.onetime_geoms = []
90 | self.transform = Transform()
91 |
92 | glEnable(GL_BLEND)
93 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
94 |
95 | def close(self):
96 | if self.isopen and sys.meta_path:
97 | # ^^^ check sys.meta_path to avoid 'ImportError: sys.meta_path is None, Python is likely shutting down'
98 | self.window.close()
99 | self.isopen = False
100 |
101 | def window_closed_by_user(self):
102 | self.isopen = False
103 |
104 | def set_bounds(self, left, right, bottom, top):
105 | assert right > left and top > bottom
106 | scalex = self.width / (right - left)
107 | scaley = self.height / (top - bottom)
108 | self.transform = Transform(
109 | translation=(-left * scalex, -bottom * scaley), scale=(scalex, scaley)
110 | )
111 |
112 | def add_geom(self, geom):
113 | self.geoms.append(geom)
114 |
115 | def add_onetime(self, geom):
116 | self.onetime_geoms.append(geom)
117 |
118 | def render(self, return_rgb_array=False):
119 | glClearColor(1, 1, 1, 1)
120 | self.window.clear()
121 | self.window.switch_to()
122 | self.window.dispatch_events()
123 | self.transform.enable()
124 | for geom in self.geoms:
125 | geom.render()
126 | for geom in self.onetime_geoms:
127 | geom.render()
128 | self.transform.disable()
129 | arr = None
130 | if return_rgb_array:
131 | buffer = pyglet.image.get_buffer_manager().get_color_buffer()
132 | image_data = buffer.get_image_data()
133 | arr = np.frombuffer(image_data.get_data(), dtype=np.uint8)
134 | # In https://github.com/openai/gym-http-api/issues/2, we
135 | # discovered that someone using Xmonad on Arch was having
136 | # a window of size 598 x 398, though a 600 x 400 window
137 | # was requested. (Guess Xmonad was preserving a pixel for
138 | # the boundary.) So we use the buffer height/width rather
139 | # than the requested one.
140 | arr = arr.reshape(buffer.height, buffer.width, 4)
141 | arr = arr[::-1, :, 0:3]
142 | self.window.flip()
143 | self.onetime_geoms = []
144 | return arr if return_rgb_array else self.isopen
145 |
146 | # Convenience
147 | def draw_circle(self, radius=10, res=30, filled=True, **attrs):
148 | geom = make_circle(radius=radius, res=res, filled=filled)
149 | _add_attrs(geom, attrs)
150 | self.add_onetime(geom)
151 | return geom
152 |
153 | def draw_polygon(self, v, filled=True, **attrs):
154 | geom = make_polygon(v=v, filled=filled)
155 | _add_attrs(geom, attrs)
156 | self.add_onetime(geom)
157 | return geom
158 |
159 | def draw_polyline(self, v, **attrs):
160 | geom = make_polyline(v=v)
161 | _add_attrs(geom, attrs)
162 | self.add_onetime(geom)
163 | return geom
164 |
165 | def draw_line(self, start, end, **attrs):
166 | geom = Line(start, end)
167 | _add_attrs(geom, attrs)
168 | self.add_onetime(geom)
169 | return geom
170 |
171 | def get_array(self):
172 | self.window.flip()
173 | image_data = (
174 | pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
175 | )
176 | self.window.flip()
177 |         arr = np.frombuffer(image_data.get_data(), dtype=np.uint8)
178 | arr = arr.reshape(self.height, self.width, 4)
179 | return arr[::-1, :, 0:3]
180 |
181 | def __del__(self):
182 | self.close()
183 |
184 |
185 | def _add_attrs(geom, attrs):
186 | if "color" in attrs:
187 | geom.set_color(*attrs["color"])
188 | if "linewidth" in attrs:
189 | geom.set_linewidth(attrs["linewidth"])
190 |
191 |
192 | class Geom:
193 | def __init__(self):
194 | self._color = Color((0, 0, 0, 1.0))
195 | self.attrs = [self._color]
196 |
197 | def render(self):
198 | for attr in reversed(self.attrs):
199 | attr.enable()
200 | self.render1()
201 | for attr in self.attrs:
202 | attr.disable()
203 |
204 | def render1(self):
205 | raise NotImplementedError
206 |
207 | def add_attr(self, attr):
208 | self.attrs.append(attr)
209 |
210 | def set_color(self, r, g, b):
211 | self._color.vec4 = (r, g, b, 1)
212 |
213 |
214 | class Attr:
215 | def enable(self):
216 | raise NotImplementedError
217 |
218 | def disable(self):
219 | pass
220 |
221 |
222 | class Transform(Attr):
223 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1, 1)):
224 | self.set_translation(*translation)
225 | self.set_rotation(rotation)
226 | self.set_scale(*scale)
227 |
228 | def enable(self):
229 | glPushMatrix()
230 | glTranslatef(
231 | self.translation[0], self.translation[1], 0
232 |         )  # translate to GL location point
233 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
234 | glScalef(self.scale[0], self.scale[1], 1)
235 |
236 | def disable(self):
237 | glPopMatrix()
238 |
239 | def set_translation(self, newx, newy):
240 | self.translation = (float(newx), float(newy))
241 |
242 | def set_rotation(self, new):
243 | self.rotation = float(new)
244 |
245 | def set_scale(self, newx, newy):
246 | self.scale = (float(newx), float(newy))
247 |
248 |
249 | class Color(Attr):
250 | def __init__(self, vec4):
251 | self.vec4 = vec4
252 |
253 | def enable(self):
254 | glColor4f(*self.vec4)
255 |
256 |
257 | class LineStyle(Attr):
258 | def __init__(self, style):
259 | self.style = style
260 |
261 | def enable(self):
262 | glEnable(GL_LINE_STIPPLE)
263 | glLineStipple(1, self.style)
264 |
265 | def disable(self):
266 | glDisable(GL_LINE_STIPPLE)
267 |
268 |
269 | class LineWidth(Attr):
270 | def __init__(self, stroke):
271 | self.stroke = stroke
272 |
273 | def enable(self):
274 | glLineWidth(self.stroke)
275 |
276 |
277 | class Point(Geom):
278 | def __init__(self):
279 | Geom.__init__(self)
280 |
281 | def render1(self):
282 | glBegin(GL_POINTS) # draw point
283 | glVertex3f(0.0, 0.0, 0.0)
284 | glEnd()
285 |
286 |
287 | class FilledPolygon(Geom):
288 | def __init__(self, v):
289 | Geom.__init__(self)
290 | self.v = v
291 |
292 | def render1(self):
293 | if len(self.v) == 4:
294 | glBegin(GL_QUADS)
295 | elif len(self.v) > 4:
296 | glBegin(GL_POLYGON)
297 | else:
298 | glBegin(GL_TRIANGLES)
299 | for p in self.v:
300 | glVertex3f(p[0], p[1], 0) # draw each vertex
301 | glEnd()
302 |
303 |
304 | def make_circle(radius=10, res=30, filled=True):
305 | points = []
306 | for i in range(res):
307 | ang = 2 * math.pi * i / res
308 | points.append((math.cos(ang) * radius, math.sin(ang) * radius))
309 | if filled:
310 | return FilledPolygon(points)
311 | else:
312 | return PolyLine(points, True)
313 |
314 |
315 | def make_polygon(v, filled=True):
316 | if filled:
317 | return FilledPolygon(v)
318 | else:
319 | return PolyLine(v, True)
320 |
321 |
322 | def make_polyline(v):
323 | return PolyLine(v, False)
324 |
325 |
326 | def make_capsule(length, width):
327 | l, r, t, b = 0, length, width / 2, -width / 2
328 | box = make_polygon([(l, b), (l, t), (r, t), (r, b)])
329 | circ0 = make_circle(width / 2)
330 | circ1 = make_circle(width / 2)
331 | circ1.add_attr(Transform(translation=(length, 0)))
332 | geom = Compound([box, circ0, circ1])
333 | return geom
334 |
335 |
336 | class Compound(Geom):
337 | def __init__(self, gs):
338 | Geom.__init__(self)
339 | self.gs = gs
340 | for g in self.gs:
341 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
342 |
343 | def render1(self):
344 | for g in self.gs:
345 | g.render()
346 |
347 |
348 | class PolyLine(Geom):
349 | def __init__(self, v, close):
350 | Geom.__init__(self)
351 | self.v = v
352 | self.close = close
353 | self.linewidth = LineWidth(1)
354 | self.add_attr(self.linewidth)
355 |
356 | def render1(self):
357 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
358 | for p in self.v:
359 | glVertex3f(p[0], p[1], 0) # draw each vertex
360 | glEnd()
361 |
362 | def set_linewidth(self, x):
363 | self.linewidth.stroke = x
364 |
365 |
366 | class Line(Geom):
367 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
368 | Geom.__init__(self)
369 | self.start = start
370 | self.end = end
371 | self.linewidth = LineWidth(1)
372 | self.add_attr(self.linewidth)
373 |
374 | def render1(self):
375 | glBegin(GL_LINES)
376 | glVertex2f(*self.start)
377 | glVertex2f(*self.end)
378 | glEnd()
379 |
380 |
381 | class Image(Geom):
382 | def __init__(self, fname, width, height):
383 | Geom.__init__(self)
384 | self.set_color(1.0, 1.0, 1.0)
385 | self.width = width
386 | self.height = height
387 | img = pyglet.image.load(fname)
388 | self.img = img
389 | self.flip = False
390 |
391 | def render1(self):
392 | self.img.blit(
393 | -self.width / 2, -self.height / 2, width=self.width, height=self.height
394 | )
395 |
396 |
397 | # ================================================================
398 |
399 |
400 | class SimpleImageViewer:
401 | def __init__(self, display=None, maxwidth=500):
402 | self.window = None
403 | self.isopen = False
404 | self.display = get_display(display)
405 | self.maxwidth = maxwidth
406 |
407 | def imshow(self, arr):
408 | if self.window is None:
409 | height, width, _channels = arr.shape
410 | if width > self.maxwidth:
411 | scale = self.maxwidth / width
412 | width = int(scale * width)
413 | height = int(scale * height)
414 | self.window = get_window(
415 | width=width,
416 | height=height,
417 | display=self.display,
418 | vsync=False,
419 | resizable=True,
420 | )
421 | self.width = width
422 | self.height = height
423 | self.isopen = True
424 |
425 | @self.window.event
426 | def on_resize(width, height):
427 | self.width = width
428 | self.height = height
429 |
430 | @self.window.event
431 | def on_close():
432 | self.isopen = False
433 |
434 |         assert len(arr.shape) == 3, "You passed in an image with the wrong number of dimensions"
435 | image = pyglet.image.ImageData(
436 | arr.shape[1], arr.shape[0], "RGB", arr.tobytes(), pitch=arr.shape[1] * -3
437 | )
438 | texture = image.get_texture()
439 |         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST)
440 | texture.width = self.width
441 | texture.height = self.height
442 | self.window.clear()
443 | self.window.switch_to()
444 | self.window.dispatch_events()
445 | texture.blit(0, 0) # draw
446 | self.window.flip()
447 |
448 | def close(self):
449 | if self.isopen and sys.meta_path:
450 | # ^^^ check sys.meta_path to avoid 'ImportError: sys.meta_path is None, Python is likely shutting down'
451 | self.window.close()
452 | self.isopen = False
453 |
454 | def __del__(self):
455 | self.close()
456 |
--------------------------------------------------------------------------------
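The Viewer above is only needed for on-screen or rgb_array rendering. A minimal usage sketch, assuming a display (or xvfb on a headless machine) is available; the bounds and circle placement below are hypothetical:

from envs.pyglet_rendering import Viewer, Transform

viewer = Viewer(600, 400)
viewer.set_bounds(-3.0, 3.0, -2.0, 2.0)                     # map world coordinates to the 600x400 window
circle = viewer.draw_circle(radius=0.3, filled=True, color=(1.0, 0.0, 0.0))
circle.add_attr(Transform(translation=(1.0, 0.5)))          # place the circle in world coordinates
frame = viewer.render(return_rgb_array=True)                # (height, width, 3) uint8 array
viewer.close()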
/rcbf_sac/cbf_qp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from rcbf_sac.dynamics import DYNAMICS_MODE
3 | from quadprog import solve_qp
4 |
5 | class CascadeCBFLayer:
6 |
7 | def __init__(self, env, gamma_b=100, k_d=1.5, l_p=0.03):
8 | """Constructor of CBFLayer.
9 |
10 | Parameters
11 | ----------
12 | env : gym.env
13 | Gym environment.
14 | gamma_b : float, optional
15 | gamma of control barrier certificate.
16 | k_d : float, optional
17 |             desired confidence parameter (e.g. 2.0 corresponds to ~95% confidence).
18 | """
19 |
20 | self.env = env
21 | self.u_min, self.u_max = self.get_control_bounds()
22 | self.gamma_b = gamma_b
23 | self.k_d = k_d
24 | self.l_p = l_p
25 |
26 | if self.env.dynamics_mode not in DYNAMICS_MODE:
27 | raise Exception('Dynamics mode not supported.')
28 |
29 | def get_u_safe(self, u_nom, s, mean_pred, sigma):
30 | """Given the current state of the system, this function computes the control input necessary to render the nominal
31 | control `u_nom` safe (i.e. u_safe + u_nom is safe).
32 |
33 | Parameters
34 | ----------
35 | u_nom : ndarray
36 | Nominal control input.
37 | s : ndarray
38 | current state.
39 | mean_pred : ndarray
40 | mean prediction.
41 | sigma : ndarray
42 | standard deviation in additive disturbance.
43 |
44 | Returns
45 | -------
46 | u_safe : ndarray
47 | Safe control input to be added to `u_nom` as such `env.step(u_nom + u_safe)`
48 | """
49 |
50 | P, q, G, h = self.get_cbf_qp_constraints(u_nom, s, mean_pred, sigma)
51 | u_safe = self.solve_qp(P, q, G, h)
52 |
53 | return u_safe
54 |
55 | def get_cbf_qp_constraints(self, u_nom, state, mean_pred, sigma_pred):
56 | """Build up matrices required to solve qp
57 | Program specifically solves:
58 | minimize_{u,eps} 0.5 * u^T P u + q^T u
59 | subject to G[u,eps]^T <= h
60 |
61 | Each control barrier certificate is of the form:
62 |         dh/dx^T (f_out + g_out u) >= -gamma_b * h_out^3, where "out" denotes the (look-ahead) output of the state.
63 |
64 | In the case of SafetyGym_point dynamics:
65 | state = [x y θ v ω]
66 |         state_d = [v*cos(θ) v*sin(θ) ω u^v u^ω]
67 |
68 | Parameters
69 | ----------
70 | u_nom : ndarray
71 | Nominal control input.
72 | state : ndarray
73 | current state (check dynamics.py for details on each dynamics' specifics)
74 | mean_pred : ndarray
75 | mean disturbance prediction state, dimensions (n_s, n_u)
76 | sigma_pred : ndarray
77 | standard deviation in additive disturbance after undergoing the output dynamics.
78 |
79 | Returns
80 | -------
81 | P : ndarray
82 | Quadratic cost matrix in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u)
83 | q : ndarray
84 | Linear cost vector in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u)
85 | G : ndarray
86 | Inequality constraint matrix (G[u,eps] <= h) of size (num_constraints, n_u + 1)
87 | h : ndarray
88 | Inequality constraint vector (G[u,eps] <= h) of size (num_constraints,)
89 | """
90 |
91 | if self.env.dynamics_mode == 'Unicycle':
92 |
93 | collision_radius = 1.2 * self.env.hazards_radius # add a little buffer
94 |
95 | # p(x): lookahead output
96 | p_x = np.array([state[0] + self.l_p * np.cos(state[2]), state[1] + self.l_p * np.sin(state[2])])
97 |
98 | # p_dot = f_p + g_p u
99 | f_p = np.zeros(2)
100 | theta = state[2]
101 | c_theta = np.cos(theta)
102 | s_theta = np.sin(theta)
103 | R = np.array([[c_theta, -s_theta],
104 | [s_theta, c_theta]])
105 | L = np.array([[1, 0],
106 | [0, self.l_p]])
107 | g_p = R @ L
108 |
109 | # hs
110 | hs = 0.5 * (np.sum((p_x - self.env.hazards_locations) ** 2,
111 | axis=1) - collision_radius ** 2) # 1/2 * (||x - x_obs||^2 - r^2)
112 |
113 | dhdxs = (p_x - self.env.hazards_locations) # each row is dhdx_i for hazard i
114 |
115 | # since we're dealing with p(x), we need to get the disturbance on p_x, p_y
116 | # Mean
117 | mean_p = mean_pred[:2] + self.l_p * np.array([-np.sin(state[2]), np.cos(state[2])]) * mean_pred[2]
118 | # Sigma (output is 2-dims (p_x, p_y))
119 | sigma_p = sigma_pred[:2] + self.l_p * np.array([-np.sin(state[2]), np.cos(state[2])]) * sigma_pred[2]
120 |
121 | n_u = u_nom.shape[0] # dimension of control inputs
122 |             num_constraints = hs.shape[0] + 2 * n_u  # one constraint per CBF, plus upper and lower actuator bounds (2 * n_u of them)
123 |
124 | # Inequality constraints (G[u, eps] <= h)
125 | G = np.zeros((num_constraints, n_u + 1)) # the plus 1 is for epsilon (to make sure qp is always feasible)
126 | h = np.zeros(num_constraints)
127 | ineq_constraint_counter = 0
128 |
129 | # First let's add the cbf constraints
130 | for c in range(hs.shape[0]):
131 | # extract current affine cbf (h = h1^T x + h0)
132 | h_ = hs[c]
133 | dhdx_ = dhdxs[c]
134 |
135 | # Add inequality constraints
136 | G[ineq_constraint_counter, :n_u] = -dhdx_ @ g_p # h1^Tg(x)
137 | G[ineq_constraint_counter, n_u] = -1 # for slack
138 |
139 | h[ineq_constraint_counter] = self.gamma_b * (h_ ** 3) + np.dot(dhdx_, (f_p+mean_p)) + np.dot(dhdx_ @ g_p,
140 | u_nom) \
141 | - self.k_d * np.dot(np.abs(dhdx_), sigma_p)
142 |
143 | ineq_constraint_counter += 1
144 |
145 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack
146 | P = np.diag([1.e1, 1.e-4, 1e7]) # in the original code, they use 1e24 instead of 1e7, but quadprog can't handle that...
147 | q = np.zeros(n_u + 1)
148 |
149 | elif self.env.dynamics_mode == 'SimulatedCars':
150 |
151 | n_u = u_nom.shape[0] # dimension of control inputs
152 |             num_constraints = 4  # 2 CBF constraints plus 2 * n_u actuator constraints (n_u = 1 here)
153 | collision_radius = 3.5
154 |
155 | # Inequality constraints (G[u, eps] <= h)
156 | G = np.zeros((num_constraints, n_u + 1))
157 | h = np.zeros((num_constraints))
158 | ineq_constraint_counter = 0
159 |
160 | # Current State
161 | pos = state[::2]
162 | vels = state[1::2]
163 |
164 | # Action (acceleration)
165 | vels_des = 30.0 * np.ones(5) # Desired velocities
166 | # vels_des[0] -= 10 * np.sin(0.2 * t_batch)
167 | accels = self.env.kp * (vels_des - vels)
168 | accels[1] -= self.env.k_brake * (pos[0] - pos[1]) * ((pos[0] - pos[1]) < 6.0)
169 | accels[2] -= self.env.k_brake * (pos[1] - pos[2]) * ((pos[1] - pos[2]) < 6.0)
170 | accels[3] = 0.0 # Car 4's acceleration is controlled directly
171 | accels[4] -= self.env.k_brake * (pos[2] - pos[4]) * ((pos[2] - pos[4]) < 13.0)
172 |
173 | # f(x)
174 | f_x = np.zeros(state.shape[0])
175 | f_x[::2] = vels
176 | f_x[1::2] = accels
177 |
178 | # g(x)
179 | g_x = np.zeros(state.shape[0])
180 | g_x[7] = 50.0 # Car 4's acceleration
181 |
182 | # h1
183 | h13 = 0.5 * (((pos[2] - pos[3]) ** 2) - collision_radius ** 2)
184 | h15 = 0.5 * (((pos[4] - pos[3]) ** 2) - collision_radius ** 2)
185 |
186 | # dh1/dt = Lfh1
187 | h13_dot = (pos[3] - pos[2]) * (vels[3] - vels[2])
188 | h15_dot = (pos[3] - pos[4]) * (vels[3] - vels[4])
189 |
190 | # Lffh1
191 | dLfh13dx = np.zeros(10)
192 | dLfh13dx[4] = (vels[2] - vels[3]) # dLfh13/d(pos_car_3)
193 | dLfh13dx[5] = (pos[2] - pos[3]) # dLfh13/d(vel_car_3)
194 | dLfh13dx[6] = (vels[3] - vels[2])
195 | dLfh13dx[7] = (pos[3] - pos[2]) # dLfh13/d(vel_car_4)
196 | Lffh13 = np.dot(dLfh13dx, f_x)
197 |
198 | dLfh15dx = np.zeros(10)
199 | dLfh15dx[8] = (vels[4] - vels[3]) # Car 5 pos
200 | dLfh15dx[9] = (pos[4] - pos[3]) # Car 5 vels
201 | dLfh15dx[6] = (vels[3] - vels[4])
202 | dLfh15dx[7] = (pos[3] - pos[4])
203 | Lffh15 = np.dot(dLfh15dx, f_x)
204 |
205 | # Lfgh1
206 | Lgfh13 = np.dot(dLfh13dx, g_x)
207 | Lgfh15 = np.dot(dLfh15dx, g_x)
208 |
209 | # Inequality constraints (G[u, eps] <= h)
210 |             h[0] = Lffh13 + 2 * self.gamma_b * h13_dot + self.gamma_b ** 2 * h13 + Lgfh13 * u_nom
211 |             h[1] = Lffh15 + 2 * self.gamma_b * h15_dot + self.gamma_b ** 2 * h15 + Lgfh15 * u_nom
212 | G[0, 0] = -Lgfh13
213 | G[1, 0] = -Lgfh15
214 | G[:2, n_u] = -2e2 # for slack
215 | ineq_constraint_counter += 2
216 |
217 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack
218 | P = np.diag([0.1, 1e1])
219 | q = np.zeros(n_u + 1)
220 |
221 | else:
222 | raise Exception('Dynamics mode unknown!')
223 |
224 | # Second let's add actuator constraints
225 | n_u = u_nom.shape[0] # dimension of control inputs
226 | for c in range(n_u):
227 |
228 | # u_max >= u_nom + u ---> u <= u_max - u_nom
229 | if self.u_max is not None:
230 | G[ineq_constraint_counter, c] = 1
231 | h[ineq_constraint_counter] = self.u_max[c] - u_nom[c]
232 | ineq_constraint_counter += 1
233 |
234 | # u_min <= u_nom + u ---> -u <= u_min - u_nom
235 | if self.u_min is not None:
236 | G[ineq_constraint_counter, c] = -1
237 | h[ineq_constraint_counter] = -self.u_min[c] + u_nom[c]
238 | ineq_constraint_counter += 1
239 |
240 | return P, q, G, h
241 |
242 | def solve_qp(self, P, q, G, h):
243 | """Solves:
244 | minimize_{u,eps} 0.5 * u^T P u + q^T u
245 | subject to G[u,eps]^T <= h
246 |
247 | Parameters
248 | ----------
249 | P : ndarray
250 | Quadratic cost matrix
251 | q : ndarray
252 | Linear cost vector
253 | G : ndarray
254 | Inequality Constraint Matrix
255 | h : ndarray
256 | Inequality Constraint Vector
257 |
258 | Returns
259 | -------
260 | u_safe : ndarray
261 | The solution of the qp without the last dimension (the slack).
262 | """
263 |
264 | # print('P =\t {}'.format(P))
265 | # print('q =\t {}'.format(q))
266 | # print('G =\t {}'.format(G))
267 | # print('h =\t {}'.format(h))
268 |
269 | # Here we normalize G and h to stay consistent with what we do in CVXPYLAYER which often crashes with big #s
270 | Gh = np.concatenate((G, np.expand_dims(h, 1)), 1)
271 | Gh_norm = np.expand_dims(np.max(np.abs(Gh), axis=1), axis=1)
272 | G /= Gh_norm
273 | h = h / Gh_norm.squeeze(-1)
274 |
275 | try:
276 | sol = solve_qp(P, q, -G.T, -h)
277 | u_safe = sol[0][:-1]
278 | print('quadprog = {} eps = {}'.format(u_safe, sol[0][-1]))
279 | except ValueError as e:
280 | print('P = {},\nq = {},\nG = {},\nh = {}.'.format(P, q, G, h))
281 | raise e
282 |
283 | if np.abs(sol[0][-1]) > 1e-1:
284 | print('CBF indicates constraint violation might occur. epsilon = {}'.format(sol[0][-1]))
285 |
286 | return u_safe
287 |
288 | def get_cbfs(self, hazards_locations, hazards_radius):
289 | """Returns CBF function h(x) and its derivative dh/dx(x) for each hazard. Note that the CBF is defined with
290 | with respect to an output of the state.
291 |
292 | Parameters
293 | ----------
294 | hazards_locations : list
295 | List of hazard-xy positions where each item is a list of length 2
296 | hazards_radius : float
297 | Radius of the hazards
298 |
299 | Returns
300 | -------
301 |         get_h : callable
302 |             Maps a state to the vector of CBF values h(x) = 0.5 * (||p(x) - x_obs||^2 - r^2), one entry per hazard.
303 |         get_dhdx : callable
304 |             Maps a state to the corresponding gradient rows dh/dx, one row per hazard.
305 | """
306 |
307 | hazards_locations = np.array(hazards_locations)
308 | collision_radius = hazards_radius + 0.07 # add a little buffer
309 |
310 | def get_h(state):
311 | # p(x): lookahead output
312 | if self.env.dynamics_mode in ('SafetyGym_point', 'Unicycle'):
313 | state = np.array([state[0] + self.l_p * np.cos(state[2]), state[1] + self.l_p * np.sin(state[2])])
314 | return 0.5 * (np.sum((state - hazards_locations)**2, axis=1) - collision_radius**2) # 1/2 * (||x - x_obs||^2 - r^2)
315 |
316 | def get_dhdx(state):
317 | # p(x): lookahead output
318 | if self.env.dynamics_mode in ('SafetyGym_point', 'Unicycle'):
319 | state = np.array([state[0] + self.l_p * np.cos(state[2]), state[1] + self.l_p * np.sin(state[2])])
320 | dhdx = (state - hazards_locations) # each row is dhdx_i for hazard i
321 | return dhdx
322 |
323 | return get_h, get_dhdx
324 |
325 | def get_control_bounds(self):
326 | """
327 |
328 | Returns
329 | -------
330 | u_min : ndarray
331 | min control input.
332 | u_max : ndarray
333 | max control input.
334 | """
335 |
336 | u_min = self.env.safe_action_space.low
337 | u_max = self.env.safe_action_space.high
338 |
339 | return u_min, u_max
340 |
341 | def get_min_h_val(self, state):
342 | """
343 |
344 | Parameters
345 | ----------
346 | state : ndarray
347 | Current State
348 |
349 | Returns
350 | -------
351 | min_h_val : float
352 | Minimum h(x) over all hs. If below 0, then at least 1 constraint is violated.
353 |
354 | """
355 |
356 | get_h, _ = self.get_cbfs(self.env.hazards_locations, self.env.hazards_radius)
357 | min_h_val = np.min(get_h(state))
358 | return min_h_val
359 |
360 |
--------------------------------------------------------------------------------
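For reference, quadprog's solve_qp(P, q, C, b) minimizes 0.5 * x^T P x - q^T x subject to C^T x >= b, which is why solve_qp above passes (-G.T, -h) to recover the G [u, eps] <= h form built in get_cbf_qp_constraints (q is all zeros here, so its sign is immaterial). A minimal numerical sketch with hypothetical matrices:

import numpy as np
from quadprog import solve_qp

# Decision variable x = [u, eps]: one control correction plus one slack.
P = np.diag([1.0, 1e3])            # penalize control effort and (heavily) the slack
q = np.zeros(2)
G = np.array([[-1.0, -1.0],        # CBF-style row:      -u - eps <= -0.5  (i.e. u + eps >= 0.5)
              [ 1.0,  0.0]])       # actuator-style row:  u <= 2.0
h = np.array([-0.5, 2.0])

sol = solve_qp(P, q, -G.T, -h)     # quadprog convention: C^T x >= b with C = -G.T, b = -h
u_safe, eps = sol[0][:-1], sol[0][-1]
print(u_safe, eps)                 # u_safe ≈ 0.5, eps ≈ 0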
/envs/unicycle_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym import spaces
4 |
5 | from envs.utils import to_pixel
6 |
7 |
8 | class UnicycleEnv(gym.Env):
9 | """Custom Environment that follows SafetyGym interface"""
10 |
11 | metadata = {'render.modes': ['human']}
12 |
13 | def __init__(self):
14 |
15 | super(UnicycleEnv, self).__init__()
16 |
17 | self.dynamics_mode = 'Unicycle'
18 | # Define action and observation space
19 | # They must be gym.spaces objects
20 |         # Continuous actions: [v, ω] velocity commands
21 | self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))
22 | self.safe_action_space = spaces.Box(low=-2.5, high=2.5, shape=(2,))
23 | self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,))
24 | self.bds = np.array([[-3., -3.], [3., 3.]])
25 | self.hazards_radius = 0.6
26 | self.hazards_locations = np.array([[0., 0.], [-1., 1.], [-1., -1.], [1., -1.], [1., 1.]]) * 1.5
27 | self.dt = 0.02
28 | self.max_episode_steps = 1000
29 | self.reward_goal = 1.0
30 | self.goal_size = 0.3
31 | # Initialize Env
32 | self.state = None
33 | self.episode_step = 0
34 | self.goal_pos = np.array([2.5, 2.5])
35 |
36 | self.reset()
37 | # Get Dynamics
38 | self.get_f, self.get_g = self._get_dynamics()
39 | # Disturbance
40 | self.disturb_mean = np.zeros((3,))
41 | self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20
42 |
43 | # Viewer
44 | self.viewer = None
45 |
46 | def step(self, action):
47 |         """Take one step in the environment given the agent's action.
48 |
49 | Parameters
50 | ----------
51 | action : ndarray
52 | Action that the agent takes in the environment
53 |
54 | Returns
55 | -------
56 | new_obs : ndarray
57 | The new observation with the following structure:
58 | [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal]
59 |
60 | """
61 |
62 | action = np.clip(action, -1.0, 1.0)
63 | state, reward, done, info = self._step(action)
64 | return self.get_obs(), reward, done, info
65 |
66 | def _step(self, action):
67 | """
68 |
69 | Parameters
70 | ----------
71 | action
72 |
73 | Returns
74 | -------
75 | state : ndarray
76 | New internal state of the agent.
77 | reward : float
78 | Reward collected during this transition.
79 | done : bool
80 | Whether the episode terminated.
81 | info : dict
82 | Additional info relevant to the environment.
83 | """
84 |
85 | # Start with our prior for continuous time system x' = f(x) + g(x)u
86 | self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action)
87 | self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) #* np.random.multivariate_normal(self.disturb_mean, self.disturb_covar, 1).squeeze()
88 |
89 | self.episode_step += 1
90 |
91 | info = dict()
92 |
93 | dist_goal = self._goal_dist()
94 | reward = (self.last_goal_dist - dist_goal)
95 | self.last_goal_dist = dist_goal
96 | # Check if goal is met
97 | if self.goal_met():
98 | info['goal_met'] = True
99 | reward += self.reward_goal
100 | done = True
101 | else:
102 | done = self.episode_step >= self.max_episode_steps
103 |
104 |
105 | # Include constraint cost in reward
106 | if np.any(np.sum((self.state[:2] - self.hazards_locations)**2, axis=1) < self.hazards_radius**2):
107 | if 'cost' in info:
108 | info['cost'] += 0.1
109 | else:
110 | info['cost'] = 0.1
111 | return self.state, reward, done, info
112 |
113 | def goal_met(self):
114 | """Return true if the current goal is met this step
115 |
116 | Returns
117 | -------
118 | goal_met : bool
119 | True if the goal condition is met.
120 |
121 | """
122 |
123 | return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size
124 |
125 | def reset(self):
126 | """ Reset the state of the environment to an initial state.
127 |
128 | Returns
129 | -------
130 | observation : ndarray
131 | Next observation.
132 | """
133 |
134 | self.episode_step = 0
135 |
136 | # Re-initialize state
137 | self.state = np.array([-2.5, -2.5, 0.])
138 |
139 | # Re-initialize last goal dist
140 | self.last_goal_dist = self._goal_dist()
141 |
142 | # TODO: Randomize this
143 | return self.get_obs()
144 |
145 | def render(self, mode='human', close=False):
146 | """Render the environment to the screen
147 |
148 | Parameters
149 | ----------
150 | mode : str
151 | close : bool
152 |
153 | Returns
154 | -------
155 |
156 | """
157 |
158 | if mode != 'human' and mode != 'rgb_array':
159 | rel_loc = self.goal_pos - self.state[:2]
160 | theta_error = np.arctan2(rel_loc[1], rel_loc[0]) - self.state[2]
161 | print('Ep_step = {}, \tState = {}, \tDist2Goal = {}, alignment_error = {}'.format(self.episode_step,
162 | self.state,
163 | self._goal_dist(),
164 | theta_error))
165 |
166 | screen_width = 600
167 | screen_height = 400
168 |
169 | if self.viewer is None:
170 | from envs import pyglet_rendering
171 | self.viewer = pyglet_rendering.Viewer(screen_width, screen_height)
172 | # Draw obstacles
173 | obstacles = []
174 | for i in range(len(self.hazards_locations)):
175 | obstacles.append(
176 | pyglet_rendering.make_circle(radius=to_pixel(self.hazards_radius, shift=0), filled=True))
177 | obs_trans = pyglet_rendering.Transform(translation=(
178 | to_pixel(self.hazards_locations[i][0], shift=screen_width / 2),
179 | to_pixel(self.hazards_locations[i][1], shift=screen_height / 2)))
180 | obstacles[i].set_color(1.0, 0.0, 0.0)
181 | obstacles[i].add_attr(obs_trans)
182 | self.viewer.add_geom(obstacles[i])
183 |
184 | # Make Goal
185 | goal = pyglet_rendering.make_circle(radius=to_pixel(0.1, shift=0), filled=True)
186 | goal_trans = pyglet_rendering.Transform(translation=(
187 | to_pixel(self.goal_pos[0], shift=screen_width / 2), to_pixel(self.goal_pos[1], shift=screen_height / 2)))
188 | goal.add_attr(goal_trans)
189 | goal.set_color(0.0, 0.5, 0.0)
190 | self.viewer.add_geom(goal)
191 |
192 | # Make Robot
193 | self.robot = pyglet_rendering.make_circle(radius=to_pixel(0.1), filled=True)
194 | self.robot_trans = pyglet_rendering.Transform(translation=(
195 | to_pixel(self.state[0], shift=screen_width / 2), to_pixel(self.state[1], shift=screen_height / 2)))
196 | self.robot_trans.set_rotation(self.state[2])
197 | self.robot.add_attr(self.robot_trans)
198 | self.robot.set_color(0.5, 0.5, 0.8)
199 | self.viewer.add_geom(self.robot)
200 | self.robot_orientation = pyglet_rendering.Line(start=(0.0, 0.0), end=(15.0, 0.0))
201 | self.robot_orientation.linewidth.stroke = 2
202 | self.robot_orientation.add_attr(self.robot_trans)
203 | self.robot_orientation.set_color(0, 0, 0)
204 | self.viewer.add_geom(self.robot_orientation)
205 |
206 | if self.state is None:
207 | return None
208 |
209 | self.robot_trans.set_translation(to_pixel(self.state[0], shift=screen_width / 2),
210 | to_pixel(self.state[1], shift=screen_height / 2))
211 | self.robot_trans.set_rotation(self.state[2])
212 |
213 | return self.viewer.render(return_rgb_array=mode == "rgb_array")
214 |
215 | def get_obs(self):
216 |         """Given the state, this function converts it to an observation akin to the one obtained by calling env.step
217 |
218 | Parameters
219 | ----------
220 |
221 | Returns
222 | -------
223 | observation : ndarray
224 | Observation: [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, exp(-dist2goal)]
225 | """
226 |
227 | rel_loc = self.goal_pos - self.state[:2]
228 | goal_dist = np.linalg.norm(rel_loc)
229 | goal_compass = self.obs_compass() # compass to the goal
230 |
231 | return np.array([self.state[0], self.state[1], np.cos(self.state[2]), np.sin(self.state[2]), goal_compass[0], goal_compass[1], np.exp(-goal_dist)])
232 |
233 | def _get_dynamics(self):
234 |         """Get the affine dynamics x' = f(x) + g(x)u for this environment.
235 |
236 | Parameters
237 | ----------
238 |
239 | Returns
240 | -------
241 | get_f : callable
242 | Drift dynamics of the continuous system x' = f(x) + g(x)u
243 | get_g : callable
244 | Control dynamics of the continuous system x' = f(x) + g(x)u
245 | """
246 |
247 | def get_f(state):
248 | f_x = np.zeros(state.shape)
249 | return f_x
250 |
251 | def get_g(state):
252 | theta = state[2]
253 | g_x = np.array([[np.cos(theta), 0],
254 | [np.sin(theta), 0],
255 | [ 0, 1.0]])
256 | return g_x
257 |
258 | return get_f, get_g
259 |
260 | def obs_compass(self):
261 | """
262 | Return a robot-centric compass observation of a list of positions.
263 |         Compass is a normalized (unit-length) egocentric XY vector,
264 | from the agent to the object.
265 | This is equivalent to observing the egocentric XY angle to the target,
266 | projected into the sin/cos space we use for joints.
267 | (See comment on joint observation for why we do this.)
268 | """
269 |
270 | # Get ego vector in world frame
271 | vec = self.goal_pos - self.state[:2]
272 | # Rotate into frame
273 | R = np.array([[np.cos(self.state[2]), -np.sin(self.state[2])], [np.sin(self.state[2]), np.cos(self.state[2])]])
274 | vec = np.matmul(vec, R)
275 | # Normalize
276 | vec /= np.sqrt(np.sum(np.square(vec))) + 0.001
277 | return vec
278 |
279 | def _goal_dist(self):
280 | return np.linalg.norm(self.goal_pos - self.state[:2])
281 |
282 | def get_random_hazard_locations(n_hazards, hazard_radius, bds=None):
283 | """
284 |
285 | Parameters
286 | ----------
287 | n_hazards : int
288 | Number of hazards to create
289 | hazard_radius : float
290 | Radius of hazards
291 | bds : list, optional
292 |         List of the form [[x_lb, y_lb], [x_ub, y_ub]] denoting the bounds of the 2D arena
293 |
294 | Returns
295 | -------
296 | hazards_locs : ndarray
297 | Numpy array of shape (n_hazards, 2) containing xy locations of hazards.
298 | """
299 |
300 | if bds is None:
301 | bds = np.array([[-3., -3.], [3., 3.]])
302 |
303 | # Create buffer with boundaries
304 |     buffered_bds = np.array(bds, dtype=float)  # copy so the caller's bounds are not modified
305 | buffered_bds[0] += hazard_radius
306 | buffered_bds[1] -= hazard_radius
307 |
308 | hazards_locs = np.zeros((n_hazards, 2))
309 |
310 | for i in range(n_hazards):
311 | successfully_placed = False
312 | iter = 0
313 | while not successfully_placed and iter < 500:
314 |             hazards_locs[i] = (buffered_bds[1] - buffered_bds[0]) * np.random.random(2) + buffered_bds[0]
315 | successfully_placed = np.all(np.linalg.norm(hazards_locs[:i] - hazards_locs[i], axis=1) > 3*hazard_radius)
316 | iter += 1
317 |
318 | if iter >= 500:
319 | raise Exception('Could not place hazards in arena.')
320 |
321 | return hazards_locs
322 |
323 | if __name__ == "__main__":
324 |
325 | import matplotlib.pyplot as plt
326 | from rcbf_sac.cbf_qp import CascadeCBFLayer
327 | from rcbf_sac.dynamics import DynamicsModel
328 | import argparse
329 |
330 | parser = argparse.ArgumentParser()
331 | parser.add_argument('--env-name', default="SafetyGym", help='Either SafetyGym or Unicycle.')
332 | parser.add_argument('--gp_model_size', default=2000, type=int, help='gp')
333 | parser.add_argument('--k_d', default=3.0, type=float)
334 | parser.add_argument('--gamma_b', default=40, type=float)
335 | parser.add_argument('--cuda', action="store_true", help='run on CUDA (default: False)')
336 | args = parser.parse_args()
337 |
338 | env = UnicycleEnv()
339 | dynamics_model = DynamicsModel(env, args)
340 | cbf_wrapper = CascadeCBFLayer(env, gamma_b=args.gamma_b, k_d=args.k_d)
341 |
342 |
343 | def simple_controller(env, state, goal):
344 | goal_xy = goal[:2]
345 | goal_dist = -np.log(goal[2]) # the observation is np.exp(-goal_dist)
346 | v = 4.0 * goal_dist
347 | relative_theta = 1.0 * np.arctan2(goal_xy[1], goal_xy[0])
348 | omega = 5.0 * relative_theta
349 | return np.clip(np.array([v, omega]), env.action_space.low, env.action_space.high)
350 |
351 | obs = env.reset()
352 | done = False
353 | episode_reward = 0
354 | episode_step = 0
355 |
356 | # Plot initial state
357 | fig, ax = plt.subplots() # note we must use plt.subplots, not plt.subplot
358 | for i in range(len(env.hazards_locations)):
359 | ax.add_patch(plt.Circle(env.hazards_locations[i], env.hazards_radius, color='r'))
360 | ax.add_patch(plt.Circle(env.goal_pos, env.goal_size, color='g'))
361 | p_pos = ax.scatter(obs[0], obs[1], s=300)
362 |     p_theta = plt.quiver(obs[0], obs[1], .2 * obs[2], .2 * obs[3])
363 | plt.xlim([-3.0, 3.0])
364 | plt.ylim([-3.0, 3.0])
365 | ax.set_aspect('equal', 'box')
366 |
367 | while not done:
368 | # Plot current state
369 | p_pos.set_offsets([obs[0], obs[1]])
370 | p_theta.XY[:, 0] = obs[0]
371 | p_theta.XY[:, 1] = obs[1]
372 | p_theta.set_UVC(.2 * obs[2], .2 * obs[3])
373 | # Take Action and get next state
374 | # random_action = env.action_space.sample()
375 | state = dynamics_model.get_state(obs)
376 | random_action = simple_controller(env, state, obs[-3:])
377 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state)
378 | action_safe = cbf_wrapper.get_u_safe(random_action, state, disturb_mean, disturb_std)
379 | obs, reward, done, info = env.step(random_action + action_safe)
380 | plt.pause(0.01)
381 | episode_reward += reward
382 | episode_step += 1
383 | print('step {} \tepisode_reward = {}'.format(episode_step, episode_reward))
384 | plt.show()
385 |
386 |
--------------------------------------------------------------------------------
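A minimal rollout sketch for UnicycleEnv, assuming the repository root is on PYTHONPATH; it steps random actions and prints the goal-progress reward and the 0.1 hazard cost reported through info:

from envs.unicycle_env import UnicycleEnv

env = UnicycleEnv()
obs = env.reset()
for step in range(20):
    action = env.action_space.sample()          # [v, ω] commands in [-1, 1]
    obs, reward, done, info = env.step(action)
    print(step, round(float(reward), 4), info.get('cost', 0.0))
    if done:
        obs = env.reset()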
/rcbf_sac/dynamics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from rcbf_sac.gp_model import GPyDisturbanceEstimator
4 | from rcbf_sac.utils import to_tensor, to_numpy
5 |
6 | """
7 | This file contains the DynamicsModel class. Depending on the environment selected, it contains the dynamics
8 | priors and learns the remaining disturbance term using GPs. The system is described as follows:
9 | x_dot ∈ f(x) + g(x)u + D(x),
10 | where x is the state, f and g are the drift and control dynamics respectively, and D(x) is the learned disturbance set.
11 |
12 | A few things to note:
13 | - The prior depends on the dynamics. This is hard-coded as of now. In the future, one direction to take would be to
14 | get a few episodes in the env to determine the affine prior first using least-squares or something similar.
15 | - The state is not the same as the observation and typically requires some pre-processing. These functions are supplied
16 | here which are also, unfortunately, hard-coded as of now.
17 | - The functions here are sometimes used in batch-form or single point form, and sometimes the inputs are torch tensors
18 |     and other times are numpy arrays. These are things to be mindful of; the output format always matches the input format.
19 | """
20 |
21 |
22 | DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}, # state = [x y θ]
23 |                  'SimulatedCars': {'n_s': 10, 'n_u': 1}}  # state = [x_1 v_1 ... x_5 v_5] (positions and velocities of the 5 cars)
24 | MAX_STD = {'Unicycle': [2e-1, 2e-1, 2e-1], 'SimulatedCars': [0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2]}
25 |
26 |
27 | class DynamicsModel:
28 |
29 | def __init__(self, env, args):
30 | """Constructor of DynamicsModel.
31 |
32 | Parameters
33 | ----------
34 | env : gym.env
35 | Gym environment.
36 | """
37 |
38 | self.env = env
39 | # Get Dynamics
40 | self.get_f, self.get_g = self.get_dynamics()
41 | self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s']
42 | self.n_u = DYNAMICS_MODE[self.env.dynamics_mode]['n_u']
43 |
44 | # Keep Disturbance History to estimate it using GPs
45 | self.disturb_estimators = None
46 | self.disturbance_history = dict()
47 |         self.history_counter = 0  # counts transitions observed so far (buffer keeps the last max_history_count)
48 | self.max_history_count = args.gp_model_size # How many points we want to have in the GP
49 | self.disturbance_history['state'] = np.zeros((self.max_history_count, self.n_s))
50 | self.disturbance_history['disturbance'] = np.zeros((self.max_history_count, self.n_s))
51 | self.train_x = None # x-data used to fit the last GP models
52 | self.train_y = None # y-data used to fit the last GP models
53 |
54 | # Point Robot specific dynamics (approx using unicycle + look-ahead)
55 | if hasattr(args, 'l_p'):
56 | self.l_p = args.l_p
57 |
58 | self.device = torch.device("cuda" if args.cuda else "cpu")
59 |
60 | def predict_next_state(self, state_batch, u_batch, t_batch=None, use_gps=True):
61 | """Given the current state and action, this function predicts the next state.
62 |
63 | Parameters
64 | ----------
65 | state_batch : ndarray
66 | State
67 | u_batch : ndarray
68 | Action
69 | t_batch: ndarray, optional
70 |             Time batch for state-dependent dynamics
71 | use_gps : bool, optional
72 | Use GPs to return mean and var
73 |
74 | Returns
75 | -------
76 | next_state : ndarray
77 | Next state
78 | """
79 |
80 | expand_dims = len(state_batch.shape) == 1
81 | if expand_dims:
82 | state_batch = np.expand_dims(state_batch, axis=0)
83 |
84 | # Start with our prior for continuous time system x' = f(x) + g(x)u
85 | if t_batch is not None:
86 | next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1))
87 | else:
88 | next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch) + (self.get_g(state_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1))
89 |
90 |         if use_gps:  # if we want to estimate the disturbance, let's do it!
91 | pred_mean, pred_std = self.predict_disturbance(state_batch)
92 | next_state_batch += self.env.dt * pred_mean
93 | else:
94 | pred_std = np.zeros(state_batch.shape)
95 |
96 | if expand_dims:
97 | next_state_batch = next_state_batch.squeeze(0)
98 | if pred_std is not None:
99 | pred_std = pred_std.squeeze(0)
100 |
101 | if t_batch is not None:
102 | next_t_batch = t_batch + self.env.dt
103 | return next_state_batch, self.env.dt * pred_std, next_t_batch
104 |
105 | return next_state_batch, self.env.dt * pred_std, t_batch
106 |
107 | def predict_next_obs(self, state, u):
108 | """Predicts the next observation given the state and u. Note that this only predicts the mean next observation.
109 |
110 | Parameters
111 | ----------
112 | state : ndarray
113 | u : ndarray
114 |
115 | Returns
116 | -------
117 | next_obs : ndarray
118 | Next observation
119 | """
120 |
121 | next_state, _, _ = self.predict_next_state(state, u)
122 | next_obs = self.get_obs(next_state)
123 | return next_obs
124 |
125 | def get_dynamics(self):
126 |         """Get the affine dynamics prior x' = f(x) + g(x)u for the given environment.
127 |
128 | Parameters
129 | ----------
130 |
131 | Returns
132 | -------
133 | get_f : callable
134 | Drift dynamics of the continuous system x' = f(x) + g(x)u
135 | get_g : callable
136 | Control dynamics of the continuous system x' = f(x) + g(x)u
137 | """
138 |
139 | if self.env.dynamics_mode == 'Unicycle':
140 |
141 | def get_f(state_batch, t_batch=None):
142 | f_x = np.zeros(state_batch.shape)
143 | return f_x
144 |
145 | def get_g(state_batch, t_batch=None):
146 | theta = state_batch[:, 2]
147 | g_x = np.zeros((state_batch.shape[0], 3, 2))
148 | g_x[:, 0, 0] = np.cos(theta)
149 | g_x[:, 1, 0] = np.sin(theta)
150 | g_x[:, 2, 1] = 1.0
151 | return g_x
152 |
153 | elif self.env.dynamics_mode == 'SimulatedCars':
154 |
155 | kp = 4.0
156 | k_brake = 20.0
157 |
158 | def get_g(state_batch, t_batch):
159 |
160 | g_x = np.zeros((state_batch.shape[0], 10, 1))
161 | g_x[:, 7, 0] = 50.0 # Car 4's acceleration
162 | return g_x
163 |
164 | def get_f(state_batch, t_batch):
165 |
166 | # Current State
167 | pos = state_batch[:, ::2]
168 | vels = state_batch[:, 1::2]
169 |
170 | # Action (acceleration)
171 | vels_des = 30.0 * np.ones((state_batch.shape[0], 5)) # Desired velocities
172 | vels_des[:, 0] -= 10 * np.sin(0.2 * t_batch)
173 | accels = kp * (vels_des - vels)
174 | accels[:, 1] -= k_brake * (pos[:, 0] - pos[:, 1]) * ((pos[:, 0] - pos[:, 1]) < 6.0)
175 | accels[:, 2] -= k_brake * (pos[:, 1] - pos[:, 2]) * ((pos[:, 1] - pos[:, 2]) < 6.0)
176 | accels[:, 3] = 0.0 # Car 4's acceleration is controlled directly
177 | accels[:, 4] -= k_brake * (pos[:, 2] - pos[:, 4]) * ((pos[:, 2] - pos[:, 4]) < 13.0)
178 |
179 | # f(x)
180 | f_x = np.zeros(state_batch.shape)
181 | f_x[:, ::2] = vels
182 | f_x[:, 1::2] = accels
183 | return f_x
184 |
185 | else:
186 | raise Exception('Unknown Dynamics mode.')
187 |
188 | return get_f, get_g
189 |
190 | def get_state(self, obs):
191 | """Given the observation, this function does the pre-processing necessary and returns the state.
192 |
193 | Parameters
194 | ----------
195 |         obs : ndarray or torch.tensor
196 | Environment observation.
197 |
198 | Returns
199 | -------
200 | state_batch : ndarray or torch.tensor
201 | State of the system.
202 |
203 | """
204 |
205 | expand_dims = len(obs.shape) == 1
206 | is_tensor = torch.is_tensor(obs)
207 |
208 | if is_tensor:
209 | dtype = obs.dtype
210 | device = obs.device
211 | obs = to_numpy(obs)
212 |
213 | if expand_dims:
214 | obs = np.expand_dims(obs, 0)
215 |
216 | if self.env.dynamics_mode == 'Unicycle':
217 | theta = np.arctan2(obs[:, 3], obs[:, 2])
218 | state_batch = np.zeros((obs.shape[0], 3))
219 | state_batch[:, 0] = obs[:, 0]
220 | state_batch[:, 1] = obs[:, 1]
221 | state_batch[:, 2] = theta
222 | elif self.env.dynamics_mode == 'SimulatedCars':
223 | state_batch = np.copy(obs)
224 | state_batch[:, ::2] *= 100.0 # Scale Positions
225 | state_batch[:, 1::2] *= 30.0 # Scale Velocities
226 | else:
227 | raise Exception('Unknown dynamics')
228 |
229 | if expand_dims:
230 | state_batch = state_batch.squeeze(0)
231 |
232 | return to_tensor(state_batch, dtype, device) if is_tensor else state_batch
233 |
234 | def get_obs(self, state_batch):
235 |         """Given the state, this function converts it to an observation akin to the one obtained by calling env.step
236 |
237 | Parameters
238 | ----------
239 | state : ndarray
240 | Environment state batch of shape (batch_size, n_s)
241 |
242 | Returns
243 | -------
244 | obs : ndarray
245 | Observation batch of shape (batch_size, n_o)
246 |
247 | """
248 |
249 | if self.env.dynamics_mode == 'Unicycle':
250 | obs = np.zeros((state_batch.shape[0], 4))
251 | obs[:, 0] = state_batch[:, 0]
252 | obs[:, 1] = state_batch[:, 1]
253 | obs[:, 2] = np.cos(state_batch[:, 2])
254 | obs[:, 3] = np.sin(state_batch[:, 2])
255 | elif self.env.dynamics_mode == 'SimulatedCars':
256 | obs = np.copy(state_batch)
257 | obs[:, ::2] /= 100.0 # Scale Positions
258 | obs[:, 1::2] /= 30.0 # Scale Velocities
259 | else:
260 | raise Exception('Unknown dynamics')
261 | return obs
262 |
263 | def append_transition(self, state_batch, u_batch, next_state_batch, t_batch=None):
264 | """Estimates the disturbance from the current dynamics transition and adds it to buffer.
265 |
266 | Parameters
267 | ----------
268 | state_batch : ndarray
269 | shape (n_s,) or (batch_size, n_s)
270 | u_batch : ndarray
271 | shape (n_u,) or (batch_size, n_u)
272 | next_state_batch : ndarray
273 | shape (n_s,) or (batch_size, n_s)
274 | t_batch : ndarray, optional
275 | shape (1,) or (batch_size, 1)
276 |
277 | Returns
278 | -------
279 |
280 | """
281 |
282 | expand_dims = len(state_batch.shape) == 1
283 |
284 | if expand_dims:
285 | state_batch = np.expand_dims(state_batch, 0)
286 | next_state_batch = np.expand_dims(next_state_batch, 0)
287 | u_batch = np.expand_dims(u_batch, 0)
288 |
289 | u_batch = np.expand_dims(u_batch, -1) # for broadcasting batch matrix multiplication purposes
290 |
291 | disturbance_batch = (next_state_batch - state_batch - self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ u_batch).squeeze(-1))) / self.env.dt
292 |
293 | # Append new data point (state, disturbance) to our dataset
294 | for i in range(state_batch.shape[0]):
295 |
296 | self.disturbance_history['state'][self.history_counter % self.max_history_count] = state_batch[i]
297 | self.disturbance_history['disturbance'][self.history_counter % self.max_history_count] = disturbance_batch[i]
298 |
299 | # Increment how many data points we have
300 | self.history_counter += 1
301 |
302 |             # Refit the GP models every max_history_count / 10 new data points
303 |             if self.history_counter % (self.max_history_count // 10) == 0:
304 | self.fit_gp_model()
305 |
306 | def fit_gp_model(self, training_iter=70):
307 | """
308 |
309 | Parameters
310 | ----------
311 | training_iter : int
312 | Number of training iterations for GP model.
313 |
314 | Returns
315 | -------
316 |
317 | """
318 |
319 | if self.history_counter < self.max_history_count: # didn't fill the buffer yet
320 | train_x = self.disturbance_history['state'][:self.history_counter]
321 | train_y = self.disturbance_history['disturbance'][:self.history_counter]
322 | else: # buffer filled, use all the data points
323 | train_x = self.disturbance_history['state']
324 | train_y = self.disturbance_history['disturbance']
325 |
326 | # Normalize Data
327 | train_x_std = np.std(train_x, axis=0)
328 | train_x_normalized = train_x / (train_x_std + 1e-8)
329 | train_y_std = np.std(train_y, axis=0)
330 | train_y_normalized = train_y / (train_y_std + 1e-8)
331 |
332 | self.disturb_estimators = []
333 | for i in range(self.n_s):
334 | # self.disturb_estimators.append(GPyDisturbanceEstimator(train_x, train_y[:, i]))
335 | self.disturb_estimators.append(GPyDisturbanceEstimator(train_x_normalized, train_y_normalized[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device))
336 | self.disturb_estimators[i].train(training_iter)
337 |
338 | # track the data I last used to fit the GPs for saving purposes (need it to initialize before loading weights)
339 | self.train_x = train_x
340 | self.train_y = train_y
341 |
342 | def predict_disturbance(self, test_x):
343 | """Predict the disturbance at the queried states using the GP models.
344 |
345 | Parameters
346 | ----------
347 | test_x : ndarray or torch.tensor
348 | shape(n_test, n_s)
349 | Returns
350 | -------
351 | means: ndarray or torch.tensor
352 | Prediction means -- shape(n_test, n_s)
353 | vars: ndarray or torch.tensor
354 | Prediction variances -- shape(n_test, n_s)
355 | """
356 |
357 | is_tensor = torch.is_tensor(test_x)
358 |
359 | if is_tensor:
360 | dtype = test_x.dtype
361 | device = test_x.device
362 | test_x = to_numpy(test_x)
363 |
364 | expand_dims = len(test_x.shape) == 1
365 | if expand_dims:
366 | test_x = np.expand_dims(test_x, axis=0)
367 |
368 | means = np.zeros(test_x.shape)
369 | f_std = np.zeros(test_x.shape) # standard deviation
370 |
371 | if self.disturb_estimators:
372 | # Normalize
373 | train_x_std = np.std(self.train_x, axis=0)
374 | train_y_std = np.std(self.train_y, axis=0)
375 |             test_x = test_x / (train_x_std + 1e-8)
376 | for i in range(self.n_s):
377 | prediction_ = self.disturb_estimators[i].predict(test_x)
378 | means[:, i] = prediction_['mean'] * (train_y_std[i] + 1e-8)
379 | f_std[:, i] = np.sqrt(prediction_['f_var']) * (train_y_std[i] + 1e-8)
380 |
381 | else: # zero-mean, max_sigma prior
382 | f_std = np.ones(test_x.shape)
383 | for i in range(self.n_s):
384 | f_std[:, i] *= MAX_STD[self.env.dynamics_mode][i]
385 |
386 | if expand_dims:
387 | means = means.squeeze(0)
388 | f_std = f_std.squeeze(0)
389 |
390 | return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) if is_tensor else (means, f_std)
391 |
392 | def load_disturbance_models(self, output):
393 |
394 | if output is None:
395 | return
396 |
397 | self.disturb_estimators = []
398 |
399 | try:
400 | weights = torch.load('{}/gp_models.pkl'.format(output), map_location=torch.device(self.device))
401 | self.train_x = torch.load('{}/gp_models_train_x.pkl'.format(output))
402 | self.train_y = torch.load('{}/gp_models_train_y.pkl'.format(output))
403 | for i in range(self.n_s):
404 | self.disturb_estimators.append(GPyDisturbanceEstimator(self.train_x, self.train_y[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device))
405 | self.disturb_estimators[i].model.load_state_dict(weights[i])
406 |         except Exception as e:
407 |             raise Exception('Could not load GP models from {}: {}'.format(output, e))
408 |
409 | def save_disturbance_models(self, output):
410 |
411 | if not self.disturb_estimators or self.train_x is None or self.train_y is None:
412 | return
413 | weights = []
414 | for i in range(len(self.disturb_estimators)):
415 | weights.append(self.disturb_estimators[i].model.state_dict())
416 | torch.save(weights, '{}/gp_models.pkl'.format(output))
417 | # Also save data used to fit model (needed for initializing the model before loading weights)
418 | torch.save(self.train_x, '{}/gp_models_train_x.pkl'.format(output))
419 | torch.save(self.train_y, '{}/gp_models_train_y.pkl'.format(output))
420 |
421 | def seed(self, s):
422 | torch.manual_seed(s)
423 | if torch.cuda.is_available():
424 | torch.cuda.manual_seed(s)
425 |
426 |
--------------------------------------------------------------------------------
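The disturbance target fitted by the GPs in append_transition is the residual of the one-step Euler prediction, D(x) ≈ (x' - x) / dt - f(x) - g(x) u. A standalone sketch for a single Unicycle transition (all numbers hypothetical):

import numpy as np

dt = 0.02
state = np.array([0.0, 0.0, 0.0])          # [x, y, θ]
action = np.array([1.0, 0.1])              # [v, ω]
next_state = np.array([0.021, 0.0005, 0.002])

theta = state[2]
f_x = np.zeros(3)                          # the Unicycle drift prior is zero
g_x = np.array([[np.cos(theta), 0.0],
                [np.sin(theta), 0.0],
                [0.0,           1.0]])

# x' ≈ x + dt * (f(x) + g(x) u + D(x))  =>  D(x) ≈ (x' - x) / dt - f(x) - g(x) u
disturbance = (next_state - state) / dt - f_x - g_x @ action
print(disturbance)                         # this is what gets stored in disturbance_history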
/main.py:
--------------------------------------------------------------------------------
1 | # import comet_ml at the top of your file
2 | from comet_ml import Experiment
3 |
4 | import argparse
5 | import torch
6 | import numpy as np
7 |
8 | from rcbf_sac.sac_cbf import RCBF_SAC
9 | from rcbf_sac.replay_memory import ReplayMemory
10 | from rcbf_sac.dynamics import DynamicsModel
11 | from build_env import *
12 | import os
13 |
14 | from rcbf_sac.utils import prGreen, get_output_folder, prYellow
15 | from rcbf_sac.evaluator import Evaluator
16 | from rcbf_sac.generate_rollouts import generate_model_rollouts
17 |
18 |
19 | def train(agent, env, dynamics_model, args, experiment=None):
20 |
21 | # Memory
22 | memory = ReplayMemory(args.replay_size, args.seed)
23 | memory_model = ReplayMemory(args.replay_size, args.seed)
24 |
25 | # Training Loop
26 | total_numsteps = 0
27 | updates = 0
28 |
29 | if args.use_comp:
30 | compensator_rollouts = []
31 | comp_buffer_idx = 0
32 |
33 | for i_episode in range(args.max_episodes):
34 | episode_reward = 0
35 | episode_cost = 0
36 | episode_steps = 0
37 | done = False
38 | obs = env.reset()
39 |
40 | # Saving rollout here to train compensator
41 | if args.use_comp:
42 | episode_rollout = dict()
43 | episode_rollout['obs'] = np.zeros((0, env.observation_space.shape[0]))
44 | episode_rollout['u_safe'] = np.zeros((0, env.action_space.shape[0]))
45 | episode_rollout['u_comp'] = np.zeros((0, env.action_space.shape[0]))
46 |
47 | while not done:
48 | if episode_steps % 10 == 0:
49 | prYellow('Episode {} - step {} - eps_rew {} - eps_cost {}'.format(i_episode, episode_steps, episode_reward, episode_cost))
50 | state = dynamics_model.get_state(obs)
51 |
52 | # Generate Model rollouts
53 | if args.model_based and episode_steps % 5 == 0 and len(memory) > dynamics_model.max_history_count / 3:
54 | memory_model = generate_model_rollouts(env, memory_model, memory, agent, dynamics_model,
55 | k_horizon=args.k_horizon,
56 | batch_size=min(len(memory), 5 * args.rollout_batch_size),
57 | warmup=args.start_steps > total_numsteps)
58 |
59 | # If using model-based RL then we only need to have enough data for the real portion of the replay buffer
60 | if len(memory) + len(memory_model) * args.model_based > args.batch_size:
61 |
62 | # Number of updates per step in environment
63 | for i in range(args.updates_per_step):
64 |
65 | # Update parameters of all the networks
66 | if args.model_based:
67 | # Pick the ratio of data to be sampled from the real vs model buffers
68 | real_ratio = max(min(args.real_ratio, len(memory) / args.batch_size), 1 - len(memory_model) / args.batch_size)
69 | # Update parameters of all the networks
70 | critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory,
71 | args.batch_size,
72 | updates,
73 | dynamics_model,
74 | memory_model,
75 | real_ratio)
76 | else:
77 | critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory,
78 | args.batch_size,
79 | updates,
80 | dynamics_model)
81 |
82 | if experiment:
83 | experiment.log_metric('loss/critic_1', critic_1_loss, updates)
84 | experiment.log_metric('loss/critic_2', critic_2_loss, step=updates)
85 | experiment.log_metric('loss/policy', policy_loss, step=updates)
86 | experiment.log_metric('loss/entropy_loss', ent_loss, step=updates)
87 | experiment.log_metric('entropy_temperature/alpha', alpha, step=updates)
88 | updates += 1
89 |
90 | if args.use_comp:
91 | action, action_comp, action_cbf = agent.select_action(obs, dynamics_model, warmup=args.start_steps > total_numsteps)
92 | else:
93 | action = agent.select_action(obs, dynamics_model, warmup=args.start_steps > total_numsteps) # Sample action from policy
94 |
95 | next_obs, reward, done, info = env.step(action) # Step
96 | if 'cost_exception' in info:
97 |                 prYellow('Cost exception occurred.')
98 | episode_steps += 1
99 | total_numsteps += 1
100 | episode_reward += reward
101 | episode_cost += info.get('cost', 0)
102 |
103 | # Ignore the "done" signal if it comes from hitting the time horizon.
104 | # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
105 | mask = 1 if episode_steps == env.max_episode_steps else float(not done)
106 |
107 | memory.push(obs, action, reward, next_obs, mask, t=episode_steps * env.dt, next_t=(episode_steps+1) * env.dt) # Append transition to memory
108 |
109 | # Update state and store transition for GP model learning
110 | next_state = dynamics_model.get_state(next_obs)
111 | if episode_steps % 2 == 0 and i_episode < args.gp_max_episodes: # Stop learning the dynamics after a while to stabilize learning
112 | # TODO: Clean up line below, specifically (t_batch)
113 | dynamics_model.append_transition(state, action, next_state, t_batch=np.array([episode_steps*env.dt]))
114 |
115 | # append comp rollout with step before updating
116 | if args.use_comp:
117 | episode_rollout['obs'] = np.vstack((episode_rollout['obs'], obs))
118 | episode_rollout['u_safe'] = np.vstack((episode_rollout['u_safe'], action_cbf))
119 | episode_rollout['u_comp'] = np.vstack((episode_rollout['u_comp'], action_comp))
120 |
121 | obs = next_obs
122 |
123 | # Train compensator
124 | if args.use_comp and i_episode < args.comp_train_episodes:
125 |             if len(compensator_rollouts) < 50:  # TODO: Turn the 50 into an arg
126 |                 compensator_rollouts.append(episode_rollout)
127 |             else:
128 |                 compensator_rollouts[comp_buffer_idx] = episode_rollout
129 |                 comp_buffer_idx = (comp_buffer_idx + 1) % 50
130 | if i_episode % args.comp_update_episode == 0:
131 | agent.update_parameters_compensator(compensator_rollouts)
132 |
133 | # [optional] save intermediate model
134 | if i_episode % int(args.max_episodes / 10) == 0:
135 | agent.save_model(args.output)
136 | dynamics_model.save_disturbance_models(args.output)
137 |
138 | if experiment:
139 | # Comet.ml logging
140 | experiment.log_metric('reward/train', episode_reward, step=i_episode)
141 | experiment.log_metric('cost/train', episode_cost, step=i_episode)
142 | prGreen("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}, cost: {}".format(i_episode, total_numsteps,
143 | episode_steps,
144 | round(episode_reward, 2), round(episode_cost, 2)))
145 |
146 | # Evaluation
147 | if i_episode % 5 == 0 and args.eval is True:
148 | print('Size of replay buffers: real : {}, \t\t model : {}'.format(len(memory), len(memory_model)))
149 | avg_reward = 0.
150 | avg_cost = 0.
151 | episodes = 5
152 | for _ in range(episodes):
153 | obs = env.reset()
154 | episode_reward = 0
155 | episode_cost = 0
156 | done = False
157 | while not done:
158 | if args.use_comp:
159 | action, _, _ = agent.select_action(obs, dynamics_model, evaluate=True)
160 | else:
161 | action = agent.select_action(obs, dynamics_model, evaluate=True) # Sample action from policy
162 | next_obs, reward, done, info = env.step(action)
163 | episode_reward += reward
164 | episode_cost += info.get('cost', 0)
165 | obs = next_obs
166 |
167 | avg_reward += episode_reward
168 | avg_cost += episode_cost
169 | avg_reward /= episodes
170 | avg_cost /= episodes
171 | if experiment:
172 | experiment.log_metric('avg_reward/test', avg_reward, step=i_episode)
173 | experiment.log_metric('avg_cost/test', avg_cost, step=i_episode)
174 |
175 | print("----------------------------------------")
176 | print("Test Episodes: {}, Avg. Reward: {}, Avg. Cost: {}".format(episodes, round(avg_reward, 2), round(avg_cost, 2)))
177 | print("----------------------------------------")
178 |
179 |
180 | def test(agent, env, dynamics_model, evaluate, model_path, visualize=True, debug=False):
181 |
182 | agent.load_weights(model_path)
183 | dynamics_model.load_disturbance_models(model_path)
184 |
185 | def policy(observation):
186 | if args.use_comp:
187 | action, action_comp, action_cbf = agent.select_action(observation, dynamics_model, evaluate=True)
188 | else:
189 |             action = agent.select_action(observation, dynamics_model, evaluate=True)
190 | return action
191 |
192 | evaluate(env, policy, dynamics_model=dynamics_model, debug=debug, visualize=visualize, save=False)
193 |
194 |
195 | if __name__ == "__main__":
196 |
197 | parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
198 | # Environment Args
199 | parser.add_argument('--env-name', default="SimulatedCars", help='Options are Unicycle or SimulatedCars.')
200 | # Comet ML
201 |     parser.add_argument('--log_comet', action='store_true', dest='log_comet', help="Whether to log the run to comet.ml")
202 | parser.add_argument('--comet_key', default='', help='Comet API key')
203 | parser.add_argument('--comet_workspace', default='', help='Comet workspace')
204 | # SAC Args
205 | parser.add_argument('--mode', default='train', type=str, help='support option: train/test')
206 |     parser.add_argument('--visualize', action='store_true', dest='visualize', help='visualize env (only available in test mode)')
207 |     parser.add_argument('--output', default='output', type=str, help='Output directory for saved models and logs')
208 | parser.add_argument('--policy', default="Gaussian",
209 | help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
210 | parser.add_argument('--eval', type=bool, default=True,
211 |                         help='Evaluates the policy every 5 episodes (default: True)')
212 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
213 | help='discount factor for reward (default: 0.99)')
214 | parser.add_argument('--tau', type=float, default=0.005, metavar='G',
215 | help='target smoothing coefficient(τ) (default: 0.005)')
216 | parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
217 | help='learning rate (default: 0.0003)')
218 | parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
219 | help='Temperature parameter α determines the relative importance of the entropy\
220 | term against the reward (default: 0.2)')
221 | parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G',
222 |                         help='Automatically adjust α (default: True)')
223 | parser.add_argument('--seed', type=int, default=12345, metavar='N',
224 | help='random seed (default: 12345)')
225 | parser.add_argument('--batch_size', type=int, default=256, metavar='N',
226 | help='batch size (default: 256)')
227 | parser.add_argument('--max_episodes', type=int, default=400, metavar='N',
228 | help='maximum number of episodes (default: 400)')
229 | parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
230 | help='hidden size (default: 256)')
231 | parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
232 | help='model updates per simulator step (default: 1)')
233 | parser.add_argument('--start_steps', type=int, default=5000, metavar='N',
234 |                         help='Steps sampling random actions (default: 5000)')
235 | parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
236 | help='Value target update per no. of updates per step (default: 1)')
237 | parser.add_argument('--replay_size', type=int, default=10000000, metavar='N',
238 | help='size of replay buffer (default: 10000000)')
239 | parser.add_argument('--cuda', action="store_true",
240 | help='run on CUDA (default: False)')
241 | parser.add_argument('--device_num', type=int, default=0, help='Select GPU number for CUDA (default: 0)')
242 | parser.add_argument('--resume', default='default', type=str, help='Resuming model path for testing')
243 |     parser.add_argument('--validate_episodes', default=5, type=int, help='how many episodes to perform during a validation experiment')
244 |     parser.add_argument('--validate_steps', default=1000, type=int, help='how many steps to perform during a validation experiment')
245 | # CBF, Dynamics, Env Args
246 |     parser.add_argument('--no_diff_qp', action='store_false', dest='diff_qp', help='If set, do not differentiate through the CBF-QP layer.')
247 |     parser.add_argument('--gp_model_size', default=3000, type=int, help='Maximum number of data points kept by the GP disturbance model.')
248 | parser.add_argument('--gp_max_episodes', default=100, type=int, help='gp max train episodes.')
249 | parser.add_argument('--k_d', default=3.0, type=float)
250 | parser.add_argument('--gamma_b', default=20, type=float)
251 | parser.add_argument('--l_p', default=0.03, type=float,
252 | help="Look-ahead distance for unicycle dynamics output.")
253 | # Model Based Learning
254 | parser.add_argument('--model_based', action='store_true', dest='model_based', help='If selected, will use data from the model to train the RL agent.')
255 | parser.add_argument('--real_ratio', default=0.3, type=float, help='Portion of data obtained from real replay buffer for training.')
256 | parser.add_argument('--k_horizon', default=1, type=int, help='horizon of model-based rollouts')
257 | parser.add_argument('--rollout_batch_size', default=5, type=int, help='Size of initial states batch to rollout from.')
258 | # Compensator
259 | parser.add_argument('--comp_rate', default=0.005, type=float, help='Compensator learning rate')
260 | parser.add_argument('--comp_train_episodes', default=200, type=int, help='Number of initial episodes to train compensator for.')
261 | parser.add_argument('--comp_update_episode', default=50, type=int, help='Modulo for compensator updates')
262 | parser.add_argument('--use_comp', type=bool, default=False, help='Should the compensator be used.')
263 | args = parser.parse_args()
264 |
265 | if args.mode == 'train':
266 | args.output = get_output_folder(args.output, args.env_name)
267 | if args.resume == 'default':
268 | args.resume = os.getcwd() + '/output/{}-run0'.format(args.env_name)
269 | elif args.resume.isnumeric():
270 | args.resume = os.getcwd() + '/output/{}-run{}'.format(args.env_name, args.resume)
271 |
272 | if args.cuda:
273 | torch.cuda.set_device(args.device_num)
274 |
275 | if args.mode == 'train' and args.log_comet:
276 | project_name = 'sac-rcbf-unicycle-environment' if args.env_name == 'Unicycle' else 'sac-rcbf-sim-cars-env'
277 | prYellow('Logging experiment on comet.ml!')
278 | # Create an experiment with your api key
279 | experiment = Experiment(
280 | api_key=args.comet_key,
281 | project_name=project_name,
282 | workspace=args.comet_workspace,
283 | )
284 | # Log args on comet.ml
285 | experiment.log_parameters(vars(args))
286 | experiment_tags = ['MB' if args.model_based else 'MF',
287 | str(args.batch_size) + '_batch',
288 | str(args.updates_per_step) + '_step_updates',
289 | 'diff_qp' if args.diff_qp else 'reg_qp']
290 | if args.use_comp:
291 | experiment_tags.append('use_comp')
292 | print(experiment_tags)
293 | experiment.add_tags(experiment_tags)
294 | else:
295 | experiment = None
296 |
297 | if args.use_comp and (args.model_based or args.diff_qp):
298 |         raise Exception('Compensator can only be used with model-free RL and the regular (non-differentiable) CBF.')
299 |
300 | # Environment
301 | env = build_env(args)
302 |
303 | # Agent
304 | agent = RCBF_SAC(env.observation_space.shape[0], env.action_space, env, args)
305 | dynamics_model = DynamicsModel(env, args)
306 |
307 | # Random Seed
308 | if args.seed > 0:
309 | env.seed(args.seed)
310 | env.action_space.seed(args.seed)
311 | torch.manual_seed(args.seed)
312 | np.random.seed(args.seed)
313 | dynamics_model.seed(args.seed)
314 |
315 | # If model based, we warm up in the model too
316 | if args.model_based:
317 | args.start_steps /= (1 + args.rollout_batch_size)
318 |
319 | if args.mode == 'train':
320 | train(agent, env, dynamics_model, args, experiment)
321 | elif args.mode == 'test':
322 | evaluate = Evaluator(args.validate_episodes, args.validate_steps, args.output)
323 | test(agent, env, dynamics_model, evaluate, args.resume, visualize=args.visualize, debug=True)
324 |
325 | env.close()
326 |
327 |
--------------------------------------------------------------------------------
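Note on the model-based updates in main.py above: `agent.update_parameters` receives `real_ratio`, which caps how much of each SAC batch is drawn from the real replay buffer versus the model-generated one. The helper below is a minimal sketch of how such a ratio could be turned into per-batch sample counts; `split_batch` is a hypothetical name, and the exact split performed inside `agent.update_parameters` may differ.

# Hypothetical helper (not part of the repo): convert a real/model ratio into sample counts.
def split_batch(batch_size, real_ratio, len_real, len_model):
    """Return (n_real, n_model) transition counts for one mixed SAC update."""
    n_real = min(int(round(real_ratio * batch_size)), len_real)
    n_model = min(batch_size - n_real, len_model)
    return n_real, n_model

# Example: with batch_size=256 and real_ratio=0.3 this yields 77 real and 179 model transitions.
print(split_batch(256, 0.3, len_real=10_000, len_model=50_000))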
/rcbf_sac/diff_cbf_qp.py:
--------------------------------------------------------------------------------
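A note on the math implemented in the SimulatedCars branch of `get_cbf_qp_constraints` below: the barriers depend on positions while the control enters through accelerations, so the constraint has relative degree two. Slack term aside, each barrier h_{1j} contributes a row of the form

    L_f^2 h_{1j}(x) - |L_{f_D} L_f h_{1j}(x)| + L_g L_f h_{1j}(x)\,u + 2\gamma_b\,\dot{h}_{1j}(x) + \gamma_b^2\,h_{1j}(x) \ge 0,

with the u-dependent term split between G and h because the QP variable is the correction added to the nominal action. This is a reading of the code that follows, not a statement taken from the accompanying paper.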
1 | import argparse
2 | import numpy as np
3 | import torch
4 | from rcbf_sac.dynamics import DYNAMICS_MODE
5 | from rcbf_sac.utils import to_tensor, prRed
6 | from time import time
7 | from qpth.qp import QPFunction
8 |
9 |
10 | class CBFQPLayer:
11 |
12 | def __init__(self, env, args, gamma_b=100, k_d=1.5, l_p=0.03):
13 | """Constructor of CBFLayer.
14 |
15 | Parameters
16 | ----------
17 | env : gym.env
18 | Gym environment.
19 | gamma_b : float, optional
20 | gamma of control barrier certificate.
21 | k_d : float, optional
22 | confidence parameter desired (2.0 corresponds to ~95% for example).
23 | """
24 |
25 | self.device = torch.device("cuda" if args.cuda else "cpu")
26 |
27 | self.env = env
28 | self.u_min, self.u_max = self.get_control_bounds()
29 | self.gamma_b = gamma_b
30 |
31 | if self.env.dynamics_mode not in DYNAMICS_MODE:
32 | raise Exception('Dynamics mode not supported.')
33 |
34 | if self.env.dynamics_mode == 'Unicycle':
35 | self.num_cbfs = len(env.hazards_locations)
36 | self.k_d = k_d
37 | self.l_p = l_p
38 | elif self.env.dynamics_mode == 'SimulatedCars':
39 | self.num_cbfs = 2
40 |
41 | self.action_dim = env.action_space.shape[0]
42 | self.num_ineq_constraints = self.num_cbfs + 2 * self.action_dim
43 |
44 | def get_safe_action(self, state_batch, action_batch, mean_pred_batch, sigma_batch):
45 | """
46 |
47 | Parameters
48 | ----------
49 | state_batch : torch.tensor or ndarray
50 |             State batch
51 |         action_batch : torch.tensor or ndarray
52 | mean_pred_batch : torch.tensor or ndarray
53 | Mean of disturbance
54 | sigma_batch : torch.tensor or ndarray
55 | Standard deviation of disturbance
56 |
57 | Returns
58 | -------
59 | final_action_batch : torch.tensor
60 | Safe actions to take in the environment.
61 | """
62 |
63 | # batch form if only a single data point is passed
64 | expand_dims = len(state_batch.shape) == 1
65 | if expand_dims:
66 | action_batch = action_batch.unsqueeze(0)
67 | state_batch = state_batch.unsqueeze(0)
68 | mean_pred_batch = mean_pred_batch.unsqueeze(0)
69 | sigma_batch = sigma_batch.unsqueeze(0)
70 |
71 | start_time = time()
72 | Ps, qs, Gs, hs = self.get_cbf_qp_constraints(state_batch, action_batch, mean_pred_batch, sigma_batch)
73 | build_qp_time = time()
74 | safe_action_batch = self.solve_qp(Ps, qs, Gs, hs)
75 | # prCyan('Time to get constraints = {} - Time to solve QP = {} - time per qp = {} - batch_size = {} - device = {}'.format(build_qp_time - start_time, time() - build_qp_time, (time() - build_qp_time) / safe_action_batch.shape[0], Ps.shape[0], Ps.device))
76 | # The actual safe action is the cbf action + the nominal action
77 | final_action = torch.clamp(action_batch + safe_action_batch, self.u_min.repeat(action_batch.shape[0], 1), self.u_max.repeat(action_batch.shape[0], 1))
78 |
79 | return final_action if not expand_dims else final_action.squeeze(0)
80 |
81 | def solve_qp(self, Ps: torch.Tensor, qs: torch.Tensor, Gs: torch.Tensor, hs: torch.Tensor):
82 | """Solves:
83 |             minimize_{u,eps} 0.5 * [u,eps]^T P [u,eps] + q^T [u,eps]
84 |             subject to G [u,eps] <= h
85 |
86 | Parameters
87 | ----------
88 | Ps : torch.Tensor
89 | (batch_size, n_u+1, n_u+1)
90 | qs : torch.Tensor
91 | (batch_size, n_u+1)
92 | Gs : torch.Tensor
93 | (batch_size, num_ineq_constraints, n_u+1)
94 | hs : torch.Tensor
95 | (batch_size, num_ineq_constraints)
96 | Returns
97 | -------
98 | safe_action_batch : torch.tensor
99 |             The solution of the QP without the last variable (the slack).
100 | """
101 |
102 |
103 | Ghs = torch.cat((Gs, hs.unsqueeze(2)), -1)
104 | Ghs_norm = torch.max(torch.abs(Ghs), dim=2, keepdim=True)[0]
105 | Gs /= Ghs_norm
106 | hs = hs / Ghs_norm.squeeze(-1)
107 | sol = self.cbf_layer(Ps, qs, Gs, hs, solver_args={"check_Q_spd": False, "maxIter": 100000, "notImprovedLim": 10, "eps": 1e-4})
108 | safe_action_batch = sol[:, :-1]
109 | return safe_action_batch
110 |
111 | def cbf_layer(self, Qs, ps, Gs, hs, As=None, bs=None, solver_args=None):
112 | """
113 |
114 | Parameters
115 | ----------
116 | Qs : torch.Tensor
117 | ps : torch.Tensor
118 | Gs : torch.Tensor
119 | shape (batch_size, num_ineq_constraints, num_vars)
120 | hs : torch.Tensor
121 | shape (batch_size, num_ineq_constraints)
122 | As : torch.Tensor, optional
123 | bs : torch.Tensor, optional
124 | solver_args : dict, optional
125 |
126 | Returns
127 | -------
128 | result : torch.Tensor
129 | Result of QP
130 | """
131 |
132 | if solver_args is None:
133 | solver_args = {}
134 |
135 | if As is None or bs is None:
136 | As = torch.Tensor().to(self.device).double()
137 | bs = torch.Tensor().to(self.device).double()
138 |
139 | result = QPFunction(verbose=0, **solver_args)(Qs.double(), ps.double(), Gs.double(), hs.double(), As, bs).float()
140 |
141 | if torch.any(torch.isnan(result)):
142 | prRed('QP Failed to solve - result is nan == {}!'.format(torch.any(torch.isnan(result))))
143 | raise Exception('QP Failed to solve')
144 | return result
145 |
146 | def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sigma_pred_batch):
147 | """Build up matrices required to solve qp
148 | Program specifically solves:
149 |             minimize_{u,eps} 0.5 * [u,eps]^T P [u,eps] + q^T [u,eps]
150 |             subject to G [u,eps] <= h
151 |
152 | Each control barrier certificate is of the form:
153 |             dh/dx^T (f_out + g_out u) >= -gamma_b * h_out^3, where the subscript "out" denotes the output coordinates of the state.
154 |
155 | In the case of SafetyGym_point dynamics:
156 | state = [x y θ v ω]
157 |             state_d = [v*cos(θ) v*sin(θ) ω u^v u^ω]
158 |
159 | Quick Note on batch matrix multiplication for matrices A and B:
160 | - Batch size should be first dim
161 | - Everything needs to be 3-dimensional
162 | - E.g. if B is a vec, i.e. shape (batch_size, vec_length) --> .view(batch_size, vec_length, 1)
163 |
164 | Parameters
165 | ----------
166 | state_batch : torch.tensor
167 | current state (check dynamics.py for details on each dynamics' specifics)
168 | action_batch : torch.tensor
169 | Nominal control input.
170 | mean_pred_batch : torch.tensor
171 |             Mean disturbance prediction on the state, shape (batch_size, n_s).
172 | sigma_pred_batch : torch.tensor
173 | standard deviation in additive disturbance after undergoing the output dynamics.
174 | gamma_b : float, optional
175 | CBF parameter for the class-Kappa function
176 |
177 | Returns
178 | -------
179 | P : torch.tensor
180 | Quadratic cost matrix in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u)
181 | q : torch.tensor
182 | Linear cost vector in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u)
183 | G : torch.tensor
184 | Inequality constraint matrix (G[u,eps] <= h) of size (num_constraints, n_u + 1)
185 | h : torch.tensor
186 | Inequality constraint vector (G[u,eps] <= h) of size (num_constraints,)
187 | """
188 |
189 |         assert len(state_batch.shape) == 2 and len(action_batch.shape) == 2 and len(mean_pred_batch.shape) == 2 and len(
190 |             sigma_pred_batch.shape) == 2, 'Expected 2D batches, got shapes: {} {} {} {}'.format(state_batch.shape, action_batch.shape, mean_pred_batch.shape,
191 |                                                                                                 sigma_pred_batch.shape)
192 |
193 | batch_size = state_batch.shape[0]
194 | gamma_b = self.gamma_b
195 |
196 | # Expand dims
197 | state_batch = torch.unsqueeze(state_batch, -1)
198 | action_batch = torch.unsqueeze(action_batch, -1)
199 | mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1)
200 | sigma_pred_batch = torch.unsqueeze(sigma_pred_batch, -1)
201 |
202 | if self.env.dynamics_mode == 'Unicycle':
203 |
204 | num_cbfs = self.num_cbfs
205 | hazards_radius = self.env.hazards_radius
206 | hazards_locations = to_tensor(self.env.hazards_locations, torch.FloatTensor, self.device)
207 | collision_radius = 1.2 * hazards_radius # add a little buffer
208 | l_p = self.l_p
209 |
210 | thetas = state_batch[:, 2, :].squeeze(-1)
211 | c_thetas = torch.cos(thetas)
212 | s_thetas = torch.sin(thetas)
213 |
214 | # p(x): lookahead output (batch_size, 2)
215 | ps = torch.zeros((batch_size, 2)).to(self.device)
216 | ps[:, 0] = state_batch[:, 0, :].squeeze(-1) + l_p * c_thetas
217 | ps[:, 1] = state_batch[:, 1, :].squeeze(-1) + l_p * s_thetas
218 |
219 | # p_dot(x) = f_p(x) + g_p(x)u + D_p where f_p(x) = 0, g_p(x) = RL and D_p is the disturbance
220 |
221 | # f_p(x) = [0,...,0]^T
222 | f_ps = torch.zeros((batch_size, 2, 1)).to(self.device)
223 |
224 | # g_p(x) = RL where L = diag([1, l_p])
225 | Rs = torch.zeros((batch_size, 2, 2)).to(self.device)
226 | Rs[:, 0, 0] = c_thetas
227 | Rs[:, 0, 1] = -s_thetas
228 | Rs[:, 1, 0] = s_thetas
229 | Rs[:, 1, 1] = c_thetas
230 | Ls = torch.zeros((batch_size, 2, 2)).to(self.device)
231 | Ls[:, 0, 0] = 1
232 | Ls[:, 1, 1] = l_p
233 | g_ps = torch.bmm(Rs, Ls) # (batch_size, 2, 2)
234 |
235 | # D_p(x) = g_p [0 D_θ]^T + [D_x1 D_x2]^T
236 | mu_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device)
237 | mu_theta_aug[:, 1, :] = mean_pred_batch[:, 2, :]
238 | mu_ps = torch.bmm(g_ps, mu_theta_aug) + mean_pred_batch[:, :2, :]
239 | sigma_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device)
240 | sigma_theta_aug[:, 1, :] = sigma_pred_batch[:, 2, :]
241 | sigma_ps = torch.bmm(torch.abs(g_ps), sigma_theta_aug) + sigma_pred_batch[:, :2, :]
242 |
243 | # hs (batch_size, hazards_locations)
244 | ps_hzds = ps.repeat((1, num_cbfs)).reshape((batch_size, num_cbfs, 2))
245 |
246 | hs = 0.5 * (torch.sum((ps_hzds - hazards_locations.view(1, num_cbfs, -1)) ** 2, axis=2) - collision_radius ** 2) # 1/2 * (||x - x_obs||^2 - r^2)
247 |
248 | dhdps = (ps_hzds - hazards_locations.view(1, num_cbfs, -1)) # (batch_size, n_cbfs, 2)
249 | # (batch_size, 2, 1)
250 | n_u = action_batch.shape[1] # dimension of control inputs
251 | num_constraints = num_cbfs + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them)
252 |
253 | # Inequality constraints (G[u, eps] <= h)
254 | G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible)
255 | h = torch.zeros((batch_size, num_constraints)).to(self.device)
256 | ineq_constraint_counter = 0
257 |
258 | # Add inequality constraints
259 |             G[:, :num_cbfs, :n_u] = -torch.bmm(dhdps, g_ps)  # -(dh/dp)^T g_p(x) for each barrier
260 | G[:, :num_cbfs, n_u] = -1 # for slack
261 | h[:, :num_cbfs] = gamma_b * (hs ** 3) + (torch.bmm(dhdps, f_ps + mu_ps) - torch.bmm(torch.abs(dhdps), sigma_ps) + torch.bmm(torch.bmm(dhdps, g_ps), action_batch)).squeeze(-1)
262 | ineq_constraint_counter += num_cbfs
263 |
264 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack
265 | P = torch.diag(torch.tensor([1.e0, 1.e-2, 1e5])).repeat(batch_size, 1, 1).to(self.device)
266 | q = torch.zeros((batch_size, n_u + 1)).to(self.device)
267 |
268 | elif self.env.dynamics_mode == 'SimulatedCars':
269 |
270 | n_u = action_batch.shape[1] # dimension of control inputs
271 | num_constraints = self.num_cbfs + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them)
272 | collision_radius = 3.5
273 |
274 | # Inequality constraints (G[u, eps] <= h)
275 | G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible)
276 | h = torch.zeros((batch_size, num_constraints)).to(self.device)
277 | ineq_constraint_counter = 0
278 |
279 | # Current State
280 | pos = state_batch[:, ::2, 0]
281 | vels = state_batch[:, 1::2, 0]
282 |
283 | # Action (acceleration)
284 | vels_des = 30.0 * torch.ones((state_batch.shape[0], 5)).to(self.device) # Desired velocities
285 | # vels_des[:, 0] -= 10 * torch.sin(0.2 * t_batch)
286 | accels = self.env.kp * (vels_des - vels)
287 | accels[:, 1] -= self.env.k_brake * (pos[:, 0] - pos[:, 1]) * ((pos[:, 0] - pos[:, 1]) < 6.0)
288 | accels[:, 2] -= self.env.k_brake * (pos[:, 1] - pos[:, 2]) * ((pos[:, 1] - pos[:, 2]) < 6.0)
289 | accels[:, 3] = 0.0 # Car 4's acceleration is controlled directly
290 | accels[:, 4] -= self.env.k_brake * (pos[:, 2] - pos[:, 4]) * ((pos[:, 2] - pos[:, 4]) < 13.0)
291 |
292 | # f(x)
293 | f_x = torch.zeros((state_batch.shape[0], state_batch.shape[1])).to(self.device)
294 | f_x[:, ::2] = vels
295 | f_x[:, 1::2] = accels
296 |
297 | # f_D(x) - disturbance in the drift dynamics
298 | fD_x = torch.zeros((state_batch.shape[0], state_batch.shape[1])).to(self.device)
299 | fD_x[:, 1::2] = sigma_pred_batch[:, 1::2, 0].squeeze(-1)
300 |
301 | # g(x)
302 | g_x = torch.zeros((state_batch.shape[0], state_batch.shape[1], 1)).to(self.device)
303 | g_x[:, 7, 0] = 50.0 # Car 4's acceleration
304 |
305 | # h1
306 | h13 = 0.5 * (((pos[:, 2] - pos[:, 3]) ** 2) - collision_radius ** 2)
307 | h15 = 0.5 * (((pos[:, 4] - pos[:, 3]) ** 2) - collision_radius ** 2)
308 |
309 | # dh1/dt = Lfh1
310 | h13_dot = (pos[:, 3] - pos[:, 2]) * (vels[:, 3] - vels[:, 2])
311 | h15_dot = (pos[:, 3] - pos[:, 4]) * (vels[:, 3] - vels[:, 4])
312 |
313 | # Lffh1
314 | dLfh13dx = torch.zeros((batch_size, 10)).to(self.device)
315 | dLfh13dx[:, 4] = (vels[:, 2] - vels[:, 3]) # Car 3 pos
316 | dLfh13dx[:, 5] = (pos[:, 2] - pos[:, 3]) # Car 3 vel
317 |             dLfh13dx[:, 6] = (vels[:, 3] - vels[:, 2])  # Car 4 pos
318 |             dLfh13dx[:, 7] = (pos[:, 3] - pos[:, 2])  # Car 4 vel
319 | Lffh13 = torch.bmm(dLfh13dx.view(batch_size, 1, -1), f_x.view(batch_size, -1, 1)).squeeze()
320 | LfDfh13 = torch.bmm(torch.abs(dLfh13dx.view(batch_size, 1, -1)), fD_x.view(batch_size, -1, 1)).squeeze()
321 |
322 | dLfh15dx = torch.zeros((batch_size, 10)).to(self.device)
323 | dLfh15dx[:, 8] = (vels[:, 4] - vels[:, 3]) # Car 5 pos
324 |             dLfh15dx[:, 9] = (pos[:, 4] - pos[:, 3])  # Car 5 vel
325 |             dLfh15dx[:, 6] = (vels[:, 3] - vels[:, 4])  # Car 4 pos
326 |             dLfh15dx[:, 7] = (pos[:, 3] - pos[:, 4])  # Car 4 vel
327 | Lffh15 = torch.bmm(dLfh15dx.view(batch_size, 1, -1), f_x.view(batch_size, -1, 1)).squeeze()
328 | LfDfh15 = torch.bmm(torch.abs(dLfh15dx.view(batch_size, 1, -1)), fD_x.view(batch_size, -1, 1)).squeeze()
329 |
330 |             # LgLfh1
331 | Lgfh13 = torch.bmm(dLfh13dx.view(batch_size, 1, -1), g_x)
332 | Lgfh15 = torch.bmm(dLfh15dx.view(batch_size, 1, -1), g_x)
333 |
334 | # print('state = {}'.format(state_batch))
335 | # print('f_x = {}'.format(f_x))
336 | # print('g_x = {}'.format(g_x))
337 | # print('h13 = {}'.format(h13))
338 | # print('h15 = {}'.format(h15))
339 | # print('gamma_b = {}'.format(self.gamma_b))
340 | # print('h13_dot = {}'.format(h13_dot))
341 | # print('h15_dot = {}'.format(h15_dot))
342 | # print('Lffh13 = {}'.format(Lffh13))
343 | # print('Lffh15 = {}'.format(Lffh15))
344 | # print('Lgfh13 = {}'.format(Lgfh13))
345 | # print('Lgfh15 = {}'.format(Lgfh15))
346 |
347 | # Inequality constraints (G[u, eps] <= h)
348 |             h[:, 0] = Lffh13 - LfDfh13 + 2 * gamma_b * h13_dot + gamma_b * gamma_b * h13 + torch.bmm(Lgfh13, action_batch).squeeze()
349 |             h[:, 1] = Lffh15 - LfDfh15 + 2 * gamma_b * h15_dot + gamma_b * gamma_b * h15 + torch.bmm(Lgfh15, action_batch).squeeze()
350 | G[:, 0, 0] = -Lgfh13.squeeze()
351 | G[:, 1, 0] = -Lgfh15.squeeze()
352 | G[:, :self.num_cbfs, n_u] = -2e2 # for slack
353 | ineq_constraint_counter += self.num_cbfs
354 |
355 | # Let's also build the cost matrices, vectors to minimize control effort and penalize slack
356 | P = torch.diag(torch.tensor([0.1, 1e1])).repeat(batch_size, 1, 1).to(self.device)
357 | q = torch.zeros((batch_size, n_u + 1)).to(self.device)
358 |
359 | else:
360 | raise Exception('Dynamics mode unknown!')
361 |
362 | # Second let's add actuator constraints
363 | n_u = action_batch.shape[1] # dimension of control inputs
364 |
365 | for c in range(n_u):
366 |
367 | # u_max >= u_nom + u ---> u <= u_max - u_nom
368 | if self.u_max is not None:
369 | G[:, ineq_constraint_counter, c] = 1
370 | h[:, ineq_constraint_counter] = self.u_max[c] - action_batch[:, c].squeeze(-1)
371 | ineq_constraint_counter += 1
372 |
373 | # u_min <= u_nom + u ---> -u <= u_min - u_nom
374 | if self.u_min is not None:
375 | G[:, ineq_constraint_counter, c] = -1
376 | h[:, ineq_constraint_counter] = -self.u_min[c] + action_batch[:, c].squeeze(-1)
377 | ineq_constraint_counter += 1
378 |
379 | return P, q, G, h
380 |
381 | def get_control_bounds(self):
382 | """
383 |
384 | Returns
385 | -------
386 | u_min : torch.tensor
387 | min control input.
388 | u_max : torch.tensor
389 | max control input.
390 | """
391 |
392 | u_min = torch.tensor(self.env.safe_action_space.low).to(self.device)
393 | u_max = torch.tensor(self.env.safe_action_space.high).to(self.device)
394 |
395 | return u_min, u_max
396 |
397 |
398 | if __name__ == "__main__":
399 |
400 | from build_env import build_env
401 | from rcbf_sac.dynamics import DynamicsModel
402 | from copy import deepcopy
403 | from rcbf_sac.utils import to_numpy, prGreen
404 |
405 |
406 | def simple_controller(env, state, goal):
407 | goal_xy = goal[:2]
408 | goal_dist = -np.log(goal[2]) # the observation is np.exp(-goal_dist)
409 | v = 0.02 * goal_dist
410 | relative_theta = 1.0 * np.arctan2(goal_xy[1], goal_xy[0])
411 | omega = 1.0 * relative_theta
412 |
413 | return np.clip(np.array([v, omega]), env.action_space.low, env.action_space.high)
414 |
415 |
416 | parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
417 | # Environment Args
418 | parser.add_argument('--env-name', default="SafetyGym", help='Options are Unicycle or SafetyGym')
419 | parser.add_argument('--robot_xml', default='xmls/point.xml',
420 | help="SafetyGym Currently only supporting xmls/point.xml")
421 | parser.add_argument('--k_d', default=3.0, type=float)
422 | parser.add_argument('--gamma_b', default=100, type=float)
423 | parser.add_argument('--l_p', default=0.03, type=float)
424 |     parser.add_argument('--gp_model_size', default=2000, type=int, help='Maximum number of data points kept by the GP disturbance model.')
425 | parser.add_argument('--cuda', action='store_true', help='run on CUDA (default: False)')
426 | args = parser.parse_args()
427 | # Environment
428 | env = build_env(args)
429 |
430 | device = torch.device('cuda' if args.cuda else 'cpu')
431 |
432 |
433 | def to_def_tensor(ndarray):
434 |
435 | return to_tensor(ndarray, torch.FloatTensor, device)
436 |
437 |
438 | diff_cbf_layer = CBFQPLayer(env, args, args.gamma_b, args.k_d, args.l_p)
439 | dynamics_model = DynamicsModel(env, args)
440 |
441 | obs = env.reset()
442 | done = False
443 |
444 | ep_ret = 0
445 | ep_cost = 0
446 | ep_step = 0
447 |
448 | for i_step in range(3000):
449 |
450 | if done:
451 | prGreen('Episode Return: %.3f \t Episode Cost: %.3f' % (ep_ret, ep_cost))
452 | ep_ret, ep_cost, ep_step = 0, 0, 0
453 | obs = env.reset()
454 |
455 | state = dynamics_model.get_state(obs)
456 |
457 | print('state = {}, dist2hazards = {}'.format(state[:2],
458 | np.sqrt(np.sum((env.hazards_locations - state[:2]) ** 2, 1))))
459 |
460 | disturb_mean, disturb_std = dynamics_model.predict_disturbance(state)
461 |
462 |         action = simple_controller(env, state, obs[-3:])  # TODO: assumes the last 3 entries of the observation encode the goal
463 | # action = 2*np.random.rand(2) - 1.0
464 | assert env.action_space.contains(action)
465 | final_action = diff_cbf_layer.get_safe_action(to_def_tensor(state), to_def_tensor(action),
466 | to_def_tensor(disturb_mean), to_def_tensor(disturb_std))
467 | final_action = to_numpy(final_action)
468 |
469 | # Env Step
470 | observation2, reward, done, info = env.step(final_action)
471 | observation2 = deepcopy(observation2)
472 |
473 | # Update state and store transition for GP model learning
474 | next_state = dynamics_model.get_state(observation2)
475 | if ep_step % 2 == 0:
476 | dynamics_model.append_transition(state, final_action, next_state)
477 |
478 | # print('reward', reward)
479 | ep_ret += reward
480 | ep_cost += info.get('cost', 0)
481 | ep_step += 1
482 | # env.render()
483 |
484 | obs = observation2
485 | state = next_state
--------------------------------------------------------------------------------
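For reference, the QPs assembled above all share the generic form minimize 0.5 [u,eps]^T P [u,eps] subject to G [u,eps] <= h, where the last decision variable is the feasibility slack. The snippet below is a self-contained sketch with made-up numbers (not taken from the repo) showing qpth solving one such problem: a single control input bounded by |u| <= 0.5 and one CBF-style row u + eps >= 1, so the heavily penalized slack absorbs what the actuator bound cannot.

import torch
from qpth.qp import QPFunction

# Decision variable z = [u, eps]; the slack eps is penalized much more heavily than u.
P = torch.diag(torch.tensor([1.0, 1e3])).unsqueeze(0).double()  # (1, 2, 2)
q = torch.zeros(1, 2).double()
G = torch.tensor([[[-1.0, -1.0],                 # -(u + eps) <= -1  <=>  u + eps >= 1 (CBF-style row)
                   [ 1.0,  0.0],                 #  u <= 0.5 (actuator upper bound)
                   [-1.0,  0.0]]]).double()      # -u <= 0.5 (actuator lower bound)
h = torch.tensor([[-1.0, 0.5, 0.5]]).double()
e = torch.Tensor().double()                      # no equality constraints

z = QPFunction(verbose=0)(P, q, G, h, e, e)
print(z)  # approximately [[0.5, 0.5]]: u saturates at its bound and the slack covers the rest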