├── Pendulum ├── agents │ ├── __init__.py │ ├── TruncatedNormal.py │ ├── agents.py │ └── ppo.py ├── gym_models │ ├── envs │ │ ├── assets │ │ │ └── clockwise.png │ │ ├── __init__.py │ │ ├── double_integrator.py │ │ └── pendulum.py │ └── __init__.py ├── __init__.py ├── setup.py ├── config.yml └── Testing-pendulum.py ├── gym_models ├── envs │ └── __init__.py └── __init__.py ├── experiments ├── sampling_beta │ ├── config.yml │ ├── Testing-beta-sampling.py │ └── ppo.py └── projection_guassian │ ├── config.yml │ ├── Testing-projection.py │ └── ppo_proj.py ├── setup.py ├── README.md └── agents ├── agents.py └── ppo.py /Pendulum/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym_models/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym_models.envs.quad_gym_env import QuadDynamics 2 | from gym_models.envs.quad_gym_env_proj import QuadDynamicsProj 3 | -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/assets/clockwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sharma1256/cbf-constrained_ppo/HEAD/Pendulum/gym_models/envs/assets/clockwise.png -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym_models.envs.double_integrator import DoubleIntegrator 2 | from gym_models.envs.pendulum import InvertedPendulum 3 | -------------------------------------------------------------------------------- /gym_models/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='quad_gym_env', 5 | entry_point='gym_models.envs:QuadDynamics', 6 | ) 7 | 8 | register( 9 | id='proj_quad_gym_env', 10 | entry_point='gym_models.envs:QuadDynamicsProj', 11 | ) 12 | -------------------------------------------------------------------------------- /Pendulum/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='double_integrator-v0', 5 | entry_point='gym_models.envs:DoubleIntegrator', 6 | ) 7 | 8 | register( 9 | id='inverted_pendulum-v0', 10 | entry_point='gym_models.envs:InvertedPendulum', 11 | ) 12 | -------------------------------------------------------------------------------- /Pendulum/gym_models/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='double_integrator-v0', 5 | entry_point='gym_models.envs:DoubleIntegrator', 6 | ) 7 | 8 | register( 9 | id='inverted_pendulum-v0', 10 | entry_point='gym_models.envs:InvertedPendulum', 11 | ) 12 | -------------------------------------------------------------------------------- /Pendulum/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='gym_models', 4 | version='0.0.1', 5 | author="Krishna Chaitanya Kosaraju and Wesley Suttle", 6 | install_requires=['gym', 7 | 'numpy', 8 | 'matplotlib', 9 | 'torch', 10 | 'wesutils', 11 | 'scipy', 12 | 'ray', 13 | 'pandas', 14 | 'seaborn', 15 | 'stable_baselines3', 16 | ] 17 | ) 18 | -------------------------------------------------------------------------------- /experiments/sampling_beta/config.yml: -------------------------------------------------------------------------------- 1 | # config file for experiment comparing Beta and Gaussian policies 2 | 3 | env: 4 | dt: 0.1 5 | umin: [-10.0, -10.0] 6 | umax: [10.0, 10.0] 7 | 8 | 9 | experiment: 10 | n_episodes: 10000 11 | rollout_length: 180 12 | buffer_size: 180 13 | n_epochs: 10 14 | entropy_coefficient: 0.0 15 | weight_decay: 0.0 16 | T_reward: 1 17 | cbf: True 18 | enable_cuda: True 19 | max_steps: 1000 20 | 21 | 22 | beta: 23 | pi_lr: 0.0006 24 | v_lr: 0.0006 25 | v_units1: 256 26 | v_units2: 256 27 | pi_units1: 256 28 | pi_units2: 256 29 | -------------------------------------------------------------------------------- /Pendulum/config.yml: -------------------------------------------------------------------------------- 1 | # config file for experiment comparing Beta and Gaussian policies 2 | 3 | env: 4 | tau: 0.05 5 | theta_safety_bounds: [-1.0, 1.0] 6 | 7 | 8 | experiment: 9 | n_episodes: 300 10 | rollout_length: 2048 11 | n_replications: 5 12 | 13 | 14 | beta: 15 | torque_bounds: [-.inf, .inf] 16 | pi_lr: 0.01 17 | v_lr: 0.01 18 | v_units1: 64 19 | v_units2: 64 20 | pi_units1: 64 21 | pi_units2: 64 22 | 23 | 24 | gaussian: 25 | torque_bounds: [-15.0, 15.0] 26 | pi_lr: 0.0003 27 | v_lr: 0.0003 28 | v_units1: 64 29 | v_units2: 64 30 | pi_units1: 64 31 | pi_units2: 64 32 | -------------------------------------------------------------------------------- /experiments/projection_guassian/config.yml: -------------------------------------------------------------------------------- 1 | # config file for experiment comparing Beta and Gaussian policies 2 | 3 | env: 4 | dt: 0.1 5 | umin: [-15.0, -15.0] 6 | umax: [15.0, 15.0] 7 | 8 | 9 | experiment: 10 | n_episodes: 10000 11 | rollout_length: 320 12 | buffer_size: 320 13 | n_epochs: 10 14 | entropy_coefficient: 0.00000001 15 | weight_decay: 0.0 16 | T_reward: 1 17 | cbf: True 18 | enable_cuda: True 19 | max_steps: 1000 20 | 21 | 22 | gaussian: 23 | pi_lr: 0.0004 24 | v_lr: 0.0004 25 | v_units1: 256 26 | v_units2: 256 27 | pi_units1: 256 28 | pi_units2: 256 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='gym_models', 4 | version='0.0.1', 5 | install_requires=['gym', 6 | 'numpy', 7 | 'matplotlib', 8 | 'torch', 9 | 'wesutils', 10 | 'scipy', 11 | 'ray', 12 | 'pandas', 13 | 'seaborn', 14 | 'stable_baselines3', 15 | 'math', 16 | 'cvxpy', 17 | 'cvxopt', 18 | 'time', 19 | 'warnings', 20 | 'copy', 21 | 'scipy', 22 | 'csv', 23 | 'datetime' 24 | ] 25 | ) 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Control Barrier Function-constrained Proximal Policy Optimization 2 | 3 | This repository provides the framework used to conduct the experiments for our paper "Sampling-Based Safe Reinforcement Learning for Nonlinear Dynamical Systems", appearing in _Proceedings of the 27th International Conference on Artificial Intelligence and Statistics (AISTATS)_, 2024. The paper is available [here](https://arxiv.org/abs/2403.04007). 4 | 5 | Specifically, this repo contains the following: 6 | 1) Sampling based safety-constrained PPO 7 | 2) Constrained beta policy 8 | 3) Projection or Safety filter Benchmark 9 | 10 | A Beta policy, given in `ppo.py`, is constrained over the safe control set obtained from the `cbf` function defined in `quad_gym_env.py` that represents Control Barrier Function (CBF) based safety constraints,and, 11 | this policy is then updated using proximal policy optimization defined in `ppo.py`, which was adapted from [Stable Baselines3](https://stable-baselines3.readthedocs.io/en/master/). 12 | 13 | In addition, we created a benchmark using projection-based or safety-filter based safe RL policies in `ppo_proj.py` using the CBFs defined in `quad_gym_env_proj.py` to obtain safety constraints. This essentially leads to a projection based safe RL policy like that proposed in [Cheng et al., 2019](https://cdn.aaai.org/ojs/4213/4213-13-7267-1-10-20190705.pdf). 14 | 15 | Some of the dynamical components involved in our safe quadcopter gym environment are adapted from the repo: `https://github.com/hocherie/cbf_quadrotor` 16 | 17 | ### Usage 18 | 19 | 1) To install, first set up your preferred virtual environment, then do `pip install -e .` 20 | 2) For Quadcopter experiments: 21 | Go to `experiments` directory and select the experiment (e.g., `Testing-projection.py` or `Testing-beta-sampling.py`) that you wish to run 22 | 3) For Pendulum Experiments: 23 | Go to Pendulum directory and run `Testing-pendulum.py` 24 | 5) You'll see plots and rewards arrays being stored in the corresponding experiment folder 25 | 26 | ### Reference 27 | ## [Sampling-based safe reinforcement learning for nonlinear dynamical systems](https://proceedings.mlr.press/v238/suttle24a.html) 28 | 29 | `@inproceedings{suttle2024sampling, 30 | title={Sampling-based safe reinforcement learning for nonlinear dynamical systems}, 31 | author={Suttle, Wesley and Sharma, Vipul Kumar and Kosaraju, Krishna Chaitanya and Seetharaman, Sivaranjani and Liu, Ji and Gupta, Vijay and Sadler, Brian M}, 32 | booktitle={International Conference on Artificial Intelligence and Statistics}, 33 | pages={4420--4428}, 34 | year={2024}, 35 | organization={PMLR} 36 | }` 37 | -------------------------------------------------------------------------------- /Pendulum/Testing-pendulum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 24 20:16:49 2023 4 | 5 | @author: VIPUL 6 | """ 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import argparse 10 | from itertools import product 11 | import pickle 12 | import os 13 | import yaml 14 | import time 15 | import shutil 16 | import torch 17 | #import torch.distributions 18 | import wesutils 19 | #from quad_gym_env import QuadDynamics 20 | #from ppo import BetaPolicy 21 | import agents.ppo 22 | import gym_models 23 | import gym_models.envs.pendulum 24 | #from torch import distributions 25 | 26 | import pdb 27 | 28 | 29 | ### Hyperparameters... 30 | 31 | # ...for the agent 32 | n_episodes = 1000 33 | #rollout_length = 2048 34 | rollout_length = 2048 35 | buffer_size = rollout_length 36 | policy_lr = 0.0001 37 | value_lr = 0.0001 38 | layer_size = 512 39 | enable_cuda = False 40 | n_epochs = 10 41 | batch_size = 64 #modified from 256 42 | entropy_coef = 0.000001 43 | weight_decay = 0.0 44 | T=5 #storing reward per 100 episodes 45 | tau=0.05 46 | theta_safety_bounds=[-1.0, 1.0] 47 | beta_torque_bounds=[-15.0, 15.0] 48 | pi_units1=128 49 | pi_units2=128 50 | v_units1=64 51 | v_units2=64 52 | def train(): 53 | env = gym_models.envs.pendulum.InvertedPendulum( 54 | tau=tau, 55 | theta_safety_bounds=theta_safety_bounds, 56 | torque_bounds=beta_torque_bounds 57 | ) 58 | 59 | pi = agents.ppo.BetaPolicy( 60 | 3, env.cbf, 1, 61 | hidden_layer1_size=pi_units1, 62 | hidden_layer2_size=pi_units2 63 | ) 64 | v = wesutils.two_layer_net( 65 | 3, 1, v_units1, v_units2 66 | ) 67 | agent = agents.ppo.PPOBase( 68 | env, pi, v, 69 | policy_lr, value_lr, 70 | buffer_size=buffer_size, 71 | enable_cuda=enable_cuda, 72 | n_epochs=n_epochs, 73 | batch_size=batch_size, 74 | entropy_coef=entropy_coef, 75 | weight_decay=weight_decay, 76 | ) 77 | 78 | # train and collect data 79 | rewards, safety_rates = [], [] # TODO: get rid of safety_rates 80 | for i in range(n_episodes): 81 | reward, safety_rate = agent.collect_rollout(env, rollout_length) 82 | agent.train() 83 | rewards.append(reward) 84 | safety_rates.append(safety_rate) 85 | if i%T==0: 86 | np.save("rewards_sequence.npy", rewards) 87 | safety_rates.append(safety_rate) 88 | 89 | print(f'Episode {i} return: {reward:.2f}') # does this work? 90 | 91 | return {'rewards': rewards, 92 | 'safety_rates': safety_rates} 93 | 94 | 95 | if __name__ == '__main__': 96 | 97 | start_time = time.time() 98 | train() 99 | total_runtime = time.time() - start_time 100 | 101 | print(f'Total runtime: {total_runtime / 60:.1f}m') 102 | -------------------------------------------------------------------------------- /experiments/sampling_beta/Testing-beta-sampling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import time 6 | import wesutils 7 | from quad_gym_env import QuadDynamics 8 | from ppo import BetaPolicy 9 | import ppo 10 | 11 | from datetime import datetime 12 | 13 | # Get today's date as a datetime object 14 | today_date = datetime.today() 15 | 16 | # Convert the datetime object to a string 17 | today_date_str = today_date.strftime("%Y-%m-%d") 18 | 19 | ### Hyperparameters... 20 | n_episodes = 10000 #1000 21 | rollout_length = 180 22 | buffer_size = rollout_length 23 | policy_lr = 0.0006 24 | value_lr = 0.0006 25 | layer_size = 256 26 | enable_cuda = True 27 | n_epochs = 10 28 | batch_size = 256 29 | entropy_coef = 0.0 30 | weight_decay = 0.0 31 | T=1 32 | dt = 0.1 33 | max_steps = 1000 34 | umin = -10.0* np.array([1, 1]) 35 | umax = 10.0 * np.array([1, 1]) 36 | episode = 0 37 | cbf = True 38 | device_run = 'sampling' 39 | run = 1 40 | 41 | 42 | def train(): 43 | env = QuadDynamics( 44 | dt=dt, 45 | max_steps=max_steps, 46 | umax=umax, 47 | umin=umin, 48 | env_cbf = cbf, 49 | layer_size = layer_size, 50 | entropy = entropy_coef, 51 | lr = policy_lr, 52 | device_run = device_run, 53 | date = today_date_str, 54 | run = run 55 | ) 56 | 57 | pi = BetaPolicy( 58 | 10, env.cbf, 2, 59 | hidden_layer1_size=layer_size, 60 | hidden_layer2_size=layer_size, 61 | ) 62 | v = wesutils.two_layer_net( 63 | 10, 1, layer_size, layer_size 64 | ) 65 | agent = ppo.PPOBase( 66 | env, pi, v, 67 | policy_lr, value_lr, 68 | buffer_size=buffer_size, 69 | enable_cuda=enable_cuda, 70 | n_epochs=n_epochs, 71 | batch_size=batch_size, 72 | entropy_coef=entropy_coef, 73 | weight_decay=weight_decay, 74 | ) 75 | 76 | # train and collect data 77 | rewards, safety_rates = [], [] # TODO: get rid of safety_rates 78 | for i in range(n_episodes): 79 | env.episodes = i 80 | reward, safety_rate = agent.collect_rollout(env, rollout_length) 81 | agent.train() 82 | rewards.append(reward) 83 | safety_rates.append(safety_rate) 84 | if i%T==0: 85 | folder_name_main = f"{{{env.date}}}" 86 | os.makedirs(folder_name_main, exist_ok=True) 87 | ##Change the current working directory to the newly created folder 88 | os.chdir(folder_name_main) 89 | 90 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 91 | os.makedirs(folder_name, exist_ok=True) 92 | ##Change the current working directory to the newly created folder 93 | os.chdir(folder_name) 94 | np.save(f"lr={policy_lr}_ent={entropy_coef}_lyr=batch={layer_size}_roll={rollout_length}.npy", rewards) 95 | os.chdir('..') 96 | os.chdir('..') 97 | safety_rates.append(safety_rate) 98 | 99 | print(f'Episode {i} return: {reward:.2f}') 100 | 101 | return {'rewards': rewards, 102 | 'safety_rates': safety_rates} 103 | 104 | 105 | if __name__ == '__main__': 106 | 107 | start_time = time.time() 108 | train() 109 | total_runtime = time.time() - start_time 110 | 111 | print(f'Total runtime: {total_runtime / 60:.1f}m') 112 | -------------------------------------------------------------------------------- /experiments/projection_guassian/Testing-projection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import os 4 | import time 5 | import wesutils 6 | from quad_gym_env_proj import QuadDynamicsProj 7 | from ppo_proj import GaussianPolicy 8 | import ppo_proj 9 | from datetime import datetime 10 | 11 | # Get today's date as a datetime object 12 | today_date = datetime.today() 13 | 14 | # Convert the datetime object to a string 15 | today_date_str = today_date.strftime("%Y-%m-%d") 16 | 17 | # Hyperparameters... 18 | n_episodes = 10000 #1000 19 | rollout_length = 320 20 | buffer_size = rollout_length 21 | policy_lr = 0.0004 22 | value_lr = 0.0004 23 | layer_size = 256 24 | enable_cuda = False 25 | n_epochs = 10 26 | batch_size = 256 #modified from 256 27 | entropy_coef = 0.00000001 28 | weight_decay = 0.0 29 | T=1 30 | # ...for the environment 31 | dt = 0.1 32 | max_steps = 1000 33 | umin = -15.0 * np.array([1, 1]) 34 | umax = 15.0 * np.array([1, 1]) 35 | episode = 0 36 | cbf = True 37 | device_run = 'projection' 38 | run = 1 39 | 40 | def train(): 41 | env = QuadDynamicsProj( 42 | dt=dt, 43 | max_steps=max_steps, 44 | umax=umax, 45 | umin=umin, 46 | env_cbf = cbf, 47 | layer_size = layer_size, 48 | entropy = entropy_coef, 49 | lr = policy_lr, 50 | device_run = device_run, 51 | date = today_date_str, 52 | run = run 53 | ) 54 | pi = GaussianPolicy( 55 | 10, 2, umin, umax, 56 | hidden_layer1_size=layer_size, 57 | hidden_layer2_size=layer_size, 58 | ) 59 | v = wesutils.two_layer_net( 60 | 10, 1, layer_size, layer_size 61 | ) 62 | agent = ppo_proj.PPOBase( 63 | env, pi, v, 64 | policy_lr, value_lr, 65 | buffer_size=buffer_size, 66 | enable_cuda=enable_cuda, 67 | n_epochs=n_epochs, 68 | batch_size=batch_size, 69 | entropy_coef=entropy_coef, 70 | weight_decay=weight_decay, 71 | ) 72 | # train and collect data 73 | rewards, safety_rates = [], [] # TODO: get rid of safety_rates 74 | for i in range(n_episodes): 75 | env.episodes = i 76 | reward, safety_rate = agent.collect_rollout(env, rollout_length) 77 | agent.train() 78 | rewards.append(reward) 79 | safety_rates.append(safety_rate) 80 | if i%T==0: 81 | folder_name_main = f"{{{env.date}}}" 82 | os.makedirs(folder_name_main, exist_ok=True) 83 | ##Change the current working directory to the newly created folder 84 | os.chdir(folder_name_main) 85 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 86 | os.makedirs(folder_name, exist_ok=True) 87 | ##Change the current working directory to the newly created folder 88 | os.chdir(folder_name) 89 | np.save(f"lr={policy_lr}_ent={entropy_coef}_lyr=batch={layer_size}_roll={rollout_length}.npy", rewards) 90 | os.chdir('..') 91 | os.chdir('..') 92 | safety_rates.append(safety_rate) 93 | print(f'Episode {i} return: {reward:.2f}') # does this work? 94 | return {'rewards': rewards, 95 | 'safety_rates': safety_rates} 96 | 97 | if __name__ == '__main__': 98 | 99 | start_time = time.time() 100 | train() 101 | total_runtime = time.time() - start_time 102 | 103 | print(f'Total runtime: {total_runtime / 60:.1f}m') 104 | -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/double_integrator.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | 7 | 8 | 9 | class DoubleIntegrator(gym.Env): 10 | """ 11 | Description: 12 | """ 13 | 14 | 15 | metadata = {'render.modes': ['console']} 16 | 17 | def __init__(self, 18 | tau: float = 1e-2, 19 | initial_state: Union[float, float] = [9.1, 1], 20 | safety_bounds: Union[float, float] = [9., 11.], 21 | x1_des: float = [10.], 22 | max_steps: int = 1000 23 | ): 24 | super(DoubleIntegrator, self).__init__() 25 | 26 | self._tau = tau 27 | self.initial_state = initial_state 28 | self.safety_bounds = safety_bounds 29 | self.x1_des = x1_des 30 | self.action_space = spaces.Box(-np.inf, np.inf, shape=(1,), dtype=np.float64) 31 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=(2,1), dtype=np.float64) 32 | self._state = np.array(self.initial_state) 33 | self.max_steps = max_steps 34 | self.count = 0 35 | self.seed() 36 | 37 | @property 38 | def tau(self): 39 | return self._tau 40 | 41 | @tau.setter 42 | def tau(self, value: float): 43 | if value>1e-1: 44 | print("discretizing time is too high, consider reducing for better results") 45 | self._tau = value 46 | 47 | @property 48 | def state(self): 49 | return self._state.reshape(2,1) 50 | 51 | def seed(self, seed=None): 52 | # not used 53 | self.np_random, seed = seeding.np_random(seed) 54 | return [seed] 55 | 56 | def reward(self): 57 | x1 = self._state[0] 58 | return (-0.5) * ((x1 - self.x1_des) ** 2) 59 | 60 | def step(self, action: float): 61 | self.count += 1 62 | x1, x2 = self._state[0], self._state[1] 63 | x1_new = x1 + (self._tau * x2) + (0.5 * (self._tau ** 2)) * action 64 | x2_new = x2 + (self._tau) * action 65 | self._state = np.array([x1_new, x2_new], dtype=float) 66 | 67 | x1_min, x1_max = self.safety_bounds[0], self.safety_bounds[1] 68 | done = False 69 | if (x1_new < x1_min) or (x1_new > x1_max) or (self.count>self.max_steps): 70 | done = True 71 | return self.state, self.reward(), done, {} 72 | 73 | def reset(self): 74 | self._state = np.array(self.initial_state) 75 | self.count = 0 76 | return self.state 77 | 78 | def render(self, mode='console'): 79 | if mode != 'console': 80 | raise NotImplementedError() 81 | print("not implemented") 82 | 83 | def close(self): 84 | pass 85 | 86 | def cbf(self, state=None, eta: float = 0.5): 87 | """ 88 | Calculates CBF constraint set at a given state. Default is 89 | the current state. 90 | """ 91 | 92 | state = state if state is not None else self._state 93 | 94 | if (eta>1-1e-3) or (eta<1e-5): 95 | raise ValueError("eta should be inside (0, 1)") 96 | x1, x2 = state[0], state[1] 97 | x1_min, x1_max = self.safety_bounds[0], self.safety_bounds[1] 98 | u_min = (2 / (self._tau ** 2) ) * ( - (self._tau * x2) - eta * (x1 - x1_min)) 99 | u_max = (2 / (self._tau ** 2) ) * ( - (self._tau * x2) + eta * (x1_max - x1)) 100 | if u_min>u_max: 101 | raise ValueError("Infeasible") 102 | else: 103 | return [u_min, u_max] 104 | -------------------------------------------------------------------------------- /Pendulum/agents/TruncatedNormal.py: -------------------------------------------------------------------------------- 1 | import math 2 | from numbers import Number 3 | 4 | import torch 5 | from torch.distributions import Distribution, constraints 6 | from torch.distributions.utils import broadcast_all 7 | 8 | CONST_SQRT_2 = math.sqrt(2) 9 | CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi) 10 | CONST_INV_SQRT_2 = 1 / math.sqrt(2) 11 | CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI) 12 | CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e) 13 | 14 | 15 | class TruncatedStandardNormal(Distribution): 16 | """ 17 | Truncated Standard Normal distribution 18 | https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 19 | """ 20 | 21 | arg_constraints = { 22 | 'a': constraints.real, 23 | 'b': constraints.real, 24 | } 25 | has_rsample = True 26 | 27 | def __init__(self, a, b, validate_args=None): 28 | self.a, self.b = broadcast_all(a, b) 29 | if isinstance(a, Number) and isinstance(b, Number): 30 | batch_shape = torch.Size() 31 | else: 32 | batch_shape = self.a.size() 33 | super(TruncatedStandardNormal, self).__init__(batch_shape, validate_args=validate_args) 34 | if self.a.dtype != self.b.dtype: 35 | raise ValueError('Truncation bounds types are different') 36 | if any((self.a >= self.b).view(-1,).tolist()): 37 | raise ValueError('Incorrect truncation range') 38 | eps = torch.finfo(self.a.dtype).eps 39 | self._dtype_min_gt_0 = eps 40 | self._dtype_max_lt_1 = 1 - eps 41 | self._little_phi_a = self._little_phi(self.a) 42 | self._little_phi_b = self._little_phi(self.b) 43 | self._big_phi_a = self._big_phi(self.a) 44 | self._big_phi_b = self._big_phi(self.b) 45 | self._Z = (self._big_phi_b - self._big_phi_a).clamp_min(eps) 46 | self._log_Z = self._Z.log() 47 | little_phi_coeff_a = torch.nan_to_num(self.a, nan=math.nan) 48 | little_phi_coeff_b = torch.nan_to_num(self.b, nan=math.nan) 49 | self._lpbb_m_lpaa_d_Z = (self._little_phi_b * little_phi_coeff_b - self._little_phi_a * little_phi_coeff_a) / self._Z 50 | self._mean = -(self._little_phi_b - self._little_phi_a) / self._Z 51 | self._variance = 1 - self._lpbb_m_lpaa_d_Z - ((self._little_phi_b - self._little_phi_a) / self._Z) ** 2 52 | self._entropy = CONST_LOG_SQRT_2PI_E + self._log_Z - 0.5 * self._lpbb_m_lpaa_d_Z 53 | 54 | @constraints.dependent_property 55 | def support(self): 56 | return constraints.interval(self.a, self.b) 57 | 58 | @property 59 | def mean(self): 60 | return self._mean 61 | 62 | @property 63 | def variance(self): 64 | return self._variance 65 | 66 | @property 67 | def entropy(self): 68 | return self._entropy 69 | 70 | @property 71 | def auc(self): 72 | return self._Z 73 | 74 | @staticmethod 75 | def _little_phi(x): 76 | return (-(x ** 2) * 0.5).exp() * CONST_INV_SQRT_2PI 77 | 78 | @staticmethod 79 | def _big_phi(x): 80 | return 0.5 * (1 + (x * CONST_INV_SQRT_2).erf()) 81 | 82 | @staticmethod 83 | def _inv_big_phi(x): 84 | return CONST_SQRT_2 * (2 * x - 1).erfinv() 85 | 86 | def cdf(self, value): 87 | if self._validate_args: 88 | self._validate_sample(value) 89 | return ((self._big_phi(value) - self._big_phi_a) / self._Z).clamp(0, 1) 90 | 91 | def icdf(self, value): 92 | return self._inv_big_phi(self._big_phi_a + value * self._Z) 93 | 94 | def log_prob(self, value): 95 | if self._validate_args: 96 | self._validate_sample(value) 97 | return CONST_LOG_INV_SQRT_2PI - self._log_Z - (value ** 2) * 0.5 98 | 99 | def rsample(self, sample_shape=torch.Size()): 100 | shape = self._extended_shape(sample_shape) 101 | p = torch.empty(shape, device=self.a.device).uniform_(self._dtype_min_gt_0, self._dtype_max_lt_1) 102 | return self.icdf(p) 103 | 104 | 105 | class TruncatedNormal(TruncatedStandardNormal): 106 | """ 107 | Truncated Normal distribution 108 | https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 109 | """ 110 | 111 | has_rsample = True 112 | 113 | def __init__(self, loc, scale, a, b, validate_args=None): 114 | self.loc, self.scale, a, b = broadcast_all(loc, scale, a, b) 115 | a = (a - self.loc) / self.scale 116 | b = (b - self.loc) / self.scale 117 | super(TruncatedNormal, self).__init__(a, b, validate_args=validate_args) 118 | self._log_scale = self.scale.log() 119 | self._mean = self._mean * self.scale + self.loc 120 | self._variance = self._variance * self.scale ** 2 121 | self._entropy += self._log_scale 122 | 123 | def _to_std_rv(self, value): 124 | return (value - self.loc) / self.scale 125 | 126 | def _from_std_rv(self, value): 127 | return value * self.scale + self.loc 128 | 129 | def cdf(self, value): 130 | return super(TruncatedNormal, self).cdf(self._to_std_rv(value)) 131 | 132 | def icdf(self, value): 133 | return self._from_std_rv(super(TruncatedNormal, self).icdf(value)) 134 | 135 | def log_prob(self, value): 136 | return super(TruncatedNormal, self).log_prob(self._to_std_rv(value)) - self._log_scale 137 | -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/pendulum.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | from os import path 7 | from gym.envs.classic_control import rendering 8 | 9 | 10 | 11 | class InvertedPendulum(gym.Env): 12 | """ 13 | Description: 14 | """ 15 | 16 | 17 | # metadata = {'render.modes': ['console']} 18 | metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 30} 19 | 20 | def __init__(self, 21 | tau: float = 5e-2, 22 | m: float = 1, 23 | g: float = 9.8, 24 | l: float = 1, 25 | initial_state: Union[float, float] = [0.0, 0.], 26 | theta_safety_bounds: Union[float, float] = [-1.0, 1.0], 27 | thetadot_safety_bounds: Union[float, float] = [-np.inf, np.inf], 28 | theta_des: float = [0.], 29 | torque_bounds: Union[float, float] = [-15., 15.], 30 | max_steps: int = 200 31 | ): 32 | super(InvertedPendulum, self).__init__() 33 | 34 | self._tau = tau 35 | self.g = g 36 | self.l = l 37 | self.m = m 38 | self.torque_bounds = torque_bounds 39 | self.initial_state = initial_state 40 | self.theta_safety_bounds = theta_safety_bounds 41 | self.thetadot_safety_bounds = thetadot_safety_bounds 42 | self.x1_des = 0. 43 | self.action_space = spaces.Box(self.torque_bounds[0], self.torque_bounds[1], shape=(1,), dtype=np.float64) 44 | high = np.array([1.0, 1.0, 8.0]) 45 | self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32) 46 | # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float64) 47 | # self._state = np.array(self.initial_state) 48 | self.seed() 49 | self.max_steps = max_steps 50 | self.count = 0 51 | self.action_dim=1 52 | 53 | # rendering stuff 54 | self.viewer = None 55 | 56 | 57 | @property 58 | def tau(self): 59 | return self._tau 60 | @tau.setter 61 | def tau(self, value: float): 62 | if value>1e-1: 63 | print("discretizing time is too high, consider reducing for better results") 64 | self._tau = value 65 | 66 | @property 67 | def state(self): 68 | # return self._state 69 | return self._get_obs() 70 | 71 | def seed(self, seed=None): 72 | # not used 73 | self.np_random, seed = seeding.np_random(seed) 74 | return [seed] 75 | 76 | ### New reward 77 | def _angle_normalize(self, x): 78 | return ((x + np.pi) % (2 * np.pi)) - np.pi 79 | 80 | def reward(self, action): 81 | cost = self._angle_normalize(self._state[0])**2 + 0.1 * self._state[1]**2 \ 82 | + 0.001 * (action ** 2) 83 | return -float(cost) 84 | ### end new reward 85 | 86 | def _get_obs(self): 87 | theta, thetadot = self._state 88 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 89 | 90 | def step(self, action: float): 91 | self.count += 1 92 | action = np.clip(action, self.torque_bounds[0], self.torque_bounds[1]) 93 | self.last_u = action # for use in rendering 94 | c1 = ((3 * self.g)/(2 * self.l)) 95 | c2 = (3 /(self.m * (self.l ** 2))) 96 | theta, thetadot = self._state[0], self._state[1] 97 | theta_new = theta + (self._tau * thetadot) + (self._tau ** 2) * ( c1 * np.sin(theta) + c2 * action) 98 | thetadot_new = thetadot + (self._tau) * ( c1 * np.sin(theta) + c2 * action) 99 | self._state = np.array([theta_new, thetadot_new], dtype=float) 100 | 101 | theta_min, theta_max = self.theta_safety_bounds[0], self.theta_safety_bounds[1] 102 | thetadot_min, thetadot_max = self.thetadot_safety_bounds 103 | done = False 104 | 105 | # if (theta_new < theta_min) or \ 106 | # (theta_new > theta_max) or \ 107 | # (thetadot_new < thetadot_min) or \ 108 | # (thetadot_new > thetadot_max) or \ 109 | # (self.count > self.max_steps): 110 | # done = True 111 | 112 | if self.count > self.max_steps: 113 | done = True 114 | 115 | return self._get_obs().flatten(), self.reward(action), done, {} 116 | 117 | ### OLD 118 | # return self._state, self.reward(action), done, {} 119 | 120 | ### OLD 121 | # def reset(self): 122 | # self._state = np.array(self.initial_state) 123 | # self.count = 0 124 | # return self._state) 125 | 126 | def reset(self): 127 | self._state = np.array(self.initial_state) 128 | # self._state = np.random.uniform(-np.pi, np.pi, size=(2,)) 129 | self.count = 0 130 | return self._get_obs() 131 | 132 | def render(self, mode="human"): 133 | if self.viewer is None: 134 | self.viewer = rendering.Viewer(500, 500) 135 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2) 136 | rod = rendering.make_capsule(1, 0.2) 137 | rod.set_color(0.8, 0.3, 0.3) 138 | self.pole_transform = rendering.Transform() 139 | rod.add_attr(self.pole_transform) 140 | self.viewer.add_geom(rod) 141 | axle = rendering.make_circle(0.05) 142 | axle.set_color(0, 0, 0) 143 | self.viewer.add_geom(axle) 144 | fname = path.join(path.dirname(__file__), "assets/clockwise.png") 145 | self.img = rendering.Image(fname, 1.0, 1.0) 146 | self.imgtrans = rendering.Transform() 147 | self.img.add_attr(self.imgtrans) 148 | 149 | self.viewer.add_onetime(self.img) 150 | self.pole_transform.set_rotation(self._state[0] + np.pi / 2) 151 | if self.last_u: 152 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2) 153 | 154 | return self.viewer.render(return_rgb_array=mode == "rgb_array") 155 | 156 | def close(self): 157 | if self.viewer: 158 | self.viewer.close() 159 | self.viewer = None 160 | 161 | ### OLD 162 | # def render(self, mode='console'): 163 | # if mode != 'console'metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 30}: 164 | # raise NotImplementedError() 165 | # print("not implemented") 166 | 167 | # def close(self): 168 | # pass 169 | 170 | def cbf(self, state=None, eta: float = 0.99): 171 | """ 172 | Calculates CBF constraint set at a given state. Default is 173 | the current state. 174 | """ 175 | 176 | state = state if state is not None else self._state 177 | 178 | if (eta>1-1e-3) or (eta<1e-5): 179 | raise ValueError("eta should be inside (0, 1)") 180 | c1 = ((3 * self.g)/(2 * self.l)) 181 | c2 = (3 /(self.m * (self.l ** 2))) 182 | #theta, thetadot = np.arcsin(obs[i][1]), obs[i][2] 183 | #theta, thetadot = np.arcsin(state[1]), state[2] 184 | #the above line can replace line 185 185 | theta, thetadot = state[0], state[1] 186 | theta_min, theta_max = self.theta_safety_bounds[0], self.theta_safety_bounds[1] 187 | thetadot_min, thetadot_max = self.thetadot_safety_bounds[0], self.thetadot_safety_bounds[1] 188 | u_min1 = (1/c2) * (((1 / (self._tau **2)) * (-eta * (theta - theta_min) - self._tau * thetadot)) - c1 * np.sin(theta) ) 189 | u_max1 = (1/c2) * (((1 / (self._tau **2)) * ( eta * (theta_max - theta) - self._tau * thetadot)) - c1 * np.sin(theta) ) 190 | 191 | 192 | u_min2 = (1/c2) * (((1 / (self._tau)) * (-eta * (thetadot - thetadot_min))) - c1 * np.sin(theta) ) 193 | u_max2 = (1/c2) * (((1 / (self._tau)) * ( eta * (thetadot_max - thetadot))) - c1 * np.sin(theta) ) 194 | 195 | u_min = max(u_min1, u_min2, self.torque_bounds[0]) 196 | u_max = min(u_max1, u_max2, self.torque_bounds[1]) 197 | 198 | u_min=self.torque_bounds[0] 199 | u_max=self.torque_bounds[1] 200 | if u_min>u_max: 201 | raise ValueError("Infeasible") 202 | else: 203 | return [u_min, u_max] -------------------------------------------------------------------------------- /experiments/projection_guassian/ppo_proj.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributions as td 7 | from torch.nn import functional as F 8 | import gym 9 | from gym import spaces 10 | import numpy as np 11 | from typing import NamedTuple 12 | import warnings 13 | from matplotlib import pyplot as plt 14 | import os 15 | import csv 16 | 17 | from stable_baselines3.common.utils import obs_as_tensor 18 | from stable_baselines3.common.preprocessing import ( 19 | get_obs_shape, get_action_dim 20 | ) 21 | 22 | 23 | class RolloutBufferSamples(NamedTuple): 24 | observations: torch.Tensor 25 | actions: torch.Tensor 26 | old_values: torch.Tensor 27 | old_log_prob: torch.Tensor 28 | advantages: torch.Tensor 29 | returns: torch.Tensor 30 | 31 | 32 | class RolloutBuffer: 33 | 34 | def __init__(self, 35 | buffer_size, 36 | observation_space, 37 | action_space, 38 | gamma=0.90, 39 | device='cpu'): 40 | 41 | self.buffer_size = buffer_size 42 | self.observation_space = observation_space 43 | self.action_space = action_space 44 | self.gamma = gamma 45 | self.device = device 46 | self.obs_shape = get_obs_shape(self.observation_space) 47 | self.action_dim = get_action_dim(self.action_space) 48 | 49 | self.reset() 50 | 51 | def reset(self): 52 | 53 | self.observations = np.zeros( 54 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 55 | ) 56 | self.actions = np.zeros( 57 | (self.buffer_size, self.action_dim), dtype=np.float32 58 | ) 59 | self.rewards = np.zeros( 60 | (self.buffer_size,), dtype=np.float32 61 | ) 62 | self.episode_starts = np.zeros( 63 | (self.buffer_size,), dtype=np.float32 64 | ) 65 | self.values = np.zeros( 66 | (self.buffer_size,), dtype=np.float32 67 | ) 68 | self.log_probs = np.zeros( 69 | (self.buffer_size,), dtype=np.float32 70 | ) 71 | self.advantages = np.zeros( 72 | (self.buffer_size,), dtype=np.float32 73 | ) 74 | 75 | self.full = False 76 | self.pos = 0 77 | 78 | def compute_returns_and_advantage(self, last_value, done): 79 | 80 | last_value = last_value.clone().cpu().numpy().flatten() 81 | 82 | discounted_reward = 0 83 | for step in reversed(range(self.buffer_size)): 84 | if step == self.buffer_size - 1: 85 | next_non_terminal = 1.0 - done 86 | next_value = last_value 87 | else: 88 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 89 | next_value = self.values[step + 1] 90 | discounted_reward = self.rewards[step] + \ 91 | self.gamma * discounted_reward * next_non_terminal 92 | self.advantages[step] = discounted_reward - self.values[step] 93 | self.returns = self.advantages + self.values 94 | 95 | def add(self, obs, action, reward, episode_start, value, log_prob): 96 | 97 | if len(log_prob.shape) == 0: 98 | log_prob = log_prob.reshape(-1, 1) 99 | 100 | if isinstance(self.observation_space, spaces.Discrete): 101 | obs = obs.reshape((1,) + self.obs_shape) 102 | 103 | self.observations[self.pos] = np.array(obs).copy() 104 | self.actions[self.pos] = np.array(action).copy() 105 | self.rewards[self.pos] = np.array(reward).copy() 106 | self.episode_starts[self.pos] = np.array(episode_start).copy() 107 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 108 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 109 | self.pos += 1 110 | if self.pos == self.buffer_size: 111 | self.full = True 112 | 113 | def get(self, batch_size=None): 114 | assert self.full, "" 115 | indices = np.random.permutation(self.buffer_size) 116 | 117 | # Return everything, don't create minibatches 118 | if batch_size is None: 119 | batch_size = self.buffer_size 120 | 121 | start_idx = 0 122 | while start_idx < self.buffer_size: 123 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 124 | start_idx += batch_size 125 | 126 | def _get_samples(self, batch_inds): 127 | data = ( 128 | self.observations[batch_inds], 129 | self.actions[batch_inds], 130 | self.values[batch_inds].flatten(), 131 | self.log_probs[batch_inds].flatten(), 132 | self.advantages[batch_inds].flatten(), 133 | self.returns[batch_inds].flatten(), 134 | ) 135 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 136 | 137 | def to_torch(self, array, copy=True): 138 | if copy: 139 | return torch.tensor(array).to(self.device) 140 | return torch.as_tensor(array).to(self.device) 141 | 142 | 143 | class PolicyNetwork(nn.Module): 144 | """Base class for stochastic policy networks.""" 145 | 146 | def __init__(self): 147 | super().__init__() 148 | 149 | def forward(self, state): 150 | """Take state as input, then output the parameters of the policy.""" 151 | 152 | raise NotImplemented("forward not implemented.") 153 | 154 | def sample(self, state): 155 | """ 156 | Sample an action based on the model parameters given the current state. 157 | """ 158 | 159 | raise NotImplemented("sample not implemented.") 160 | 161 | def log_probs(self, obs, actions): 162 | """ 163 | Return log probabilities for each state-action pair. 164 | """ 165 | 166 | raise NotImplemented("log_probs not implemented.") 167 | 168 | def entropy(self, obs): 169 | """ 170 | Return entropy of the policy for each state. 171 | """ 172 | 173 | raise NotImplemented("entropy not implemented.") 174 | 175 | 176 | class GaussianPolicyBase(PolicyNetwork): 177 | """ 178 | Base class for Gaussian policy. 179 | 180 | Desired network needs to be implemented. 181 | """ 182 | 183 | def __init__(self, action_dim): 184 | 185 | super().__init__() 186 | 187 | self.action_dim = action_dim 188 | 189 | def _get_covs(self, log_stds): 190 | batch_size = log_stds.shape[0] 191 | stds = log_stds.exp().reshape(batch_size, 1, 1) 192 | covs = stds * torch.eye(self.action_dim).repeat(batch_size, 1, 1) 193 | return covs 194 | 195 | def sample(self, obs, no_log_prob=False): 196 | mean, log_std = self.forward(obs) 197 | cov = log_std.exp() * torch.eye(self.action_dim) 198 | dist = td.MultivariateNormal(mean, cov) 199 | action = dist.rsample() 200 | return action if no_log_prob else (action, dist.log_prob(action)) 201 | 202 | def log_probs(self, obs, actions): 203 | means, log_stds = self.forward(obs) 204 | covs = self._get_covs(log_stds) 205 | dists = td.MultivariateNormal(means, covs) 206 | return dists.log_prob(actions) 207 | 208 | def entropy(self, obs): 209 | means, log_stds = self.forward(obs) 210 | covs = self._get_covs(log_stds) 211 | dists = td.MultivariateNormal(means, covs) 212 | return dists.entropy() 213 | 214 | 215 | class GaussianPolicy(GaussianPolicyBase): 216 | """ 217 | Gaussian policy using a two-layer, two-headed MLP with ReLU activation. 218 | """ 219 | 220 | def __init__(self, obs_dim, action_dim, 221 | min_action_val=-20.0 * np.array([1, 1]), 222 | max_action_val=20.0 * np.array([1, 1]), 223 | hidden_layer1_size=64, 224 | hidden_layer2_size=64): 225 | 226 | super().__init__(action_dim) 227 | 228 | self.base_net = nn.Sequential( 229 | nn.Linear(obs_dim, hidden_layer1_size), 230 | nn.ReLU(), 231 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 232 | nn.ReLU(), 233 | ) 234 | 235 | self.mean_net = nn.Sequential( 236 | nn.Linear(hidden_layer2_size, action_dim), 237 | nn.Hardtanh(min_action_val[0], max_action_val[0]), 238 | nn.Hardtanh(min_action_val[1], max_action_val[1]) 239 | ) 240 | 241 | self.log_std_net = nn.Sequential( 242 | nn.Linear(hidden_layer2_size, 1), 243 | ) 244 | 245 | def forward(self, obs): 246 | x = self.base_net(obs) 247 | mean = self.mean_net(x) 248 | log_std = self.log_std_net(x) 249 | return mean, log_std 250 | 251 | 252 | class PPOBase: 253 | def __init__(self, 254 | env, 255 | policy, 256 | value_function, 257 | policy_lr, 258 | value_lr, 259 | entropy_coef=0.0, 260 | clip_range=0.2, 261 | n_epochs=10, 262 | batch_size=64, 263 | weight_decay=0.0, 264 | gamma=0.99, 265 | buffer_size=2048, 266 | enable_cuda=True, 267 | policy_optimizer=torch.optim.Adam, 268 | value_optimizer=torch.optim.Adam, 269 | grad_clip_radius=None): 270 | 271 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 272 | 'returning information about CBF-related safety.') 273 | 274 | self.env = env 275 | self.pi = policy 276 | self.v = value_function 277 | self.entropy_coef = entropy_coef 278 | self.clip_range = clip_range 279 | self.n_epochs = n_epochs 280 | self.batch_size = batch_size 281 | 282 | self.__cuda_enabled = enable_cuda 283 | self.enable_cuda(self.__cuda_enabled, warn=False) 284 | # NOTE: self.device is defined when self.enable_cuda is called! 285 | 286 | self.pi_optim = policy_optimizer(self.pi.parameters(), 287 | lr=policy_lr, 288 | weight_decay=weight_decay) 289 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 290 | self.grad_clip_radius = grad_clip_radius 291 | 292 | self.rollout_buffer = RolloutBuffer( 293 | buffer_size, 294 | env.observation_space, 295 | env.action_space, 296 | device=self.device, 297 | gamma=gamma 298 | ) 299 | 300 | @property 301 | def cuda_enabled(self): 302 | return self.__cuda_enabled 303 | 304 | def enable_cuda(self, enable_cuda=True, warn=True): 305 | """Enable or disable cuda and update models.""" 306 | 307 | if warn: 308 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 309 | "initializing optimizers can give errors when using " 310 | "optimizers other than SGD or Adam!") 311 | 312 | self.__cuda_enabled = enable_cuda 313 | self.device = torch.device( 314 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 315 | else 'cpu') 316 | self.pi.to(self.device) 317 | self.v.to(self.device) 318 | 319 | def load_models(self, filename, enable_cuda=True, continue_training=True): 320 | """ 321 | Load policy and value functions. Copy them to target functions. 322 | 323 | This method is for evaluation only. Use load_checkpoint to continue 324 | training. 325 | """ 326 | 327 | models = torch.load(filename) 328 | 329 | self.pi.load_state_dict(models['pi_state_dict']) 330 | self.v.load_state_dict(models['v_state_dict']) 331 | 332 | self.pi.eval() 333 | self.v.eval() 334 | 335 | self.enable_cuda(enable_cuda, warn=False) 336 | 337 | def save_checkpoint(self, filename): 338 | """Save state_dicts of models and optimizers.""" 339 | 340 | torch.save({ 341 | 'using_cuda': self.__cuda_enabled, 342 | 'pi_state_dict': self.pi.state_dict(), 343 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 344 | 'v_state_dict': self.v.state_dict(), 345 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 346 | }, filename) 347 | 348 | def load_checkpoint(self, filename, continue_training=True): 349 | """Load state_dicts for models and optimizers.""" 350 | 351 | checkpoint = torch.load(filename) 352 | 353 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 354 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 355 | self.v.load_state_dict(models['v_state_dict']) 356 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 357 | 358 | if continue_training: 359 | self.pi.train() 360 | self.v.train() 361 | else: 362 | self.pi.eval() 363 | self.v.eval() 364 | 365 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 366 | 367 | def collect_rollout(self, env, rollout_length): 368 | """ 369 | Perform a rollout and fill the rollout buffer. 370 | """ 371 | 372 | self._last_obs = env.reset() 373 | self._last_episode_start = np.zeros(1) 374 | n_steps = 0 375 | self.rollout_buffer.reset() 376 | 377 | num_unsafe_steps = 0 378 | x_t=[] 379 | y_t=[] 380 | 381 | local_flag_done = False 382 | while n_steps < rollout_length: 383 | action_dim=get_action_dim(env.action_space) 384 | 385 | 386 | with torch.no_grad(): 387 | # Convert to pytorch tensor or to TensorDict 388 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 389 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 390 | value = self.v(obs_tensor) 391 | action = action.cpu().numpy() 392 | 393 | # Rescale and perform action 394 | clipped_action = action 395 | # Clip the actions to avoid out of bound error 396 | if isinstance(self.env.action_space, gym.spaces.Box): 397 | clipped_action = np.clip(action, self.env.action_space.low, 398 | self.env.action_space.high) 399 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 400 | clipped_action = int(clipped_action) 401 | 402 | new_obs, reward, done, info = env.step(clipped_action) 403 | 404 | 405 | if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 406 | print("crash") 407 | 408 | n_steps += 1 409 | 410 | if isinstance(self.env.action_space, gym.spaces.Discrete): 411 | # Reshape in case of discrete action 412 | action = action.reshape(-1, 1) 413 | 414 | self.rollout_buffer.add(self._last_obs, action, reward, 415 | self._last_episode_start, value, log_prob) 416 | self._last_obs = new_obs.flatten() 417 | self._last_episode_start = done 418 | 419 | if done == 0 and local_flag_done == False: 420 | x_t.append(new_obs[0]) 421 | y_t.append(new_obs[1]) 422 | 423 | elif done == 1 and n_steps>1: 424 | local_flag_done = True 425 | 426 | plt.xlim(np.double(env.min_x),np.double(env.max_x)) 427 | plt.ylim(np.double(env.min_y),np.double(env.max_y)) 428 | plt.xlabel('X axis') 429 | plt.ylabel('Y-axis') 430 | plt.plot(x_t,y_t) 431 | plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 432 | plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 433 | 434 | def f(x, y, xa, yb, a, b): 435 | return (x - xa)**4/a**4 + (y - yb)**4/b**4 436 | 437 | # Define the point around which to plot 438 | xa, yb = env.obstacle[0], env.obstacle[1] 439 | 440 | # Define the range of x and y values to plot 441 | x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 442 | y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 443 | 444 | # Create a grid of x and y values 445 | X, Y = np.meshgrid(x_vals, y_vals) 446 | 447 | # Evaluate the function at each point in the grid 448 | Z = f(X, Y, xa, yb, env.a_d, env.b_d) 449 | 450 | # Plot the function as a contour plot 451 | 452 | 453 | folder_name_main = f"{{{env.date}}}" 454 | os.makedirs(folder_name_main, exist_ok=True) 455 | ##Change the current working directory to the newly created folder 456 | os.chdir(folder_name_main) 457 | 458 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 459 | os.makedirs(folder_name, exist_ok=True) 460 | ##Change the current working directory to the newly created folder 461 | os.chdir(folder_name) 462 | 463 | 464 | folder_name_1 = f"{{lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}}}" 465 | os.makedirs(folder_name_1, exist_ok=True) 466 | os.chdir(folder_name_1) 467 | 468 | if (env.episodes)%1 == 0: 469 | plt.savefig(f"ep={env.episodes}.png") 470 | with open(f"episode={env.episodes}.csv", 'w', newline='') as file: 471 | writer = csv.writer(file) 472 | writer.writerow(x_t) 473 | writer.writerow(y_t) 474 | plt.contour(X, Y, Z, levels=[env.safety_dist]) 475 | 476 | 477 | # Return to the original working directory (optional) 478 | os.chdir('..') 479 | os.chdir('..') 480 | os.chdir('..') 481 | 482 | 483 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 484 | done=done) 485 | 486 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 487 | 488 | return np.sum(self.rollout_buffer.rewards), safety_rate 489 | 490 | def train(self): 491 | """ 492 | Train on the current rollout buffer. 493 | """ 494 | for epoch in range(self.n_epochs): 495 | 496 | # Do a complete pass on the rollout buffer 497 | for rollout_data in self.rollout_buffer.get(self.batch_size): 498 | 499 | actions = rollout_data.actions 500 | obs = rollout_data.observations 501 | values = self.v(obs).flatten() 502 | try: 503 | log_probs = self.pi.log_probs(obs, actions) 504 | except: 505 | print(self.pi.log_probs(obs, actions)) 506 | import pdb; pdb.set_trace() 507 | 508 | entropies = self.pi.entropy(obs) 509 | if log_probs.device!=actions.device: 510 | log_probs=log_probs.to('cuda:0') 511 | entropies=entropies.to('cuda:0') 512 | advantages = rollout_data.advantages 513 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 514 | 515 | # ratio between old and new policy, should be one at the first iteration 516 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 517 | 518 | 519 | policy_loss_1 = advantages * ratio 520 | policy_loss_2 = advantages * torch.clamp(ratio, 521 | 1 - self.clip_range, 522 | 1 + self.clip_range) 523 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 524 | self.entropy_coef * entropies.mean() 525 | 526 | self.pi_optim.zero_grad() 527 | policy_loss.backward() 528 | # Clip grad norm 529 | if self.grad_clip_radius is not None: 530 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 531 | self.grad_clip_radius) 532 | self.pi_optim.step() 533 | 534 | value_loss = F.mse_loss(rollout_data.returns, values) 535 | 536 | self.v_optim.zero_grad() 537 | value_loss.backward() 538 | # Clip grad norm 539 | if self.grad_clip_radius is not None: 540 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 541 | self.grad_clip_radius) 542 | self.v_optim.step() 543 | -------------------------------------------------------------------------------- /experiments/sampling_beta/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributions as td 4 | from torch.nn import functional as F 5 | import gym 6 | from gym import spaces 7 | import numpy as np 8 | from typing import NamedTuple 9 | import warnings 10 | from matplotlib import pyplot as plt 11 | import os 12 | import csv 13 | 14 | from stable_baselines3.common.utils import obs_as_tensor 15 | from stable_baselines3.common.preprocessing import ( 16 | get_obs_shape, get_action_dim 17 | ) 18 | 19 | 20 | #from agents.TruncatedNormal import TruncatedNormal as tn 21 | 22 | from wesutils import two_layer_net 23 | 24 | class RolloutBufferSamples(NamedTuple): 25 | observations: torch.Tensor 26 | actions: torch.Tensor 27 | old_values: torch.Tensor 28 | old_log_prob: torch.Tensor 29 | advantages: torch.Tensor 30 | returns: torch.Tensor 31 | 32 | 33 | class RolloutBuffer: 34 | 35 | def __init__(self, 36 | buffer_size, 37 | observation_space, 38 | action_space, 39 | gamma=0.90, 40 | device='cpu'): 41 | 42 | self.buffer_size = buffer_size 43 | self.observation_space = observation_space 44 | self.action_space = action_space 45 | self.gamma = gamma 46 | self.device = device 47 | self.obs_shape = get_obs_shape(self.observation_space) 48 | self.action_dim = get_action_dim(self.action_space) 49 | 50 | self.reset() 51 | 52 | def reset(self): 53 | 54 | self.observations = np.zeros( 55 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 56 | ) 57 | self.actions = np.zeros( 58 | (self.buffer_size, self.action_dim), dtype=np.float32 59 | ) 60 | self.rewards = np.zeros( 61 | (self.buffer_size,), dtype=np.float32 62 | ) 63 | self.episode_starts = np.zeros( 64 | (self.buffer_size,), dtype=np.float32 65 | ) 66 | self.values = np.zeros( 67 | (self.buffer_size,), dtype=np.float32 68 | ) 69 | self.log_probs = np.zeros( 70 | (self.buffer_size,), dtype=np.float32 71 | ) 72 | self.advantages = np.zeros( 73 | (self.buffer_size,), dtype=np.float32 74 | ) 75 | 76 | self.full = False 77 | self.pos = 0 78 | 79 | def compute_returns_and_advantage(self, last_value, done): 80 | 81 | last_value = last_value.clone().cpu().numpy().flatten() 82 | 83 | discounted_reward = 0 84 | for step in reversed(range(self.buffer_size)): 85 | if step == self.buffer_size - 1: 86 | next_non_terminal = 1.0 - done 87 | next_value = last_value 88 | else: 89 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 90 | next_value = self.values[step + 1] 91 | discounted_reward = self.rewards[step] + \ 92 | self.gamma * discounted_reward * next_non_terminal 93 | self.advantages[step] = discounted_reward - self.values[step] 94 | self.returns = self.advantages + self.values 95 | 96 | def add(self, obs, action, reward, episode_start, value, log_prob): 97 | 98 | if len(log_prob.shape) == 0: 99 | log_prob = log_prob.reshape(-1, 1) 100 | 101 | if isinstance(self.observation_space, spaces.Discrete): 102 | obs = obs.reshape((1,) + self.obs_shape) 103 | 104 | self.observations[self.pos] = np.array(obs).copy() 105 | self.actions[self.pos] = np.array(action).copy() 106 | self.rewards[self.pos] = np.array(reward).copy() 107 | self.episode_starts[self.pos] = np.array(episode_start).copy() 108 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 109 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 110 | self.pos += 1 111 | if self.pos == self.buffer_size: 112 | self.full = True 113 | 114 | def get(self, batch_size=None): 115 | assert self.full, "" 116 | indices = np.random.permutation(self.buffer_size) 117 | 118 | # Return everything, don't create minibatches 119 | if batch_size is None: 120 | batch_size = self.buffer_size 121 | 122 | start_idx = 0 123 | while start_idx < self.buffer_size: 124 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 125 | start_idx += batch_size 126 | 127 | def _get_samples(self, batch_inds): 128 | data = ( 129 | self.observations[batch_inds], 130 | self.actions[batch_inds], 131 | self.values[batch_inds].flatten(), 132 | self.log_probs[batch_inds].flatten(), 133 | self.advantages[batch_inds].flatten(), 134 | self.returns[batch_inds].flatten(), 135 | ) 136 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 137 | 138 | def to_torch(self, array, copy=True): 139 | if copy: 140 | return torch.tensor(array).to(self.device) 141 | return torch.as_tensor(array).to(self.device) 142 | 143 | 144 | class PolicyNetwork(nn.Module): 145 | """Base class for stochastic policy networks.""" 146 | 147 | def __init__(self): 148 | super().__init__() 149 | 150 | def forward(self, state): 151 | """Take state as input, then output the parameters of the policy.""" 152 | 153 | raise NotImplemented("forward not implemented.") 154 | 155 | def sample(self, state): 156 | """ 157 | Sample an action based on the model parameters given the current state. 158 | """ 159 | 160 | raise NotImplemented("sample not implemented.") 161 | 162 | def log_probs(self, obs, actions): 163 | """ 164 | Return log probabilities for each state-action pair. 165 | """ 166 | 167 | raise NotImplemented("log_probs not implemented.") 168 | 169 | def entropy(self, obs): 170 | """ 171 | Return entropy of the policy for each state. 172 | """ 173 | 174 | raise NotImplemented("entropy not implemented.") 175 | 176 | 177 | class BetaPolicyBase(PolicyNetwork): 178 | """ 179 | Base class for Beta policy. 180 | 181 | Desired network needs to be implemented. 182 | """ 183 | 184 | def __init__(self, constraint_fn, action_dim, enable_cuda=False): 185 | 186 | super().__init__() 187 | 188 | self.device = torch.device( 189 | 'cuda' if torch.cuda.is_available() and enable_cuda \ 190 | else 'cpu') 191 | self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 192 | self.action_dim = action_dim 193 | 194 | def _vectorize_f(self, f, action_dim): 195 | """ 196 | Converts a function f defined on 1D numpy arrays and outputting pairs of 197 | scalars into a vectorized function accepting batches of 198 | torch tensorized arrays and output pairs of torch tensors. 199 | """ 200 | 201 | def vectorized_f(obs, action_dim): 202 | 203 | obs = obs.cpu().detach().numpy() 204 | 205 | if len(obs.shape) == 1: # check to see if obs is a batch or single obs 206 | batch_size = 1 207 | lbs, ubs = f(obs) 208 | 209 | else: 210 | batch_size = obs.shape[0] 211 | lbs = np.zeros([batch_size, self.action_dim]) 212 | ubs = np.zeros([batch_size, self.action_dim]) 213 | for i in range(batch_size): 214 | lbs[i], ubs[i] = f(obs[i]) 215 | 216 | lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 217 | ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 218 | lbs = lbs.to(self.device) 219 | ubs = ubs.to(self.device) 220 | 221 | return lbs, ubs 222 | 223 | return vectorized_f 224 | 225 | def sample(self, obs, action_dim, no_log_prob=False): 226 | """ 227 | Sample from independent Beta distributions across each action_dim. 228 | """ 229 | 230 | assert len(obs.shape) == 1, 'obs must be a flat array' 231 | 232 | alphas, betas = self.forward(obs) 233 | alphas, betas = torch.flatten(alphas), torch.flatten(betas) 234 | dists = [ 235 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 236 | ] 237 | action_along_dims = [dist.rsample() for dist in dists] 238 | action = torch.tensor(action_along_dims, requires_grad=True) 239 | log_prob = torch.sum(torch.tensor([ 240 | dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 241 | ], requires_grad=True)) 242 | lb, ub = self.constraint_fn(obs, action_dim) 243 | action = lb + (ub - lb) * action 244 | return action if no_log_prob else (action, log_prob) 245 | 246 | def log_probs(self, obs, actions, action_dim): 247 | alphas_arr, betas_arr = self.forward(obs) 248 | dists = [] 249 | 250 | alphas_arr_1 = alphas_arr[:,0] 251 | alphas_arr_2 = alphas_arr[:,1] 252 | betas_arr_1 = betas_arr[:,0] 253 | betas_arr_2 = betas_arr[:,1] 254 | try: 255 | dists_1 = td.Beta(alphas_arr_1, betas_arr_1) 256 | except: 257 | import pdb; pdb.set_trace() 258 | 259 | try: 260 | dists_2 = td.Beta(alphas_arr_2, betas_arr_2) 261 | except: 262 | import pdb; pdb.set_trace() 263 | 264 | for i in range(alphas_arr.shape[0]): 265 | alphas = alphas_arr[i] 266 | betas = betas_arr[i] 267 | dists.append([ 268 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 269 | ]) 270 | 271 | lbs, ubs = self.constraint_fn(obs, action_dim) 272 | if lbs.device!=actions.device: 273 | lbs = lbs.to('cuda:0') 274 | ubs = ubs.to('cuda:0') 275 | actions = (actions - lbs) / (ubs - lbs) 276 | actions = actions.clip(0, 1) 277 | 278 | log_probs = [] 279 | for action, action_dists in zip(actions, dists): 280 | log_probs.append( 281 | torch.sum(torch.tensor([ 282 | dim_dist.log_prob(dim_action) \ 283 | for dim_dist, dim_action in zip(action_dists, action) 284 | ], requires_grad=True)) 285 | ) 286 | log_probs = torch.tensor(log_probs, requires_grad=True) 287 | 288 | return_new = dists_1.log_prob(actions[:,0]).flatten() + dists_2.log_prob(actions[:,1]).flatten() 289 | 290 | 291 | return return_new 292 | 293 | def entropy(self, obs): 294 | """ 295 | Returns sum of entropies along each independent action dimension. 296 | """ 297 | alphas_arr, betas_arr = self.forward(obs) 298 | dists = [] 299 | for i in range(alphas_arr.shape[0]): 300 | alphas = alphas_arr[i] 301 | betas = betas_arr[i] 302 | dists.append([ 303 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 304 | ]) 305 | entropies = torch.tensor( 306 | [torch.sum(torch.tensor([dist.entropy() for dist in dist_list])) \ 307 | for dist_list in dists] 308 | ) 309 | return entropies 310 | 311 | 312 | 313 | class BetaPolicy(BetaPolicyBase): 314 | """ 315 | Beta policy using a two-layer, two-headed MLP with ReLU activation. 316 | """ 317 | 318 | def __init__(self, obs_dim, constraint_fn, action_dim, 319 | hidden_layer1_size=64, 320 | hidden_layer2_size=64): 321 | 322 | super().__init__(constraint_fn, action_dim=action_dim) 323 | 324 | self.base_net = nn.Sequential( 325 | nn.Linear(obs_dim, hidden_layer1_size), 326 | nn.Tanh(), 327 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 328 | nn.Tanh(), 329 | ) 330 | 331 | self.alpha_net = nn.Sequential( 332 | nn.Linear(hidden_layer2_size, action_dim), 333 | nn.Softplus(), 334 | ) 335 | 336 | self.beta_net = nn.Sequential( 337 | nn.Linear(hidden_layer2_size, action_dim), 338 | nn.Softplus(), 339 | ) 340 | 341 | def forward(self, obs): 342 | 343 | x = self.base_net(obs) 344 | alpha = 1.0 + self.alpha_net(x) 345 | beta = 1.0 + self.beta_net(x) 346 | 347 | return alpha, beta 348 | 349 | 350 | class CategoricalPolicy(PolicyNetwork): 351 | """ 352 | Base class for categorical policy. 353 | 354 | Desired network needs to be implemented. 355 | """ 356 | 357 | def __init__(self, num_actions): 358 | 359 | super().__init__() 360 | 361 | self.num_actions = num_actions 362 | 363 | def sample(self, obs, no_log_prob=False): 364 | logits = self.forward(obs) 365 | dist = td.Categorical(logits=logits) 366 | action = dist.sample(sample_shape=torch.tensor([1])) 367 | return action if no_log_prob else (action, dist.log_prob(action)) 368 | 369 | def log_probs(self, obs, actions): 370 | dists = td.Categorical(logits=self.forward(obs)) 371 | return dists.log_prob(actions.flatten()) 372 | 373 | def entropy(self, obs): 374 | dists = td.Categorical(logits=self.forward(obs)) 375 | return dists.entropy() 376 | 377 | 378 | class CategoricalPolicyTwoLayer(CategoricalPolicy): 379 | """ 380 | Categorical policy using a fully connected two-layer network. 381 | """ 382 | 383 | def __init__(self, state_dim, num_actions, 384 | hidden_layer1_size=64, 385 | hidden_layer2_size=64, 386 | init_std=0.001): 387 | 388 | super().__init__(num_actions) 389 | 390 | self.init_std = init_std 391 | 392 | self.linear1 = nn.Linear(state_dim, hidden_layer1_size) 393 | self.linear2 = nn.Linear(hidden_layer1_size, hidden_layer2_size) 394 | self.linear3 = nn.Linear(hidden_layer2_size, num_actions) 395 | nn.init.normal_(self.linear1.weight, std=init_std) 396 | nn.init.normal_(self.linear2.weight, std=init_std) 397 | nn.init.normal_(self.linear3.weight, std=init_std) 398 | 399 | def forward(self, state): 400 | x = F.relu(self.linear1(state)) 401 | x = F.relu(self.linear2(x)) 402 | output = self.linear3(x) 403 | return output 404 | 405 | 406 | class PPOBase: 407 | def __init__(self, 408 | env, 409 | policy, 410 | value_function, 411 | policy_lr, 412 | value_lr, 413 | entropy_coef=0.0, 414 | clip_range=0.2, 415 | n_epochs=10, 416 | batch_size=64, 417 | weight_decay=0.0, 418 | gamma=0.99, 419 | buffer_size=2048, 420 | enable_cuda=True, 421 | policy_optimizer=torch.optim.Adam, 422 | value_optimizer=torch.optim.Adam, 423 | grad_clip_radius=None): 424 | 425 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 426 | 'returning information about CBF-related safety.') 427 | 428 | self.env = env 429 | self.pi = policy 430 | self.v = value_function 431 | self.entropy_coef = entropy_coef 432 | self.clip_range = clip_range 433 | self.n_epochs = n_epochs 434 | self.batch_size = batch_size 435 | 436 | self.__cuda_enabled = enable_cuda 437 | self.enable_cuda(self.__cuda_enabled, warn=False) 438 | # NOTE: self.device is defined when self.enable_cuda is called! 439 | 440 | self.pi_optim = policy_optimizer(self.pi.parameters(), 441 | lr=policy_lr, 442 | weight_decay=weight_decay) 443 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 444 | self.grad_clip_radius = grad_clip_radius 445 | 446 | self.rollout_buffer = RolloutBuffer( 447 | buffer_size, 448 | env.observation_space, 449 | env.action_space, 450 | device=self.device, 451 | gamma=gamma 452 | ) 453 | 454 | @property 455 | def cuda_enabled(self): 456 | return self.__cuda_enabled 457 | 458 | def enable_cuda(self, enable_cuda=True, warn=True): 459 | """Enable or disable cuda and update models.""" 460 | 461 | if warn: 462 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 463 | "initializing optimizers can give errors when using " 464 | "optimizers other than SGD or Adam!") 465 | 466 | self.__cuda_enabled = enable_cuda 467 | self.device = torch.device( 468 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 469 | else 'cpu') 470 | self.pi.to(self.device) 471 | self.v.to(self.device) 472 | 473 | def load_models(self, filename, enable_cuda=True, continue_training=True): 474 | """ 475 | Load policy and value functions. Copy them to target functions. 476 | 477 | This method is for evaluation only. Use load_checkpoint to continue 478 | training. 479 | """ 480 | 481 | models = torch.load(filename) 482 | 483 | self.pi.load_state_dict(models['pi_state_dict']) 484 | self.v.load_state_dict(models['v_state_dict']) 485 | 486 | self.pi.eval() 487 | self.v.eval() 488 | 489 | self.enable_cuda(enable_cuda, warn=False) 490 | 491 | def save_checkpoint(self, filename): 492 | """Save state_dicts of models and optimizers.""" 493 | 494 | torch.save({ 495 | 'using_cuda': self.__cuda_enabled, 496 | 'pi_state_dict': self.pi.state_dict(), 497 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 498 | 'v_state_dict': self.v.state_dict(), 499 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 500 | }, filename) 501 | 502 | def load_checkpoint(self, filename, continue_training=True): 503 | """Load state_dicts for models and optimizers.""" 504 | 505 | checkpoint = torch.load(filename) 506 | 507 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 508 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 509 | self.v.load_state_dict(models['v_state_dict']) 510 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 511 | 512 | if continue_training: 513 | self.pi.train() 514 | self.v.train() 515 | else: 516 | self.pi.eval() 517 | self.v.eval() 518 | 519 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 520 | 521 | def collect_rollout(self, env, rollout_length): 522 | """ 523 | Perform a rollout and fill the rollout buffer. 524 | """ 525 | 526 | self._last_obs = env.reset() 527 | self._last_episode_start = np.zeros(1) 528 | n_steps = 0 529 | self.rollout_buffer.reset() 530 | 531 | num_unsafe_steps = 0 532 | x_t=[] 533 | y_t=[] 534 | 535 | local_flag_done = False 536 | while n_steps < rollout_length: 537 | 538 | action_dim=get_action_dim(env.action_space) 539 | 540 | with torch.no_grad(): 541 | # Convert to pytorch tensor or to TensorDict 542 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 543 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 544 | value = self.v(obs_tensor) 545 | action = action.cpu().numpy() 546 | 547 | # Rescale and perform action 548 | clipped_action = action 549 | # Clip the actions to avoid out of bound error 550 | if isinstance(self.env.action_space, gym.spaces.Box): 551 | clipped_action = np.clip(action, self.env.action_space.low, 552 | self.env.action_space.high) 553 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 554 | clipped_action = int(clipped_action) 555 | 556 | new_obs, reward, done, info = env.step(clipped_action) 557 | 558 | x_t.append(new_obs[0]) 559 | y_t.append(new_obs[1]) 560 | if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 561 | print("crash") 562 | 563 | n_steps += 1 564 | 565 | if isinstance(self.env.action_space, gym.spaces.Discrete): 566 | # Reshape in case of discrete action 567 | action = action.reshape(-1, 1) 568 | 569 | self.rollout_buffer.add(self._last_obs, action, reward, 570 | self._last_episode_start, value, log_prob) 571 | self._last_obs = new_obs.flatten() 572 | self._last_episode_start = done 573 | 574 | if n_steps == rollout_length: 575 | env.reset() 576 | 577 | 578 | plt.xlim(np.double(env.min_x),np.double(env.max_x)) 579 | plt.ylim(np.double(env.min_y),np.double(env.max_y)) 580 | plt.xlabel('X axis') 581 | plt.ylabel('Y-axis') 582 | plt.plot(x_t,y_t) 583 | plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 584 | plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 585 | 586 | 587 | def f(x, y, xa, yb, a, b): 588 | return (x - xa)**4/a**4 + (y - yb)**4/b**4 589 | 590 | # Define the point around which to plot 591 | xa, yb = env.obstacle[0], env.obstacle[1] 592 | 593 | # Define the range of x and y values to plot 594 | x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 595 | y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 596 | 597 | # Create a grid of x and y values 598 | X, Y = np.meshgrid(x_vals, y_vals) 599 | 600 | # Evaluate the function at each point in the grid 601 | Z = f(X, Y, xa, yb, env.a_d, env.b_d) 602 | 603 | # Plot the function as a contour plot 604 | ##Create a folder in the current directory 605 | folder_name_main = f"{{{env.date}}}" 606 | os.makedirs(folder_name_main, exist_ok=True) 607 | ##Change the current working directory to the newly created folder 608 | os.chdir(folder_name_main) 609 | 610 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 611 | os.makedirs(folder_name, exist_ok=True) 612 | ##Change the current working directory to the newly created folder 613 | os.chdir(folder_name) 614 | 615 | folder_name_1 = f"{{lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}}}" 616 | os.makedirs(folder_name_1, exist_ok=True) 617 | os.chdir(folder_name_1) 618 | 619 | with open(f"episode={env.episodes}.csv", 'w', newline='') as file: 620 | writer = csv.writer(file) 621 | writer.writerow(x_t) 622 | writer.writerow(y_t) 623 | if (env.episodes)%1 == 0: 624 | plt.savefig(f"ep={env.episodes}.png") 625 | plt.contour(X, Y, Z, levels=[env.safety_dist]) 626 | 627 | os.chdir('..') 628 | os.chdir('..') 629 | os.chdir('..') 630 | 631 | 632 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 633 | done=done) 634 | 635 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 636 | 637 | return np.sum(self.rollout_buffer.rewards), safety_rate 638 | 639 | def train(self): 640 | """ 641 | Train on the current rollout buffer. 642 | """ 643 | #action_dim = get_action_dim(self.action_space) 644 | for epoch in range(self.n_epochs): 645 | 646 | # Do a complete pass on the rollout buffer 647 | for rollout_data in self.rollout_buffer.get(self.batch_size): 648 | 649 | actions = rollout_data.actions 650 | obs = rollout_data.observations 651 | values = self.v(obs).flatten() 652 | try: 653 | log_probs = self.pi.log_probs(obs, actions, actions.shape[1]) 654 | except: 655 | print(self.pi.log_probs(obs, actions, actions.shape[1])) 656 | import pdb; pdb.set_trace() 657 | 658 | entropies = self.pi.entropy(obs) 659 | if log_probs.device!=actions.device: 660 | log_probs=log_probs.to('cuda:0') 661 | entropies=entropies.to('cuda:0') 662 | advantages = rollout_data.advantages 663 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 664 | 665 | # ratio between old and new policy, should be one at the first iteration 666 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 667 | 668 | policy_loss_1 = advantages * ratio 669 | policy_loss_2 = advantages * torch.clamp(ratio, 670 | 1 - self.clip_range, 671 | 1 + self.clip_range) 672 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 673 | self.entropy_coef * entropies.mean() 674 | 675 | self.pi_optim.zero_grad() 676 | policy_loss.backward() 677 | # Clip grad norm 678 | if self.grad_clip_radius is not None: 679 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 680 | self.grad_clip_radius) 681 | self.pi_optim.step() 682 | 683 | value_loss = F.mse_loss(rollout_data.returns, values) 684 | 685 | self.v_optim.zero_grad() 686 | value_loss.backward() 687 | # Clip grad norm 688 | if self.grad_clip_radius is not None: 689 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 690 | self.grad_clip_radius) 691 | self.v_optim.step() 692 | -------------------------------------------------------------------------------- /Pendulum/agents/agents.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | import yaml 5 | from scipy import integrate 6 | from wesutils.utils import GaussianPolicyTwoLayer, two_layer_net 7 | from functools import reduce 8 | from operator import mul 9 | 10 | 11 | class GaussianPolicyCBF(GaussianPolicyTwoLayer): 12 | """ 13 | Modified version of the standard Gaussian policy for use in CBF- 14 | constrained settings. Provides access to the pdf of the policy at 15 | a given state as well as utilities for directly manipulating the 16 | parameters of the policy. Requires a CBF upon initilization. The 17 | CBF is assumed to accept torch tensor representations of the state. 18 | 19 | NOTE: This version of the policy assumes that action_dim=1 and that 20 | the CBF returns intervals. 21 | """ 22 | 23 | def __init__(self, cbf, state_dim, action_dim=1, 24 | simple_cov=True, 25 | hidden_layer1_size=32, 26 | hidden_layer2_size=32, 27 | activation='sigmoid', 28 | log_std_min=-20, log_std_max=3, 29 | weight_init_std=0.0001): 30 | 31 | assert action_dim == 1, "Action dimension must be 1" 32 | 33 | super().__init__( 34 | state_dim, action_dim, 35 | simple_cov=simple_cov, 36 | hidden_layer1_size=hidden_layer1_size, 37 | hidden_layer2_size=hidden_layer2_size, 38 | activation=activation, 39 | log_std_min=log_std_min, log_std_max=log_std_max, 40 | weight_init_std=weight_init_std 41 | ) 42 | 43 | self.cbf = cbf 44 | self.param_shape = tuple(p.shape for p in self.parameters()) 45 | self.param_size = sum(reduce(mul, shape) for shape in self.param_shape) 46 | 47 | def to(self, device): 48 | super().to(device) 49 | self.device = device 50 | 51 | @property 52 | def params(self): 53 | """Get policy model parameters.""" 54 | return torch.cat([p.data.reshape(-1) for p in self.parameters()]) 55 | 56 | @params.setter 57 | def params(self, new_values): 58 | """Set policy model parameters.""" 59 | assert new_values.size()[0] == self.param_size, "Error" 60 | 61 | index = 0 62 | 63 | for param in self.parameters(): 64 | size = reduce(mul, param.shape) 65 | block = new_values[index:index+size].reshape(param.shape) 66 | param.data.copy_(block) 67 | index += size 68 | 69 | def _numpy_original_pdf(self, state): 70 | """ 71 | Return numpy version of the pdf of the untruncated policy at the 72 | state provided. 73 | """ 74 | 75 | mean, cov = self.forward(state) 76 | mean, cov = mean.detach().numpy(), cov.detach().numpy() 77 | K = np.float_power((2 * np.pi)**len(mean) * np.linalg.det(cov), -0.5) 78 | inv = np.linalg.inv(cov) 79 | 80 | def pdf(action): 81 | return K * np.exp( 82 | -0.5 * (action - mean).dot(inv.dot(action - mean)) 83 | ).flatten() 84 | 85 | return pdf 86 | 87 | def _torch_original_pdf(self, state): 88 | """ 89 | Return torch version of the pdf of the untruncated policy at the 90 | state provided. Detach is not called, so all computations herein 91 | are reflected in the computation graph. 92 | """ 93 | 94 | # import pdb; pdb.set_trace() 95 | 96 | mean, cov = self.forward(state) 97 | K = torch.float_power((2 * np.pi)**len(mean) * torch.linalg.det(cov), -0.5) 98 | inv = torch.linalg.inv(cov).squeeze(dim=0) 99 | 100 | def pdf(action): 101 | return K * torch.exp(-0.5 * torch.matmul( 102 | action - mean, torch.matmul(inv, action - mean))) 103 | 104 | return pdf 105 | 106 | def get_numpy_pdf(self, state): 107 | """ 108 | Return the pdf of the original Gaussian pdf truncated to the set C(x). 109 | """ 110 | 111 | lb, ub = self.cbf(state=state) 112 | original_pdf = self._numpy_original_pdf(state) 113 | normalization = integrate.quad(original_pdf, lb, ub)[0] 114 | 115 | def pdf(action): 116 | return original_pdf(action) / normalization if lb <= action <= ub \ 117 | else 0 118 | 119 | return pdf 120 | 121 | def sample(self, state, sample_cutoff=100, 122 | no_log_prob=False, num_log_prob_samples=1000): 123 | """ 124 | Repeatedly sample from the original Gaussian policy until an action 125 | lying within the CBF constraint set is generated. 126 | 127 | sample_cutoff specifies the number of times to sample using the 128 | original Gaussian policy before simply uniformly generating an 129 | action from the CBF constraint set. 130 | """ 131 | 132 | lb, ub = self.cbf(state=state) 133 | lb = float(lb) 134 | ub = float(ub) 135 | state = torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 136 | orig_pdf = self._torch_original_pdf(state) 137 | 138 | if not no_log_prob: 139 | actions = lb + (ub - lb) * torch.rand( 140 | num_log_prob_samples, self.action_dim, 1, device=self.device 141 | ) 142 | log_prob = orig_pdf(actions).sum().log() # log pi_theta (C(x) | x) 143 | 144 | for _ in range(sample_cutoff): 145 | 146 | action, orig_log_prob = super().sample(state) 147 | 148 | if lb <= action <= ub: 149 | log_prob = orig_log_prob - log_prob # log pi^C_theta 150 | # = log pi_theta - log pi_theta(C(x) | x) 151 | return action, log_prob if not no_log_prob else action 152 | 153 | action = lb + (ub - lb) * torch.rand(1, 1) 154 | orig_log_prob = orig_pdf(action).log() 155 | return action, (orig_log_prob - log_prob) if not no_log_prob else action 156 | 157 | 158 | class CBFREINFORCEAgent: 159 | """ 160 | Agent for training a CBF-constrained version of the classic REINFORCE 161 | algorithm. 162 | """ 163 | 164 | def __init__(self, 165 | ### agent parameters 166 | state_dim, action_dim, cbf, 167 | policy_lr, discount_factor, 168 | num_log_prob_samples=1000, 169 | enable_cuda=False, 170 | optimizer=torch.optim.Adam, 171 | grad_clip_radius=None, 172 | ### policy parameters 173 | simple_cov=True, 174 | hidden_layer1_size=32, 175 | hidden_layer2_size=32, 176 | activation='relu', 177 | log_std_min=-20, log_std_max=3, 178 | weight_init_std=0.0001): 179 | 180 | assert action_dim == 1, "Action dimension must be 1 in this version" 181 | 182 | self.pi = GaussianPolicyCBF( 183 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 184 | simple_cov=simple_cov, 185 | hidden_layer1_size=hidden_layer1_size, 186 | hidden_layer2_size=hidden_layer2_size, 187 | activation=activation, 188 | log_std_min=log_std_min, log_std_max=log_std_max, 189 | weight_init_std=weight_init_std 190 | ) 191 | 192 | self.gamma = discount_factor 193 | 194 | self.pi_optim = optimizer(self.pi.parameters(), lr=policy_lr) 195 | self.grad_clip_radius = grad_clip_radius 196 | 197 | self.__cuda_enabled = enable_cuda 198 | self.enable_cuda(self.__cuda_enabled, warn=False) 199 | # NOTE: self.device is defined when self.enable_cuda is called! 200 | 201 | @property 202 | def cuda_enabled(self): 203 | return self.__cuda_enabled 204 | 205 | def enable_cuda(self, enable_cuda=True, warn=True): 206 | """Enable or disable cuda and update models.""" 207 | 208 | if warn: 209 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 210 | "initializing optimizers can give errors when using " 211 | "optimizers other than SGD or Adam!") 212 | 213 | self.__cuda_enabled = enable_cuda 214 | self.device = torch.device( 215 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 216 | else 'cpu') 217 | self.pi.to(self.device) 218 | 219 | def load_models(self, filename, enable_cuda=True, continue_training=True): 220 | """Load policy and value functions. Copy them to target functions.""" 221 | 222 | models = torch.load(filename) 223 | 224 | self.pi.load_state_dict(models['pi_state_dict']) 225 | 226 | if continue_training: 227 | self.pi.train() 228 | else: 229 | self.pi.eval() 230 | 231 | self.enable_cuda(enable_cuda, warn=False) 232 | 233 | def save_checkpoint(self, filename): 234 | """Save state_dicts of models and optimizers.""" 235 | 236 | torch.save({ 237 | 'using_cuda': self.__cuda_enabled, 238 | 'pi_state_dict': self.pi.state_dict(), 239 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 240 | }, filename) 241 | 242 | def load_checkpoint(self, filename, continue_training=True): 243 | """Load state_dicts for models and optimizers.""" 244 | 245 | checkpoint = torch.load(filename) 246 | 247 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 248 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 249 | 250 | if continue_training: 251 | self.pi.train() 252 | 253 | else: 254 | self.pi.eval() 255 | 256 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 257 | 258 | def update(self, env, rollout_length, sample_cutoff=100): 259 | """ 260 | Perform a single rollout and corresponding gradient update. 261 | Return the total reward accumulated during the rollout. 262 | """ 263 | 264 | rewards, log_probs = [], [] 265 | num_steps = 0 266 | 267 | state = env.state 268 | 269 | for _ in range(rollout_length): 270 | action, log_prob = self.pi.sample(state, 271 | sample_cutoff=sample_cutoff) 272 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 273 | rewards.append(float(reward)) 274 | log_probs.append(log_prob) 275 | 276 | if done: 277 | break 278 | 279 | num_steps += 1 280 | 281 | G = 0 282 | pi_loss = 0 283 | 284 | for i in range(len(rewards) - 1, -1, -1): 285 | G = rewards[i] + self.gamma * G 286 | pi_loss = pi_loss + (self.gamma ** i) * G * log_probs[i] 287 | 288 | pi_loss = -pi_loss 289 | 290 | self.pi_optim.zero_grad() 291 | pi_loss.backward() 292 | if self.grad_clip_radius is not None: 293 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 294 | self.grad_clip_radius) 295 | self.pi_optim.step() 296 | 297 | return np.mean(rewards) 298 | 299 | def train(self, env, num_episodes, rollout_length, 300 | output_dir, args_list, 301 | reset_env=True, 302 | sample_cutoff=100): 303 | """ 304 | Train on the environment. 305 | """ 306 | 307 | episode_mean_rewards = [] 308 | 309 | for i in range(num_episodes): 310 | if reset_env: 311 | env.reset() 312 | mean_reward = self.update(env, rollout_length, 313 | sample_cutoff=sample_cutoff) 314 | cbf = [float(elem) for elem in env.cbf(env.state)] 315 | print( 316 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 317 | episode_mean_rewards.append(mean_reward) 318 | 319 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 320 | np.save(rewards_filename, episode_mean_rewards) 321 | 322 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 323 | with open(hyperparams_filename, 'w') as f: 324 | yaml.dump(args_list, f) 325 | 326 | 327 | 328 | class CBFACAgent: 329 | """ 330 | Agent for training a CBF-constrained version of the classic actor-critic 331 | algorithm. 332 | """ 333 | 334 | def __init__(self, 335 | ### agent parameters 336 | state_dim, action_dim, 337 | policy_lr, value_lr, discount_factor, 338 | cbf=None, 339 | num_log_prob_samples=1000, 340 | enable_cuda=False, 341 | policy_optimizer=torch.optim.Adam, 342 | value_optimizer=torch.optim.Adam, 343 | grad_clip_radius=None, 344 | ### policy parameters 345 | simple_cov=True, 346 | policy_hidden_layer1_size=32, 347 | policy_hidden_layer2_size=32, 348 | policy_activation='relu', 349 | log_std_min=-20, log_std_max=3, 350 | weight_init_std=0.0001, 351 | # value function parameters 352 | value_hidden_layer1_size=32, 353 | value_hidden_layer2_size=32, 354 | value_activation='ReLU'): 355 | 356 | assert action_dim == 1, "Action dimension must be 1 in this version" 357 | 358 | self.pi = GaussianPolicyCBF( 359 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 360 | simple_cov=simple_cov, 361 | hidden_layer1_size=policy_hidden_layer1_size, 362 | hidden_layer2_size=policy_hidden_layer2_size, 363 | activation=policy_activation, 364 | log_std_min=log_std_min, log_std_max=log_std_max, 365 | weight_init_std=weight_init_std 366 | ) 367 | 368 | self.v = two_layer_net( 369 | input_dim=state_dim, output_dim=1, 370 | hidden_layer1_size=value_hidden_layer1_size, 371 | hidden_layer2_size=value_hidden_layer2_size, 372 | activation=value_activation, 373 | ) 374 | 375 | self.gamma = discount_factor 376 | 377 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 378 | self.grad_clip_radius = grad_clip_radius 379 | 380 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 381 | self.grad_clip_radius = grad_clip_radius 382 | 383 | self.__cuda_enabled = enable_cuda 384 | self.enable_cuda(self.__cuda_enabled, warn=False) 385 | # NOTE: self.device is defined when self.enable_cuda is called! 386 | 387 | @property 388 | def cuda_enabled(self): 389 | return self.__cuda_enabled 390 | 391 | def enable_cuda(self, enable_cuda=True, warn=True): 392 | """Enable or disable cuda and update models.""" 393 | 394 | if warn: 395 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 396 | "initializing optimizers can give errors when using " 397 | "optimizers other than SGD or Adam!") 398 | 399 | self.__cuda_enabled = enable_cuda 400 | self.device = torch.device( 401 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 402 | else 'cpu') 403 | self.pi.to(self.device) 404 | self.v.to(self.device) 405 | 406 | def load_models(self, filename, enable_cuda=True, continue_training=True): 407 | """Load policy and value functions. Copy them to target functions.""" 408 | 409 | models = torch.load(filename) 410 | 411 | self.pi.load_state_dict(models['pi_state_dict']) 412 | self.v.load_state_dict(models['v_state_dict']) 413 | 414 | if continue_training: 415 | self.pi.train() 416 | self.v.train() 417 | else: 418 | self.pi.eval() 419 | self.v.eval() 420 | 421 | self.enable_cuda(enable_cuda, warn=False) 422 | 423 | def save_checkpoint(self, filename): 424 | """Save state_dicts of models and optimizers.""" 425 | 426 | torch.save({ 427 | 'using_cuda': self.__cuda_enabled, 428 | 'pi_state_dict': self.pi.state_dict(), 429 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 430 | 'v_state_dict': self.v.state_dict(), 431 | 'v_optimizer_state_dict': self.v_optim.state_dict() 432 | }, filename) 433 | 434 | def load_checkpoint(self, filename, continue_training=True): 435 | """Load state_dicts for models and optimizers.""" 436 | 437 | checkpoint = torch.load(filename) 438 | 439 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 440 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 441 | self.v.load_state_dict(checkpoint['v_state_dict']) 442 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 443 | 444 | if continue_training: 445 | self.pi.train() 446 | self.v.train() 447 | 448 | else: 449 | self.pi.eval() 450 | self.v.eval() 451 | 452 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 453 | 454 | def update(self, env, episode_length, sample_cutoff=100): 455 | """ 456 | Perform a single episode and corresponding gradient update. 457 | Return the total reward accumulated during the rollout. 458 | """ 459 | 460 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 461 | 462 | state = env.state 463 | 464 | for _ in range(episode_length): 465 | states.append(state) 466 | 467 | action, log_prob = self.pi.sample(state, 468 | sample_cutoff=sample_cutoff) 469 | actions.append(action) 470 | log_probs.append(log_prob) 471 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 472 | rewards.append(reward) 473 | next_states.append(state) 474 | 475 | if done: 476 | break 477 | 478 | next_states.append(env.state) 479 | 480 | v_loss = 0 481 | pi_loss = 0 482 | 483 | for state, action, reward, next_state, log_prob in zip( 484 | states, actions, rewards, next_states, log_probs): 485 | state = torch.FloatTensor(state).reshape(1, len(state)) 486 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 487 | with torch.no_grad(): 488 | v_target = float(reward) + self.gamma * self.v(next_state) 489 | td_error = v_target - self.v(state) 490 | v_loss += (v_target - self.v(state))**2 491 | pi_loss = pi_loss + td_error * log_prob 492 | 493 | v_loss = v_loss / len(states) 494 | pi_loss = pi_loss / len(states) 495 | pi_loss = -pi_loss 496 | 497 | self.pi_optim.zero_grad() 498 | self.v_optim.zero_grad() 499 | 500 | if self.grad_clip_radius is not None: 501 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 502 | self.grad_clip_radius) 503 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 504 | self.grad_clip_radius) 505 | 506 | self.v_optim.step() 507 | self.pi_optim.step() 508 | 509 | return np.mean(rewards) 510 | 511 | def train(self, env, num_episodes, rollout_length, 512 | output_dir, args_list, 513 | reset_env=True, 514 | sample_cutoff=100): 515 | """ 516 | Train on the environment. 517 | """ 518 | 519 | episode_mean_rewards = [] 520 | 521 | for i in range(num_episodes): 522 | if reset_env: 523 | env.reset() 524 | mean_reward = self.update(env, rollout_length, 525 | sample_cutoff=sample_cutoff) 526 | episode_mean_rewards.append(mean_reward) 527 | cbf = [float(elem) for elem in env.cbf(env.state)] 528 | print( 529 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 530 | 531 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 532 | np.save(rewards_filename, episode_mean_rewards) 533 | 534 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 535 | with open(hyperparams_filename, 'w') as f: 536 | yaml.dump(args_list, f) 537 | 538 | 539 | class VanillaACAgent: 540 | """ 541 | Agent for training the classic actor-critic algorithm. 542 | """ 543 | 544 | def __init__(self, 545 | ### agent parameters 546 | state_dim, action_dim, 547 | policy_lr, value_lr, discount_factor, 548 | enable_cuda=False, 549 | policy_optimizer=torch.optim.Adam, 550 | value_optimizer=torch.optim.Adam, 551 | grad_clip_radius=None, 552 | ### policy parameters 553 | simple_cov=True, 554 | policy_hidden_layer1_size=32, 555 | policy_hidden_layer2_size=32, 556 | policy_activation='relu', 557 | log_std_min=-20, log_std_max=3, 558 | weight_init_std=0.0001, 559 | # value function parameters 560 | value_hidden_layer1_size=32, 561 | value_hidden_layer2_size=32, 562 | value_activation='ReLU', 563 | cbf=None, 564 | num_log_prob_samples=None): 565 | 566 | assert action_dim == 1, "Action dimension must be 1 in this version" 567 | 568 | self.pi = GaussianPolicyTwoLayer( 569 | state_dim=state_dim, action_dim=action_dim, 570 | simple_cov=simple_cov, 571 | hidden_layer1_size=policy_hidden_layer1_size, 572 | hidden_layer2_size=policy_hidden_layer2_size, 573 | activation=policy_activation, 574 | log_std_min=log_std_min, log_std_max=log_std_max, 575 | weight_init_std=weight_init_std 576 | ) 577 | 578 | self.v = two_layer_net( 579 | input_dim=state_dim, output_dim=1, 580 | hidden_layer1_size=value_hidden_layer1_size, 581 | hidden_layer2_size=value_hidden_layer2_size, 582 | activation=value_activation, 583 | ) 584 | 585 | self.gamma = discount_factor 586 | 587 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 588 | self.grad_clip_radius = grad_clip_radius 589 | 590 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 591 | self.grad_clip_radius = grad_clip_radius 592 | 593 | self.__cuda_enabled = enable_cuda 594 | self.enable_cuda(self.__cuda_enabled, warn=False) 595 | # NOTE: self.device is defined when self.enable_cuda is called! 596 | 597 | @property 598 | def cuda_enabled(self): 599 | return self.__cuda_enabled 600 | 601 | def enable_cuda(self, enable_cuda=True, warn=True): 602 | """Enable or disable cuda and update models.""" 603 | 604 | if warn: 605 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 606 | "initializing optimizers can give errors when using " 607 | "optimizers other than SGD or Adam!") 608 | 609 | self.__cuda_enabled = enable_cuda 610 | self.device = torch.device( 611 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 612 | else 'cpu') 613 | self.pi.to(self.device) 614 | self.v.to(self.device) 615 | 616 | def load_models(self, filename, enable_cuda=True, continue_training=True): 617 | """Load policy and value functions. Copy them to target functions.""" 618 | 619 | models = torch.load(filename) 620 | 621 | self.pi.load_state_dict(models['pi_state_dict']) 622 | self.v.load_state_dict(models['v_state_dict']) 623 | 624 | if continue_training: 625 | self.pi.train() 626 | self.v.train() 627 | else: 628 | self.pi.eval() 629 | self.v.eval() 630 | 631 | self.enable_cuda(enable_cuda, warn=False) 632 | 633 | def save_checkpoint(self, filename): 634 | """Save state_dicts of models and optimizers.""" 635 | 636 | torch.save({ 637 | 'using_cuda': self.__cuda_enabled, 638 | 'pi_state_dict': self.pi.state_dict(), 639 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 640 | 'v_state_dict': self.v.state_dict(), 641 | 'v_optimizer_state_dict': self.v_optim.state_dict() 642 | }, filename) 643 | 644 | def load_checkpoint(self, filename, continue_training=True): 645 | """Load state_dicts for models and optimizers.""" 646 | 647 | checkpoint = torch.load(filename) 648 | 649 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 650 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 651 | self.v.load_state_dict(checkpoint['v_state_dict']) 652 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 653 | 654 | if continue_training: 655 | self.pi.train() 656 | self.v.train() 657 | 658 | else: 659 | self.pi.eval() 660 | self.v.eval() 661 | 662 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 663 | 664 | def update(self, env, episode_length, sample_cutoff=100): 665 | """ 666 | Perform a single episode and corresponding gradient update. 667 | Return the mean reward accumulated during the rollout. 668 | """ 669 | 670 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 671 | 672 | state = env.reset() 673 | 674 | for _ in range(episode_length): 675 | states.append(state) 676 | 677 | action, log_prob = self.pi.sample( 678 | torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 679 | ) 680 | actions.append(action) 681 | log_probs.append(log_prob) 682 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 683 | rewards.append(reward) 684 | next_states.append(state) 685 | if done: 686 | break 687 | 688 | next_states.append(env.state) 689 | 690 | v_loss = 0 691 | pi_loss = 0 692 | 693 | for state, action, reward, next_state, log_prob in zip( 694 | states, actions, rewards, next_states, log_probs): 695 | state = torch.FloatTensor(state).reshape(1, len(state)) 696 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 697 | with torch.no_grad(): 698 | v_target = float(reward) + self.gamma * self.v(next_state) 699 | td_error = v_target - self.v(state) 700 | v_loss += (v_target - self.v(state))**2 701 | pi_loss = pi_loss + td_error * log_prob 702 | 703 | pi_loss = pi_loss / len(states) 704 | v_loss = v_loss / len(states) 705 | pi_loss = -pi_loss 706 | 707 | self.pi_optim.zero_grad() 708 | self.v_optim.zero_grad() 709 | 710 | if self.grad_clip_radius is not None: 711 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 712 | self.grad_clip_radius) 713 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 714 | self.grad_clip_radius) 715 | 716 | self.v_optim.step() 717 | self.pi_optim.step() 718 | 719 | return np.mean(rewards) 720 | 721 | def train(self, env, num_episodes, rollout_length, 722 | output_dir, args_list, 723 | reset_env=True, 724 | sample_cutoff=100): 725 | """ 726 | Train on the environment. 727 | """ 728 | 729 | episode_mean_rewards = [] 730 | 731 | for i in range(num_episodes): 732 | if reset_env: 733 | env.reset() 734 | mean_reward = self.update(env, rollout_length, 735 | sample_cutoff=sample_cutoff) 736 | episode_mean_rewards.append(mean_reward) 737 | print( 738 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}') 739 | 740 | # rewards_filename = os.path.join(output_dir, 'episode_rewards') 741 | # np.save(rewards_filename, episode_mean_rewards) 742 | 743 | # hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 744 | # with open(hyperparams_filename, 'w') as f: 745 | # yaml.dump(args_list, f) 746 | -------------------------------------------------------------------------------- /agents/agents.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | import yaml 5 | from scipy import integrate 6 | from wesutils.utils import GaussianPolicyTwoLayer, two_layer_net 7 | from functools import reduce 8 | from operator import mul 9 | import warnings 10 | 11 | class GaussianPolicyCBF(GaussianPolicyTwoLayer): 12 | """ 13 | Modified version of the standard Gaussian policy for use in CBF- 14 | constrained settings. Provides access to the pdf of the policy at 15 | a given state as well as utilities for directly manipulating the 16 | parameters of the policy. Requires a CBF upon initilization. The 17 | CBF is assumed to accept torch tensor representations of the state. 18 | 19 | NOTE: This version of the policy assumes that action_dim=1 and that 20 | the CBF returns intervals. 21 | """ 22 | 23 | def __init__(self, cbf, state_dim, action_dim=1, 24 | simple_cov=True, 25 | hidden_layer1_size=32, 26 | hidden_layer2_size=32, 27 | activation='sigmoid', 28 | log_std_min=-20, log_std_max=3, 29 | weight_init_std=0.0001): 30 | 31 | assert action_dim == 1, "Action dimension must be 1" 32 | 33 | super().__init__( 34 | state_dim, action_dim, 35 | simple_cov=simple_cov, 36 | hidden_layer1_size=hidden_layer1_size, 37 | hidden_layer2_size=hidden_layer2_size, 38 | activation=activation, 39 | log_std_min=log_std_min, log_std_max=log_std_max, 40 | weight_init_std=weight_init_std 41 | ) 42 | 43 | self.cbf = cbf 44 | self.param_shape = tuple(p.shape for p in self.parameters()) 45 | self.param_size = sum(reduce(mul, shape) for shape in self.param_shape) 46 | 47 | def to(self, device): 48 | super().to(device) 49 | self.device = device 50 | 51 | @property 52 | def params(self): 53 | """Get policy model parameters.""" 54 | return torch.cat([p.data.reshape(-1) for p in self.parameters()]) 55 | 56 | @params.setter 57 | def params(self, new_values): 58 | """Set policy model parameters.""" 59 | assert new_values.size()[0] == self.param_size, "Error" 60 | 61 | index = 0 62 | 63 | for param in self.parameters(): 64 | size = reduce(mul, param.shape) 65 | block = new_values[index:index+size].reshape(param.shape) 66 | param.data.copy_(block) 67 | index += size 68 | 69 | def _numpy_original_pdf(self, state): 70 | """ 71 | Return numpy version of the pdf of the untruncated policy at the 72 | state provided. 73 | """ 74 | 75 | mean, cov = self.forward(state) 76 | mean, cov = mean.detach().numpy(), cov.detach().numpy() 77 | K = np.float_power((2 * np.pi)**len(mean) * np.linalg.det(cov), -0.5) 78 | inv = np.linalg.inv(cov) 79 | 80 | def pdf(action): 81 | return K * np.exp( 82 | -0.5 * (action - mean).dot(inv.dot(action - mean)) 83 | ).flatten() 84 | 85 | return pdf 86 | 87 | def _torch_original_pdf(self, state): 88 | """ 89 | Return torch version of the pdf of the untruncated policy at the 90 | state provided. Detach is not called, so all computations herein 91 | are reflected in the computation graph. 92 | """ 93 | 94 | # import pdb; pdb.set_trace() 95 | 96 | mean, cov = self.forward(state) 97 | K = torch.float_power((2 * np.pi)**len(mean) * torch.linalg.det(cov), -0.5) 98 | inv = torch.linalg.inv(cov).squeeze(dim=0) 99 | 100 | def pdf(action): 101 | return K * torch.exp(-0.5 * torch.matmul( 102 | action - mean, torch.matmul(inv, action - mean))) 103 | 104 | return pdf 105 | 106 | def get_numpy_pdf(self, state): 107 | """ 108 | Return the pdf of the original Gaussian pdf truncated to the set C(x). 109 | """ 110 | 111 | lb, ub = self.cbf(state=state) 112 | original_pdf = self._numpy_original_pdf(state) 113 | normalization = integrate.quad(original_pdf, lb, ub)[0] 114 | 115 | def pdf(action): 116 | return original_pdf(action) / normalization if lb <= action <= ub \ 117 | else 0 118 | 119 | return pdf 120 | 121 | def sample(self, state, sample_cutoff=100, 122 | no_log_prob=False, num_log_prob_samples=1000): 123 | """ 124 | Repeatedly sample from the original Gaussian policy until an action 125 | lying within the CBF constraint set is generated. 126 | 127 | sample_cutoff specifies the number of times to sample using the 128 | original Gaussian policy before simply uniformly generating an 129 | action from the CBF constraint set. 130 | """ 131 | 132 | lb, ub = self.cbf(state=state) 133 | lb = float(lb) 134 | ub = float(ub) 135 | state = torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 136 | orig_pdf = self._torch_original_pdf(state) 137 | 138 | if not no_log_prob: 139 | actions = lb + (ub - lb) * torch.rand( 140 | num_log_prob_samples, self.action_dim, 1, device=self.device 141 | ) 142 | log_prob = orig_pdf(actions).sum().log() # log pi_theta (C(x) | x) 143 | 144 | for _ in range(sample_cutoff): 145 | 146 | action, orig_log_prob = super().sample(state) 147 | 148 | if lb <= action <= ub: 149 | log_prob = orig_log_prob - log_prob # log pi^C_theta 150 | # = log pi_theta - log pi_theta(C(x) | x) 151 | return action, log_prob if not no_log_prob else action 152 | 153 | action = lb + (ub - lb) * torch.rand(1, 1) 154 | orig_log_prob = orig_pdf(action).log() 155 | return action, (orig_log_prob - log_prob) if not no_log_prob else action 156 | 157 | 158 | class CBFREINFORCEAgent: 159 | """ 160 | Agent for training a CBF-constrained version of the classic REINFORCE 161 | algorithm. 162 | """ 163 | 164 | def __init__(self, 165 | ### agent parameters 166 | state_dim, action_dim, cbf, 167 | policy_lr, discount_factor, 168 | num_log_prob_samples=1000, 169 | enable_cuda=False, 170 | optimizer=torch.optim.Adam, 171 | grad_clip_radius=None, 172 | ### policy parameters 173 | simple_cov=True, 174 | hidden_layer1_size=32, 175 | hidden_layer2_size=32, 176 | activation='relu', 177 | log_std_min=-20, log_std_max=3, 178 | weight_init_std=0.0001): 179 | 180 | assert action_dim == 1, "Action dimension must be 1 in this version" 181 | 182 | self.pi = GaussianPolicyCBF( 183 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 184 | simple_cov=simple_cov, 185 | hidden_layer1_size=hidden_layer1_size, 186 | hidden_layer2_size=hidden_layer2_size, 187 | activation=activation, 188 | log_std_min=log_std_min, log_std_max=log_std_max, 189 | weight_init_std=weight_init_std 190 | ) 191 | 192 | self.gamma = discount_factor 193 | 194 | self.pi_optim = optimizer(self.pi.parameters(), lr=policy_lr) 195 | self.grad_clip_radius = grad_clip_radius 196 | 197 | self.__cuda_enabled = enable_cuda 198 | self.enable_cuda(self.__cuda_enabled, warn=False) 199 | # NOTE: self.device is defined when self.enable_cuda is called! 200 | 201 | @property 202 | def cuda_enabled(self): 203 | return self.__cuda_enabled 204 | 205 | def enable_cuda(self, enable_cuda=True, warn=True): 206 | """Enable or disable cuda and update models.""" 207 | 208 | if warn: 209 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 210 | "initializing optimizers can give errors when using " 211 | "optimizers other than SGD or Adam!") 212 | 213 | self.__cuda_enabled = enable_cuda 214 | self.device = torch.device( 215 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 216 | else 'cpu') 217 | self.pi.to(self.device) 218 | 219 | def load_models(self, filename, enable_cuda=True, continue_training=True): 220 | """Load policy and value functions. Copy them to target functions.""" 221 | 222 | models = torch.load(filename) 223 | 224 | self.pi.load_state_dict(models['pi_state_dict']) 225 | 226 | if continue_training: 227 | self.pi.train() 228 | else: 229 | self.pi.eval() 230 | 231 | self.enable_cuda(enable_cuda, warn=False) 232 | 233 | def save_checkpoint(self, filename): 234 | """Save state_dicts of models and optimizers.""" 235 | 236 | torch.save({ 237 | 'using_cuda': self.__cuda_enabled, 238 | 'pi_state_dict': self.pi.state_dict(), 239 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 240 | }, filename) 241 | 242 | def load_checkpoint(self, filename, continue_training=True): 243 | """Load state_dicts for models and optimizers.""" 244 | 245 | checkpoint = torch.load(filename) 246 | 247 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 248 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 249 | 250 | if continue_training: 251 | self.pi.train() 252 | 253 | else: 254 | self.pi.eval() 255 | 256 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 257 | 258 | def update(self, env, rollout_length, sample_cutoff=100): 259 | """ 260 | Perform a single rollout and corresponding gradient update. 261 | Return the total reward accumulated during the rollout. 262 | """ 263 | 264 | rewards, log_probs = [], [] 265 | num_steps = 0 266 | 267 | state = env.state 268 | 269 | for _ in range(rollout_length): 270 | action, log_prob = self.pi.sample(state, 271 | sample_cutoff=sample_cutoff) 272 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 273 | rewards.append(float(reward)) 274 | log_probs.append(log_prob) 275 | 276 | if done: 277 | break 278 | 279 | num_steps += 1 280 | 281 | G = 0 282 | pi_loss = 0 283 | 284 | for i in range(len(rewards) - 1, -1, -1): 285 | G = rewards[i] + self.gamma * G 286 | pi_loss = pi_loss + (self.gamma ** i) * G * log_probs[i] 287 | 288 | pi_loss = -pi_loss 289 | 290 | self.pi_optim.zero_grad() 291 | pi_loss.backward() 292 | if self.grad_clip_radius is not None: 293 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 294 | self.grad_clip_radius) 295 | self.pi_optim.step() 296 | 297 | return np.mean(rewards) 298 | 299 | def train(self, env, num_episodes, rollout_length, 300 | output_dir, args_list, 301 | reset_env=True, 302 | sample_cutoff=100): 303 | """ 304 | Train on the environment. 305 | """ 306 | 307 | episode_mean_rewards = [] 308 | 309 | for i in range(num_episodes): 310 | if reset_env: 311 | env.reset() 312 | mean_reward = self.update(env, rollout_length, 313 | sample_cutoff=sample_cutoff) 314 | cbf = [float(elem) for elem in env.cbf(env.state)] 315 | print( 316 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 317 | episode_mean_rewards.append(mean_reward) 318 | 319 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 320 | np.save(rewards_filename, episode_mean_rewards) 321 | 322 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 323 | with open(hyperparams_filename, 'w') as f: 324 | yaml.dump(args_list, f) 325 | 326 | 327 | 328 | class CBFACAgent: 329 | """ 330 | Agent for training a CBF-constrained version of the classic actor-critic 331 | algorithm. 332 | """ 333 | 334 | def __init__(self, 335 | ### agent parameters 336 | state_dim, action_dim, 337 | policy_lr, value_lr, discount_factor, 338 | cbf=None, 339 | num_log_prob_samples=1000, 340 | enable_cuda=False, 341 | policy_optimizer=torch.optim.Adam, 342 | value_optimizer=torch.optim.Adam, 343 | grad_clip_radius=None, 344 | ### policy parameters 345 | simple_cov=True, 346 | policy_hidden_layer1_size=32, 347 | policy_hidden_layer2_size=32, 348 | policy_activation='relu', 349 | log_std_min=-20, log_std_max=3, 350 | weight_init_std=0.0001, 351 | # value function parameters 352 | value_hidden_layer1_size=32, 353 | value_hidden_layer2_size=32, 354 | value_activation='ReLU'): 355 | 356 | assert action_dim == 1, "Action dimension must be 1 in this version" 357 | 358 | self.pi = GaussianPolicyCBF( 359 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 360 | simple_cov=simple_cov, 361 | hidden_layer1_size=policy_hidden_layer1_size, 362 | hidden_layer2_size=policy_hidden_layer2_size, 363 | activation=policy_activation, 364 | log_std_min=log_std_min, log_std_max=log_std_max, 365 | weight_init_std=weight_init_std 366 | ) 367 | 368 | self.v = two_layer_net( 369 | input_dim=state_dim, output_dim=1, 370 | hidden_layer1_size=value_hidden_layer1_size, 371 | hidden_layer2_size=value_hidden_layer2_size, 372 | activation=value_activation, 373 | ) 374 | 375 | self.gamma = discount_factor 376 | 377 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 378 | self.grad_clip_radius = grad_clip_radius 379 | 380 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 381 | self.grad_clip_radius = grad_clip_radius 382 | 383 | self.__cuda_enabled = enable_cuda 384 | self.enable_cuda(self.__cuda_enabled, warn=False) 385 | # NOTE: self.device is defined when self.enable_cuda is called! 386 | 387 | @property 388 | def cuda_enabled(self): 389 | return self.__cuda_enabled 390 | 391 | def enable_cuda(self, enable_cuda=True, warn=True): 392 | """Enable or disable cuda and update models.""" 393 | 394 | if warn: 395 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 396 | "initializing optimizers can give errors when using " 397 | "optimizers other than SGD or Adam!") 398 | 399 | self.__cuda_enabled = enable_cuda 400 | self.device = torch.device( 401 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 402 | else 'cpu') 403 | self.pi.to(self.device) 404 | self.v.to(self.device) 405 | 406 | def load_models(self, filename, enable_cuda=True, continue_training=True): 407 | """Load policy and value functions. Copy them to target functions.""" 408 | 409 | models = torch.load(filename) 410 | 411 | self.pi.load_state_dict(models['pi_state_dict']) 412 | self.v.load_state_dict(models['v_state_dict']) 413 | 414 | if continue_training: 415 | self.pi.train() 416 | self.v.train() 417 | else: 418 | self.pi.eval() 419 | self.v.eval() 420 | 421 | self.enable_cuda(enable_cuda, warn=False) 422 | 423 | def save_checkpoint(self, filename): 424 | """Save state_dicts of models and optimizers.""" 425 | 426 | torch.save({ 427 | 'using_cuda': self.__cuda_enabled, 428 | 'pi_state_dict': self.pi.state_dict(), 429 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 430 | 'v_state_dict': self.v.state_dict(), 431 | 'v_optimizer_state_dict': self.v_optim.state_dict() 432 | }, filename) 433 | 434 | def load_checkpoint(self, filename, continue_training=True): 435 | """Load state_dicts for models and optimizers.""" 436 | 437 | checkpoint = torch.load(filename) 438 | 439 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 440 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 441 | self.v.load_state_dict(checkpoint['v_state_dict']) 442 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 443 | 444 | if continue_training: 445 | self.pi.train() 446 | self.v.train() 447 | 448 | else: 449 | self.pi.eval() 450 | self.v.eval() 451 | 452 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 453 | 454 | def update(self, env, episode_length, sample_cutoff=100): 455 | """ 456 | Perform a single episode and corresponding gradient update. 457 | Return the total reward accumulated during the rollout. 458 | """ 459 | 460 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 461 | 462 | state = env.state 463 | 464 | for _ in range(episode_length): 465 | states.append(state) 466 | 467 | action, log_prob = self.pi.sample(state, 468 | sample_cutoff=sample_cutoff) 469 | actions.append(action) 470 | log_probs.append(log_prob) 471 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 472 | rewards.append(reward) 473 | next_states.append(state) 474 | 475 | if done: 476 | break 477 | 478 | next_states.append(env.state) 479 | 480 | v_loss = 0 481 | pi_loss = 0 482 | 483 | for state, action, reward, next_state, log_prob in zip( 484 | states, actions, rewards, next_states, log_probs): 485 | state = torch.FloatTensor(state).reshape(1, len(state)) 486 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 487 | with torch.no_grad(): 488 | v_target = float(reward) + self.gamma * self.v(next_state) 489 | td_error = v_target - self.v(state) 490 | v_loss += (v_target - self.v(state))**2 491 | pi_loss = pi_loss + td_error * log_prob 492 | 493 | v_loss = v_loss / len(states) 494 | pi_loss = pi_loss / len(states) 495 | pi_loss = -pi_loss 496 | 497 | self.pi_optim.zero_grad() 498 | self.v_optim.zero_grad() 499 | 500 | if self.grad_clip_radius is not None: 501 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 502 | self.grad_clip_radius) 503 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 504 | self.grad_clip_radius) 505 | 506 | self.v_optim.step() 507 | self.pi_optim.step() 508 | 509 | return np.mean(rewards) 510 | 511 | def train(self, env, num_episodes, rollout_length, 512 | output_dir, args_list, 513 | reset_env=True, 514 | sample_cutoff=100): 515 | """ 516 | Train on the environment. 517 | """ 518 | 519 | episode_mean_rewards = [] 520 | 521 | for i in range(num_episodes): 522 | if reset_env: 523 | env.reset() 524 | mean_reward = self.update(env, rollout_length, 525 | sample_cutoff=sample_cutoff) 526 | episode_mean_rewards.append(mean_reward) 527 | cbf = [float(elem) for elem in env.cbf(env.state)] 528 | print( 529 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 530 | 531 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 532 | np.save(rewards_filename, episode_mean_rewards) 533 | 534 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 535 | with open(hyperparams_filename, 'w') as f: 536 | yaml.dump(args_list, f) 537 | 538 | 539 | class VanillaACAgent: 540 | """ 541 | Agent for training the classic actor-critic algorithm. 542 | """ 543 | 544 | def __init__(self, 545 | ### agent parameters 546 | state_dim, action_dim, 547 | policy_lr, value_lr, discount_factor, 548 | enable_cuda=False, 549 | policy_optimizer=torch.optim.Adam, 550 | value_optimizer=torch.optim.Adam, 551 | grad_clip_radius=None, 552 | ### policy parameters 553 | simple_cov=True, 554 | policy_hidden_layer1_size=32, 555 | policy_hidden_layer2_size=32, 556 | policy_activation='relu', 557 | log_std_min=-20, log_std_max=3, 558 | weight_init_std=0.0001, 559 | # value function parameters 560 | value_hidden_layer1_size=32, 561 | value_hidden_layer2_size=32, 562 | value_activation='ReLU', 563 | cbf=None, 564 | num_log_prob_samples=None): 565 | 566 | assert action_dim == 1, "Action dimension must be 1 in this version" 567 | 568 | self.pi = GaussianPolicyTwoLayer( 569 | state_dim=state_dim, action_dim=action_dim, 570 | simple_cov=simple_cov, 571 | hidden_layer1_size=policy_hidden_layer1_size, 572 | hidden_layer2_size=policy_hidden_layer2_size, 573 | activation=policy_activation, 574 | log_std_min=log_std_min, log_std_max=log_std_max, 575 | weight_init_std=weight_init_std 576 | ) 577 | 578 | self.v = two_layer_net( 579 | input_dim=state_dim, output_dim=1, 580 | hidden_layer1_size=value_hidden_layer1_size, 581 | hidden_layer2_size=value_hidden_layer2_size, 582 | activation=value_activation, 583 | ) 584 | 585 | self.gamma = discount_factor 586 | 587 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 588 | self.grad_clip_radius = grad_clip_radius 589 | 590 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 591 | self.grad_clip_radius = grad_clip_radius 592 | 593 | self.__cuda_enabled = enable_cuda 594 | self.enable_cuda(self.__cuda_enabled, warn=False) 595 | # NOTE: self.device is defined when self.enable_cuda is called! 596 | 597 | @property 598 | def cuda_enabled(self): 599 | return self.__cuda_enabled 600 | 601 | def enable_cuda(self, enable_cuda=True, warn=True): 602 | """Enable or disable cuda and update models.""" 603 | 604 | if warn: 605 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 606 | "initializing optimizers can give errors when using " 607 | "optimizers other than SGD or Adam!") 608 | 609 | self.__cuda_enabled = enable_cuda 610 | self.device = torch.device( 611 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 612 | else 'cpu') 613 | self.pi.to(self.device) 614 | self.v.to(self.device) 615 | 616 | def load_models(self, filename, enable_cuda=True, continue_training=True): 617 | """Load policy and value functions. Copy them to target functions.""" 618 | 619 | models = torch.load(filename) 620 | 621 | self.pi.load_state_dict(models['pi_state_dict']) 622 | self.v.load_state_dict(models['v_state_dict']) 623 | 624 | if continue_training: 625 | self.pi.train() 626 | self.v.train() 627 | else: 628 | self.pi.eval() 629 | self.v.eval() 630 | 631 | self.enable_cuda(enable_cuda, warn=False) 632 | 633 | def save_checkpoint(self, filename): 634 | """Save state_dicts of models and optimizers.""" 635 | 636 | torch.save({ 637 | 'using_cuda': self.__cuda_enabled, 638 | 'pi_state_dict': self.pi.state_dict(), 639 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 640 | 'v_state_dict': self.v.state_dict(), 641 | 'v_optimizer_state_dict': self.v_optim.state_dict() 642 | }, filename) 643 | 644 | def load_checkpoint(self, filename, continue_training=True): 645 | """Load state_dicts for models and optimizers.""" 646 | 647 | checkpoint = torch.load(filename) 648 | 649 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 650 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 651 | self.v.load_state_dict(checkpoint['v_state_dict']) 652 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 653 | 654 | if continue_training: 655 | self.pi.train() 656 | self.v.train() 657 | 658 | else: 659 | self.pi.eval() 660 | self.v.eval() 661 | 662 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 663 | 664 | def update(self, env, episode_length, sample_cutoff=100): 665 | """ 666 | Perform a single episode and corresponding gradient update. 667 | Return the mean reward accumulated during the rollout. 668 | """ 669 | 670 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 671 | 672 | state = env.reset() 673 | 674 | for _ in range(episode_length): 675 | states.append(state) 676 | 677 | action, log_prob = self.pi.sample( 678 | torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 679 | ) 680 | actions.append(action) 681 | log_probs.append(log_prob) 682 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 683 | rewards.append(reward) 684 | next_states.append(state) 685 | if done: 686 | break 687 | 688 | next_states.append(env.state) 689 | 690 | v_loss = 0 691 | pi_loss = 0 692 | 693 | for state, action, reward, next_state, log_prob in zip( 694 | states, actions, rewards, next_states, log_probs): 695 | state = torch.FloatTensor(state).reshape(1, len(state)) 696 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 697 | with torch.no_grad(): 698 | v_target = float(reward) + self.gamma * self.v(next_state) 699 | td_error = v_target - self.v(state) 700 | v_loss += (v_target - self.v(state))**2 701 | pi_loss = pi_loss + td_error * log_prob 702 | 703 | pi_loss = pi_loss / len(states) 704 | v_loss = v_loss / len(states) 705 | pi_loss = -pi_loss 706 | 707 | self.pi_optim.zero_grad() 708 | self.v_optim.zero_grad() 709 | 710 | if self.grad_clip_radius is not None: 711 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 712 | self.grad_clip_radius) 713 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 714 | self.grad_clip_radius) 715 | 716 | self.v_optim.step() 717 | self.pi_optim.step() 718 | 719 | return np.mean(rewards) 720 | 721 | def train(self, env, num_episodes, rollout_length, 722 | output_dir, args_list, 723 | reset_env=True, 724 | sample_cutoff=100): 725 | """ 726 | Train on the environment. 727 | """ 728 | 729 | episode_mean_rewards = [] 730 | 731 | for i in range(num_episodes): 732 | if reset_env: 733 | env.reset() 734 | mean_reward = self.update(env, rollout_length, 735 | sample_cutoff=sample_cutoff) 736 | episode_mean_rewards.append(mean_reward) 737 | print( 738 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}') 739 | 740 | # rewards_filename = os.path.join(output_dir, 'episode_rewards') 741 | # np.save(rewards_filename, episode_mean_rewards) 742 | 743 | # hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 744 | # with open(hyperparams_filename, 'w') as f: 745 | # yaml.dump(args_list, f) 746 | -------------------------------------------------------------------------------- /Pendulum/agents/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributions as td 4 | from torch.nn import functional as F 5 | import gym 6 | from gym import spaces 7 | import numpy as np 8 | from typing import NamedTuple 9 | import warnings 10 | from matplotlib import pyplot as plt 11 | 12 | from stable_baselines3.common.utils import obs_as_tensor 13 | from stable_baselines3.common.preprocessing import ( 14 | get_obs_shape, get_action_dim 15 | ) 16 | 17 | 18 | #from agents.TruncatedNormal import TruncatedNormal as tn 19 | 20 | from wesutils import two_layer_net 21 | 22 | class RolloutBufferSamples(NamedTuple): 23 | observations: torch.Tensor 24 | actions: torch.Tensor 25 | old_values: torch.Tensor 26 | old_log_prob: torch.Tensor 27 | advantages: torch.Tensor 28 | returns: torch.Tensor 29 | 30 | 31 | class RolloutBuffer: 32 | 33 | def __init__(self, 34 | buffer_size, 35 | observation_space, 36 | action_space, 37 | gamma=0.99, 38 | device='cpu'): 39 | 40 | self.buffer_size = buffer_size 41 | self.observation_space = observation_space 42 | self.action_space = action_space 43 | self.gamma = gamma 44 | self.device = device 45 | self.obs_shape = get_obs_shape(self.observation_space) 46 | self.action_dim = get_action_dim(self.action_space) 47 | 48 | self.reset() 49 | 50 | def reset(self): 51 | 52 | self.observations = np.zeros( 53 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 54 | ) 55 | self.actions = np.zeros( 56 | (self.buffer_size, self.action_dim), dtype=np.float32 57 | ) 58 | self.rewards = np.zeros( 59 | (self.buffer_size,), dtype=np.float32 60 | ) 61 | self.episode_starts = np.zeros( 62 | (self.buffer_size,), dtype=np.float32 63 | ) 64 | self.values = np.zeros( 65 | (self.buffer_size,), dtype=np.float32 66 | ) 67 | self.log_probs = np.zeros( 68 | (self.buffer_size,), dtype=np.float32 69 | ) 70 | self.advantages = np.zeros( 71 | (self.buffer_size,), dtype=np.float32 72 | ) 73 | 74 | self.full = False 75 | self.pos = 0 76 | 77 | def compute_returns_and_advantage(self, last_value, done): 78 | 79 | last_value = last_value.clone().cpu().numpy().flatten() 80 | 81 | discounted_reward = 0 82 | for step in reversed(range(self.buffer_size)): 83 | if step == self.buffer_size - 1: 84 | next_non_terminal = 1.0 - done 85 | next_value = last_value 86 | else: 87 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 88 | next_value = self.values[step + 1] 89 | discounted_reward = self.rewards[step] + \ 90 | self.gamma * discounted_reward * next_non_terminal 91 | self.advantages[step] = discounted_reward - self.values[step] 92 | self.returns = self.advantages + self.values 93 | 94 | def add(self, obs, action, reward, episode_start, value, log_prob): 95 | 96 | if len(log_prob.shape) == 0: 97 | log_prob = log_prob.reshape(-1, 1) 98 | 99 | if isinstance(self.observation_space, spaces.Discrete): 100 | obs = obs.reshape((1,) + self.obs_shape) 101 | 102 | self.observations[self.pos] = np.array(obs).copy() 103 | self.actions[self.pos] = np.array(action).copy() 104 | self.rewards[self.pos] = np.array(reward).copy() 105 | self.episode_starts[self.pos] = np.array(episode_start).copy() 106 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 107 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 108 | self.pos += 1 109 | if self.pos == self.buffer_size: 110 | self.full = True 111 | 112 | def get(self, batch_size=None): 113 | assert self.full, "" 114 | indices = np.random.permutation(self.buffer_size) 115 | 116 | # Return everything, don't create minibatches 117 | if batch_size is None: 118 | batch_size = self.buffer_size 119 | 120 | start_idx = 0 121 | while start_idx < self.buffer_size: 122 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 123 | start_idx += batch_size 124 | 125 | def _get_samples(self, batch_inds): 126 | data = ( 127 | self.observations[batch_inds], 128 | self.actions[batch_inds], 129 | self.values[batch_inds].flatten(), 130 | self.log_probs[batch_inds].flatten(), 131 | self.advantages[batch_inds].flatten(), 132 | self.returns[batch_inds].flatten(), 133 | ) 134 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 135 | 136 | def to_torch(self, array, copy=True): 137 | if copy: 138 | return torch.tensor(array).to(self.device) 139 | return torch.as_tensor(array).to(self.device) 140 | 141 | 142 | class PolicyNetwork(nn.Module): 143 | """Base class for stochastic policy networks.""" 144 | 145 | def __init__(self): 146 | super().__init__() 147 | 148 | def forward(self, state): 149 | """Take state as input, then output the parameters of the policy.""" 150 | 151 | raise NotImplemented("forward not implemented.") 152 | 153 | def sample(self, state): 154 | """ 155 | Sample an action based on the model parameters given the current state. 156 | """ 157 | 158 | raise NotImplemented("sample not implemented.") 159 | 160 | def log_probs(self, obs, actions): 161 | """ 162 | Return log probabilities for each state-action pair. 163 | """ 164 | 165 | raise NotImplemented("log_probs not implemented.") 166 | 167 | def entropy(self, obs): 168 | """ 169 | Return entropy of the policy for each state. 170 | """ 171 | 172 | raise NotImplemented("entropy not implemented.") 173 | 174 | 175 | class GaussianPolicyBase(PolicyNetwork): 176 | """ 177 | Base class for Gaussian policy. 178 | 179 | Desired network needs to be implemented. 180 | """ 181 | 182 | def __init__(self, action_dim): 183 | 184 | super().__init__() 185 | 186 | self.action_dim = action_dim 187 | 188 | def _get_covs(self, log_stds): 189 | batch_size = log_stds.shape[0] 190 | stds = log_stds.exp().reshape(batch_size, 1, 1) 191 | covs = stds * torch.eye(self.action_dim).repeat(batch_size, 1, 1) 192 | return covs 193 | 194 | def sample(self, obs, no_log_prob=False): 195 | mean, log_std = self.forward(obs) 196 | cov = log_std.exp() * torch.eye(self.action_dim) 197 | dist = td.MultivariateNormal(mean, cov) 198 | action = dist.rsample() 199 | return action if no_log_prob else (action, dist.log_prob(action)) 200 | 201 | def log_probs(self, obs, actions): 202 | means, log_stds = self.forward(obs) 203 | covs = self._get_covs(log_stds) 204 | dists = td.MultivariateNormal(means, covs) 205 | return dists.log_prob(actions) 206 | 207 | def entropy(self, obs): 208 | means, log_stds = self.forward(obs) 209 | covs = self._get_covs(log_stds) 210 | dists = td.MultivariateNormal(means, covs) 211 | return dists.entropy() 212 | 213 | 214 | class GaussianPolicy(GaussianPolicyBase): 215 | """ 216 | Gaussian policy using a two-layer, two-headed MLP with ReLU activation. 217 | """ 218 | 219 | def __init__(self, obs_dim, action_dim, 220 | min_action_val=-1.0, 221 | max_action_val=1.0, 222 | hidden_layer1_size=64, 223 | hidden_layer2_size=64): 224 | 225 | super().__init__(action_dim) 226 | 227 | self.base_net = nn.Sequential( 228 | nn.Linear(obs_dim, hidden_layer1_size), 229 | nn.ReLU(), 230 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 231 | nn.ReLU(), 232 | ) 233 | 234 | self.mean_net = nn.Sequential( 235 | nn.Linear(hidden_layer2_size, action_dim), 236 | nn.Hardtanh(min_action_val, max_action_val), 237 | ) 238 | 239 | self.log_std_net = nn.Sequential( 240 | nn.Linear(hidden_layer2_size, 1), 241 | ) 242 | 243 | def forward(self, obs): 244 | x = self.base_net(obs) 245 | mean = self.mean_net(x) 246 | log_std = self.log_std_net(x) 247 | return mean, log_std 248 | 249 | 250 | # class TruncatedNormalPolicyBase(PolicyNetwork): 251 | # """ 252 | # Base class for TruncatedNormal policy. Action dimension must be 1. 253 | # Uses a function (e.g., a CBF) to convert observations into bounds for 254 | # the TruncatedNormal distribution. 255 | 256 | # Desired network needs to be implemented. 257 | # """ 258 | 259 | # def __init__(self, constraint_fn): 260 | 261 | # super().__init__() 262 | 263 | # self.constraint_fn = self._vectorize_f(constraint_fn) 264 | 265 | # def _get_dist(self, obs): 266 | # mean, log_std = self.forward(obs) 267 | # std = log_std.exp() 268 | # lb, ub = self.constraint_fn(obs) 269 | # dist = tn(mean, std, lb, ub) 270 | 271 | # return dist 272 | 273 | # def sample(self, obs, no_log_prob=False): 274 | # dist = self._get_dist(obs) 275 | # lb = dist.scale * dist.a + dist.loc 276 | # ub = dist.scale * dist.b + dist.loc 277 | # action = dist.rsample() 278 | 279 | # for _ in range(100): 280 | # if (action > lb) and (action < ub): 281 | # break 282 | # if (action < lb) or (action > ub): 283 | # print("OOPS! Resampling...") 284 | # action = lb + (ub - lb) * torch.rand(size=(1, 1), requires_grad=True) 285 | 286 | # try: 287 | # log_prob = dist.log_prob(action) 288 | # except ValueError: 289 | # import pdb; pdb.set_trace() 290 | 291 | # return action if no_log_prob else (action, log_prob) 292 | 293 | # def log_probs(self, obs, actions): 294 | # dists = self._get_dist(obs) 295 | # return dists.log_prob(actions).flatten() 296 | 297 | # def entropy(self, obs): 298 | # dists = self._get_dist(obs) 299 | # return dists.entropy 300 | 301 | # def _vectorize_f(self, f): 302 | # """ 303 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 304 | # scalars into a vectorized function accepting batches of 305 | # torch tensorized arrays and output pairs of torch tensors. 306 | # """ 307 | 308 | # def vectorized_cbf(obs): 309 | 310 | # obs = obs.cpu().detach().numpy() 311 | # lbs, ubs = [], [] 312 | 313 | # if len(obs.shape) == 1: 314 | # batch_size = 1 315 | # lb, ub = f(obs) 316 | # lbs.append(lb) 317 | # ubs.append(ub) 318 | 319 | # else: 320 | # batch_size = obs.shape[0] 321 | # for i in range(batch_size): 322 | # lb, ub = f(obs[i]) 323 | # lbs.append(lb) 324 | # ubs.append(ub) 325 | 326 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) 327 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) 328 | 329 | # return lbs, ubs 330 | 331 | # return vectorized_cbf 332 | 333 | 334 | # class TruncatedNormalPolicy(TruncatedNormalPolicyBase): 335 | # """ 336 | # TruncatedNormal policy using a two-layer, two-headed MLP with ReLU 337 | # activation. Action dimension must be 1. 338 | # """ 339 | 340 | # def __init__(self, obs_dim, constraint_fn, 341 | # hidden_layer1_size=64, 342 | # hidden_layer2_size=64, 343 | # mean_min=-np.inf, 344 | # mean_max=np.inf, 345 | # log_std_lb=-10, 346 | # log_std_ub=3): 347 | 348 | # super().__init__(constraint_fn) 349 | 350 | # self.base_net = nn.Sequential( 351 | # nn.Linear(obs_dim, hidden_layer1_size), 352 | # nn.ReLU(), 353 | # nn.Linear(hidden_layer1_size, hidden_layer2_size), 354 | # nn.ReLU(), 355 | # ) 356 | 357 | # self.mean_net = nn.Sequential( 358 | # nn.Linear(hidden_layer2_size, 1), 359 | # # nn.Hardtanh(min_val=mean_min, max_val=mean_max) 360 | # ) 361 | 362 | # self.log_std_net = nn.Sequential( 363 | # nn.Linear(hidden_layer2_size, 1), 364 | # # nn.Hardtanh(min_val=log_std_lb, max_val=log_std_ub) 365 | # ) 366 | 367 | # def init_weights(m): 368 | # if isinstance(m, nn.Linear): 369 | # torch.nn.init.normal_(m.weight, std=1.0) 370 | 371 | # self.base_net.apply(init_weights) 372 | # self.mean_net.apply(init_weights) 373 | # self.log_std_net.apply(init_weights) 374 | 375 | # def forward(self, obs): 376 | # x = self.base_net(obs) 377 | # mean = self.mean_net(x) 378 | # log_std = self.log_std_net(x) 379 | # return mean, log_std 380 | 381 | 382 | class BetaPolicyBase(PolicyNetwork): 383 | """ 384 | Base class for Beta policy. 385 | 386 | Desired network needs to be implemented. 387 | """ 388 | 389 | def __init__(self, constraint_fn, action_dim, enable_cuda=False): 390 | 391 | super().__init__() 392 | 393 | self.device = torch.device( 394 | 'cuda' if torch.cuda.is_available() and enable_cuda \ 395 | else 'cpu') 396 | self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 397 | self.action_dim = action_dim 398 | 399 | def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 400 | """ 401 | Converts a function f defined on 1D numpy arrays and outputting pairs of 402 | scalars into a vectorized function accepting batches of 403 | torch tensorized arrays and output pairs of torch tensors. 404 | """ 405 | 406 | def vectorized_f(obs, action_dim): #--vipul :added action_dim 407 | 408 | obs = obs.cpu().detach().numpy() 409 | 410 | if len(obs.shape) == 1: # check to see if obs is a batch or single obs 411 | batch_size = 1 412 | lbs, ubs = f(obs) 413 | lbs=np.array(lbs) 414 | ubs=np.array(ubs) 415 | #lbs = -5 416 | #ubs = 5 417 | 418 | else: 419 | batch_size = obs.shape[0] 420 | lbs = np.zeros([batch_size, self.action_dim]) 421 | ubs = np.zeros([batch_size, self.action_dim]) 422 | for i in range(batch_size): 423 | lbs[i], ubs[i] = f(obs[i]) 424 | #lbs[i] = -5 425 | #ubs[i] = 5 426 | 427 | lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 428 | ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 429 | lbs = lbs.to(self.device) 430 | ubs = ubs.to(self.device) 431 | 432 | return lbs, ubs 433 | 434 | return vectorized_f 435 | 436 | def sample(self, obs, action_dim, no_log_prob=False): 437 | """ 438 | Sample from independent Beta distributions across each action_dim. 439 | """ 440 | 441 | assert len(obs.shape) == 1, 'obs must be a flat array' 442 | 443 | alphas, betas = self.forward(obs) 444 | alphas, betas = torch.flatten(alphas), torch.flatten(betas) 445 | dists = [ 446 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 447 | ] 448 | 449 | #from original implementation 450 | #to see if everything works well! 451 | dist_orig = td.Beta(alphas, betas) 452 | action_orig = dist_orig.rsample() 453 | log_prob_orig = dist_orig.log_prob(action_orig) 454 | #from original implementation 455 | 456 | action_along_dims = [dist.rsample() for dist in dists] 457 | action = torch.tensor(action_along_dims, requires_grad=True) 458 | #action = torch.tensor(action_along_dims) 459 | log_prob = torch.sum(torch.tensor([ 460 | dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 461 | ], requires_grad=True)) 462 | # log_prob = torch.sum(torch.tensor([ 463 | # dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 464 | # ])) 465 | 466 | lb, ub = self.constraint_fn(obs, action_dim) 467 | action = lb + (ub - lb) * action 468 | 469 | #below line commented out 470 | #return action if no_log_prob else (action, log_prob) 471 | 472 | 473 | #new lines added - 08.29.2023 474 | action_orig = lb + (ub - lb) * action_orig 475 | return action_orig if no_log_prob else (action_orig, log_prob_orig) 476 | 477 | def log_probs(self, obs, actions, action_dim): 478 | alphas_arr, betas_arr = self.forward(obs) 479 | #adding one line 480 | dists = td.Beta(alphas_arr, betas_arr) 481 | 482 | #commenting the below lines 08.26.2023 483 | dists_new = [] 484 | for i in range(alphas_arr.shape[0]): 485 | alphas = alphas_arr[i] 486 | betas = betas_arr[i] 487 | dists_new.append([ 488 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 489 | ]) 490 | 491 | lbs, ubs = self.constraint_fn(obs, action_dim) 492 | # if lbs.device!=actions.device: 493 | # lbs = lbs.to('cuda:0') 494 | # ubs = ubs.to('cuda:0') 495 | actions = (actions - lbs) / (ubs - lbs) 496 | actions = actions.clip(0, 1) 497 | 498 | #commenting the below lines 08.26.2023 499 | log_probs = [] 500 | for action, action_dists in zip(actions, dists_new): 501 | # log_probs.append( 502 | # torch.sum(torch.tensor([ 503 | # dim_dist.log_prob(dim_action) \ 504 | # for dim_dist, dim_action in zip(action_dists, action) 505 | # ], requires_grad=True)) 506 | # ) 507 | log_probs.append( 508 | torch.sum(torch.tensor([ 509 | dim_dist.log_prob(dim_action) \ 510 | for dim_dist, dim_action in zip(action_dists, action) 511 | ])) 512 | ) 513 | log_probs = torch.tensor(log_probs, requires_grad=True) 514 | # #log_probs = torch.tensor(log_probs) 515 | 516 | #return log_probs #commenting 08/25/23 517 | return dists.log_prob(actions).flatten() 518 | 519 | def entropy(self, obs): 520 | """ 521 | Returns sum of entropies along each independent action dimension. 522 | """ 523 | alphas_arr, betas_arr = self.forward(obs) 524 | dists = [] 525 | for i in range(alphas_arr.shape[0]): 526 | alphas = alphas_arr[i] 527 | betas = betas_arr[i] 528 | dists.append([ 529 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 530 | ]) 531 | entropies = torch.tensor( 532 | [torch.sum(torch.tensor([dist.entropy() for dist in dist_list])) \ 533 | for dist_list in dists] 534 | ) 535 | return entropies 536 | 537 | # # TODO: make this consistent with sample & log_probs defined above 538 | # alphas, betas = self.forward(obs) 539 | # dists = td.Beta(alphas, betas) 540 | # return dists.entropy() 541 | 542 | 543 | ### NOTE: this is the BetaPolicyBase giving us the Dirichlet error 544 | # class BetaPolicyBase(PolicyNetwork): 545 | # """ 546 | # Base class for Beta policy. 547 | # 548 | # Desired network needs to be implemented. 549 | # """ 550 | # 551 | # def __init__(self, constraint_fn, action_dim): 552 | # 553 | # super().__init__() 554 | # 555 | # self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 556 | # self.action_dim = action_dim 557 | # #assert self.action_dim == 1, 'Action dimension must be 1' #-Line commented out--vipul 558 | # 559 | # def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 560 | # """ 561 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 562 | # scalars into a vectorized function accepting batches of 563 | # torch tensorized arrays and output pairs of torch tensors. 564 | # """ 565 | # 566 | # def vectorized_f(obs, action_dim): #--vipul :added action_dim 567 | # 568 | # obs = obs.cpu().detach().numpy() 569 | # 570 | # if len(obs.shape) == 1: # check to see if obs is a batch or single obs 571 | # batch_size = 1 572 | # lbs, ubs = f(obs) 573 | # 574 | # else: 575 | # batch_size = obs.shape[0] 576 | # lbs = np.zeros([batch_size, self.action_dim]) 577 | # ubs = np.zeros([batch_size, self.action_dim]) 578 | # for i in range(batch_size): 579 | # lbs[i], ubs[i] = f(obs[i]) 580 | # 581 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 582 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 583 | # 584 | # return lbs, ubs 585 | # 586 | # return vectorized_f 587 | # 588 | # def sample(self, obs, action_dim, no_log_prob=False): #sample from interval is taken with a beta distribution -vipul 589 | # #sample definition changed to take in argument action_dim --vipul 590 | # alpha, beta = self.forward(obs) 591 | # dist = td.Beta(alpha, beta) 592 | # action = dist.rsample() 593 | # #log_prob = dist.log_prob(action) #original code 594 | # log_prob_vec = dist.log_prob(action) #vipul 595 | # log_prob = sum(log_prob_vec) 596 | # lb, ub = self.constraint_fn(obs, action_dim) 597 | # action = lb + (ub - lb) * action #need to be changed? --vipul 598 | # #code change nedded! -vipul 599 | # return action if no_log_prob else (action, log_prob) 600 | # 601 | # def log_probs(self, obs, actions, action_dim): #function modified 602 | # alphas, betas = self.forward(obs) 603 | # dists = td.Beta(alphas, betas) 604 | # lbs, ubs = self.constraint_fn(obs, action_dim) 605 | # actions = (actions - lbs) / (ubs - lbs) 606 | # #need to be changed? --vipul 607 | # #yes, change needed! 608 | # return sum(dists.log_prob(actions).flatten()) 609 | # 610 | # def entropy(self, obs): 611 | # alphas, betas = self.forward(obs) 612 | # dists = td.Beta(alphas, betas) 613 | # return dists.entropy() 614 | ### 615 | 616 | 617 | class BetaPolicy(BetaPolicyBase): 618 | """ 619 | Beta policy using a two-layer, two-headed MLP with ReLU activation. 620 | """ 621 | 622 | def __init__(self, obs_dim, constraint_fn, action_dim, #vipul : action_dim=1 replaced by just action_dim 623 | hidden_layer1_size=64, 624 | hidden_layer2_size=64): 625 | 626 | super().__init__(constraint_fn, action_dim=action_dim) 627 | 628 | self.base_net = nn.Sequential( 629 | nn.Linear(obs_dim, hidden_layer1_size), 630 | nn.Tanh(), 631 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 632 | nn.Tanh(), 633 | ) 634 | 635 | self.alpha_net = nn.Sequential( 636 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 637 | nn.Softplus(), 638 | ) 639 | 640 | self.beta_net = nn.Sequential( 641 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 642 | nn.Softplus(), 643 | ) 644 | 645 | def forward(self, obs): 646 | 647 | x = self.base_net(obs) 648 | alpha = 1.0 + self.alpha_net(x) #is there any change needed? --vipul 649 | beta = 1.0 + self.beta_net(x) #is there any change needed? --vipul 650 | 651 | return alpha, beta 652 | 653 | 654 | class CategoricalPolicy(PolicyNetwork): 655 | """ 656 | Base class for categorical policy. 657 | 658 | Desired network needs to be implemented. 659 | """ 660 | 661 | def __init__(self, num_actions): 662 | 663 | super().__init__() 664 | 665 | self.num_actions = num_actions 666 | 667 | def sample(self, obs, no_log_prob=False): 668 | logits = self.forward(obs) 669 | dist = td.Categorical(logits=logits) 670 | action = dist.sample(sample_shape=torch.tensor([1])) 671 | return action if no_log_prob else (action, dist.log_prob(action)) 672 | 673 | def log_probs(self, obs, actions): 674 | dists = td.Categorical(logits=self.forward(obs)) 675 | return dists.log_prob(actions.flatten()) 676 | 677 | def entropy(self, obs): 678 | dists = td.Categorical(logits=self.forward(obs)) 679 | return dists.entropy() 680 | 681 | 682 | class CategoricalPolicyTwoLayer(CategoricalPolicy): 683 | """ 684 | Categorical policy using a fully connected two-layer network. 685 | """ 686 | 687 | def __init__(self, state_dim, num_actions, 688 | hidden_layer1_size=64, 689 | hidden_layer2_size=64, 690 | init_std=0.001): 691 | 692 | super().__init__(num_actions) 693 | 694 | self.init_std = init_std 695 | 696 | self.linear1 = nn.Linear(state_dim, hidden_layer1_size) 697 | self.linear2 = nn.Linear(hidden_layer1_size, hidden_layer2_size) 698 | self.linear3 = nn.Linear(hidden_layer2_size, num_actions) 699 | nn.init.normal_(self.linear1.weight, std=init_std) 700 | nn.init.normal_(self.linear2.weight, std=init_std) 701 | nn.init.normal_(self.linear3.weight, std=init_std) 702 | 703 | def forward(self, state): 704 | x = F.relu(self.linear1(state)) 705 | x = F.relu(self.linear2(x)) 706 | output = self.linear3(x) 707 | return output 708 | 709 | 710 | class PPOBase: 711 | def __init__(self, 712 | env, 713 | policy, 714 | value_function, 715 | policy_lr, 716 | value_lr, 717 | entropy_coef=0.0, 718 | clip_range=0.2, 719 | n_epochs=10, 720 | batch_size=64, 721 | weight_decay=0.0, 722 | gamma=0.99, 723 | buffer_size=2048, 724 | enable_cuda=True, 725 | policy_optimizer=torch.optim.Adam, 726 | value_optimizer=torch.optim.Adam, 727 | grad_clip_radius=None): 728 | 729 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 730 | 'returning information about CBF-related safety.') 731 | 732 | self.env = env 733 | self.pi = policy 734 | self.v = value_function 735 | self.entropy_coef = entropy_coef 736 | self.clip_range = clip_range 737 | self.n_epochs = n_epochs 738 | self.batch_size = batch_size 739 | 740 | self.__cuda_enabled = enable_cuda 741 | self.enable_cuda(self.__cuda_enabled, warn=False) 742 | # NOTE: self.device is defined when self.enable_cuda is called! 743 | 744 | self.pi_optim = policy_optimizer(self.pi.parameters(), 745 | lr=policy_lr, 746 | weight_decay=weight_decay) 747 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 748 | self.grad_clip_radius = grad_clip_radius 749 | 750 | self.rollout_buffer = RolloutBuffer( 751 | buffer_size, 752 | env.observation_space, 753 | env.action_space, 754 | device=self.device, 755 | gamma=gamma 756 | ) 757 | 758 | @property 759 | def cuda_enabled(self): 760 | return self.__cuda_enabled 761 | 762 | def enable_cuda(self, enable_cuda=True, warn=True): 763 | """Enable or disable cuda and update models.""" 764 | 765 | if warn: 766 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 767 | "initializing optimizers can give errors when using " 768 | "optimizers other than SGD or Adam!") 769 | 770 | self.__cuda_enabled = enable_cuda 771 | self.device = torch.device( 772 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 773 | else 'cpu') 774 | self.pi.to(self.device) 775 | self.v.to(self.device) 776 | 777 | def load_models(self, filename, enable_cuda=True, continue_training=True): 778 | """ 779 | Load policy and value functions. Copy them to target functions. 780 | 781 | This method is for evaluation only. Use load_checkpoint to continue 782 | training. 783 | """ 784 | 785 | models = torch.load(filename) 786 | 787 | self.pi.load_state_dict(models['pi_state_dict']) 788 | self.v.load_state_dict(models['v_state_dict']) 789 | 790 | self.pi.eval() 791 | self.v.eval() 792 | 793 | self.enable_cuda(enable_cuda, warn=False) 794 | 795 | def save_checkpoint(self, filename): 796 | """Save state_dicts of models and optimizers.""" 797 | 798 | torch.save({ 799 | 'using_cuda': self.__cuda_enabled, 800 | 'pi_state_dict': self.pi.state_dict(), 801 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 802 | 'v_state_dict': self.v.state_dict(), 803 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 804 | }, filename) 805 | 806 | def load_checkpoint(self, filename, continue_training=True): 807 | """Load state_dicts for models and optimizers.""" 808 | 809 | checkpoint = torch.load(filename) 810 | 811 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 812 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 813 | self.v.load_state_dict(models['v_state_dict']) 814 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 815 | 816 | if continue_training: 817 | self.pi.train() 818 | self.v.train() 819 | else: 820 | self.pi.eval() 821 | self.v.eval() 822 | 823 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 824 | 825 | def collect_rollout(self, env, rollout_length): 826 | """ 827 | Perform a rollout and fill the rollout buffer. 828 | """ 829 | 830 | self._last_obs = env.reset() 831 | self._last_episode_start = np.zeros(1) 832 | n_steps = 0 833 | self.rollout_buffer.reset() 834 | 835 | num_unsafe_steps = 0 836 | x_t=[] 837 | y_t=[] 838 | while n_steps < rollout_length: 839 | 840 | #vipul--- I removed these lines for now! 841 | #as they are pendulum specific 842 | # if env._state[0] < env.theta_safety_bounds[0] - 1e-8 or \ 843 | # env._state[0] > env.theta_safety_bounds[1] + 1e-8: 844 | # num_unsafe_steps += 1 845 | 846 | action_dim=get_action_dim(env.action_space) 847 | #action_dim added --vipul 848 | 849 | with torch.no_grad(): 850 | # Convert to pytorch tensor or to TensorDict 851 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 852 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 853 | value = self.v(obs_tensor) 854 | action = action.cpu().numpy() 855 | 856 | # Rescale and perform action 857 | clipped_action = action 858 | # Clip the actions to avoid out of bound error 859 | if isinstance(self.env.action_space, gym.spaces.Box): 860 | clipped_action = np.clip(action, self.env.action_space.low, 861 | self.env.action_space.high) 862 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 863 | clipped_action = int(clipped_action) 864 | 865 | new_obs, reward, done, info = env.step(clipped_action) 866 | 867 | #adding the animation code here --vipul 868 | x_t.append(new_obs[0]) 869 | y_t.append(new_obs[1]) 870 | # if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 871 | # print("crash") 872 | # break 873 | 874 | n_steps += 1 875 | 876 | if isinstance(self.env.action_space, gym.spaces.Discrete): 877 | # Reshape in case of discrete action 878 | action = action.reshape(-1, 1) 879 | 880 | self.rollout_buffer.add(self._last_obs, action, reward, 881 | self._last_episode_start, value, log_prob) 882 | self._last_obs = new_obs.flatten() 883 | self._last_episode_start = done 884 | 885 | if n_steps == rollout_length: 886 | env.reset() 887 | 888 | #code for plotting the quadrotor trajectory in an episode 889 | # plt.xlim(-200,200) 890 | # plt.ylim(-200,200) 891 | # plt.plot(x_t, y_t, color = 'red') 892 | # plt.pause(1) 893 | plt.xlim(-1,14) 894 | plt.ylim(-1,14) 895 | plt.xlabel('X axis') 896 | plt.ylabel('Y-axis') 897 | plt.plot(x_t,y_t) 898 | # plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 899 | # plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 900 | 901 | 902 | # def f(x, y, xa, yb, a, b): 903 | # return (x - xa)**4/a**4 + (y - yb)**4/b**4 904 | 905 | # # Define the point around which to plot 906 | # xa, yb = env.obstacle[0], env.obstacle[1] 907 | 908 | # # Define the range of x and y values to plot 909 | # x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 910 | # y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 911 | 912 | # # Create a grid of x and y values 913 | # X, Y = np.meshgrid(x_vals, y_vals) 914 | 915 | # # Evaluate the function at each point in the grid 916 | # Z = f(X, Y, xa, yb, env.a_d, env.b_d) 917 | 918 | # # Plot the function as a contour plot 919 | # plt.contour(X, Y, Z, levels=[env.safety_dist]) 920 | #quadrotor.plot_step(new_obs, u_hat_acc, state_hist, plot_handle) 921 | 922 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 923 | done=done) 924 | 925 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 926 | 927 | #return np.sum(self.rollout_buffer.rewards) / np.sum( 928 | #self.rollout_buffer.episode_starts), safety_rate 929 | return np.sum(self.rollout_buffer.rewards), safety_rate 930 | 931 | def train(self): 932 | """ 933 | Train on the current rollout buffer. 934 | """ 935 | #action_dim = get_action_dim(self.action_space) 936 | for epoch in range(self.n_epochs): 937 | 938 | # Do a complete pass on the rollout buffer 939 | for rollout_data in self.rollout_buffer.get(self.batch_size): 940 | 941 | actions = rollout_data.actions 942 | obs = rollout_data.observations 943 | values = self.v(obs).flatten() 944 | log_probs = self.pi.log_probs(obs, actions, actions.shape[1]) 945 | 946 | entropies = self.pi.entropy(obs) 947 | if log_probs.device!=actions.device: 948 | log_probs=log_probs.to('cuda:0') 949 | entropies=entropies.to('cuda:0') 950 | advantages = rollout_data.advantages 951 | 952 | #08.14.2023 Vipul 953 | #I commented out the below line 954 | #08.26.2023 955 | #I uncommented the below line 956 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 957 | 958 | # ratio between old and new policy, should be one at the first iteration 959 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 960 | 961 | # if ratio[0]>1.001: 962 | # print(ratio) 963 | # import pdb; pdb.set_trace() 964 | # clipped surrogate loss 965 | policy_loss_1 = advantages * ratio 966 | policy_loss_2 = advantages * torch.clamp(ratio, 967 | 1 - self.clip_range, 968 | 1 + self.clip_range) 969 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 970 | self.entropy_coef * entropies.mean() 971 | #08/08/2023 -- Changed the entropy coefficient to 0.01--vipul 972 | self.pi_optim.zero_grad() 973 | policy_loss.backward() 974 | # Clip grad norm 975 | if self.grad_clip_radius is not None: 976 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 977 | self.grad_clip_radius) 978 | self.pi_optim.step() 979 | 980 | value_loss = F.mse_loss(rollout_data.returns, values) 981 | 982 | self.v_optim.zero_grad() 983 | value_loss.backward() 984 | # Clip grad norm 985 | if self.grad_clip_radius is not None: 986 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 987 | self.grad_clip_radius) 988 | self.v_optim.step() 989 | 990 | #import pdb; pdb.set_trace() 991 | 992 | #this definition is copied from stable-baselines3' website and DOES NOT belong to this code 993 | # def obs_as_tensor( 994 | # obs: Union[np.ndarray, Dict[Union[str, int], np.ndarray]], device: th.device 995 | # ) -> Union[th.Tensor, TensorDict]: 996 | # """ 997 | # Moves the observation to the given device. 998 | 999 | # :param obs: 1000 | # :param device: PyTorch device 1001 | # :return: PyTorch tensor of the observation on a desired device. 1002 | # """ 1003 | # if isinstance(obs, np.ndarray): 1004 | # return th.as_tensor(obs, device=device) 1005 | # elif isinstance(obs, dict): 1006 | # return {key: th.as_tensor(_obs, device=device) for (key, _obs) in obs.items()} 1007 | # else: 1008 | # raise Exception(f"Unrecognized type of observation {type(obs)}") -------------------------------------------------------------------------------- /agents/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributions as td 4 | from torch.nn import functional as F 5 | import gym 6 | from gym import spaces 7 | import numpy as np 8 | from typing import NamedTuple 9 | import warnings 10 | from matplotlib import pyplot as plt 11 | from matplotlib.animation import FuncAnimation 12 | import datetime 13 | import os 14 | import math 15 | import csv 16 | 17 | from stable_baselines3.common.utils import obs_as_tensor 18 | from stable_baselines3.common.preprocessing import ( 19 | get_obs_shape, get_action_dim 20 | ) 21 | 22 | 23 | #from agents.TruncatedNormal import TruncatedNormal as tn 24 | 25 | from wesutils import two_layer_net 26 | 27 | class RolloutBufferSamples(NamedTuple): 28 | observations: torch.Tensor 29 | actions: torch.Tensor 30 | old_values: torch.Tensor 31 | old_log_prob: torch.Tensor 32 | advantages: torch.Tensor 33 | returns: torch.Tensor 34 | 35 | 36 | class RolloutBuffer: 37 | 38 | def __init__(self, 39 | buffer_size, 40 | observation_space, 41 | action_space, 42 | gamma=0.90, 43 | device='cpu'): 44 | 45 | self.buffer_size = buffer_size 46 | self.observation_space = observation_space 47 | self.action_space = action_space 48 | self.gamma = gamma 49 | self.device = device 50 | self.obs_shape = get_obs_shape(self.observation_space) 51 | self.action_dim = get_action_dim(self.action_space) 52 | 53 | self.reset() 54 | 55 | def reset(self): 56 | 57 | self.observations = np.zeros( 58 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 59 | ) 60 | self.actions = np.zeros( 61 | (self.buffer_size, self.action_dim), dtype=np.float32 62 | ) 63 | self.rewards = np.zeros( 64 | (self.buffer_size,), dtype=np.float32 65 | ) 66 | self.episode_starts = np.zeros( 67 | (self.buffer_size,), dtype=np.float32 68 | ) 69 | self.values = np.zeros( 70 | (self.buffer_size,), dtype=np.float32 71 | ) 72 | self.log_probs = np.zeros( 73 | (self.buffer_size,), dtype=np.float32 74 | ) 75 | self.advantages = np.zeros( 76 | (self.buffer_size,), dtype=np.float32 77 | ) 78 | 79 | self.full = False 80 | self.pos = 0 81 | 82 | def compute_returns_and_advantage(self, last_value, done): 83 | 84 | last_value = last_value.clone().cpu().numpy().flatten() 85 | 86 | discounted_reward = 0 87 | for step in reversed(range(self.buffer_size)): 88 | if step == self.buffer_size - 1: 89 | next_non_terminal = 1.0 - done 90 | next_value = last_value 91 | else: 92 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 93 | next_value = self.values[step + 1] 94 | discounted_reward = self.rewards[step] + \ 95 | self.gamma * discounted_reward * next_non_terminal 96 | self.advantages[step] = discounted_reward - self.values[step] 97 | self.returns = self.advantages + self.values 98 | 99 | def add(self, obs, action, reward, episode_start, value, log_prob): 100 | 101 | if len(log_prob.shape) == 0: 102 | log_prob = log_prob.reshape(-1, 1) 103 | 104 | if isinstance(self.observation_space, spaces.Discrete): 105 | obs = obs.reshape((1,) + self.obs_shape) 106 | 107 | self.observations[self.pos] = np.array(obs).copy() 108 | self.actions[self.pos] = np.array(action).copy() 109 | self.rewards[self.pos] = np.array(reward).copy() 110 | self.episode_starts[self.pos] = np.array(episode_start).copy() 111 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 112 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 113 | self.pos += 1 114 | if self.pos == self.buffer_size: 115 | self.full = True 116 | 117 | def get(self, batch_size=None): 118 | assert self.full, "" 119 | indices = np.random.permutation(self.buffer_size) 120 | 121 | # Return everything, don't create minibatches 122 | if batch_size is None: 123 | batch_size = self.buffer_size 124 | 125 | start_idx = 0 126 | while start_idx < self.buffer_size: 127 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 128 | start_idx += batch_size 129 | 130 | def _get_samples(self, batch_inds): 131 | data = ( 132 | self.observations[batch_inds], 133 | self.actions[batch_inds], 134 | self.values[batch_inds].flatten(), 135 | self.log_probs[batch_inds].flatten(), 136 | self.advantages[batch_inds].flatten(), 137 | self.returns[batch_inds].flatten(), 138 | ) 139 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 140 | 141 | def to_torch(self, array, copy=True): 142 | if copy: 143 | return torch.tensor(array).to(self.device) 144 | return torch.as_tensor(array).to(self.device) 145 | 146 | 147 | class PolicyNetwork(nn.Module): 148 | """Base class for stochastic policy networks.""" 149 | 150 | def __init__(self): 151 | super().__init__() 152 | 153 | def forward(self, state): 154 | """Take state as input, then output the parameters of the policy.""" 155 | 156 | raise NotImplemented("forward not implemented.") 157 | 158 | def sample(self, state): 159 | """ 160 | Sample an action based on the model parameters given the current state. 161 | """ 162 | 163 | raise NotImplemented("sample not implemented.") 164 | 165 | def log_probs(self, obs, actions): 166 | """ 167 | Return log probabilities for each state-action pair. 168 | """ 169 | 170 | raise NotImplemented("log_probs not implemented.") 171 | 172 | def entropy(self, obs): 173 | """ 174 | Return entropy of the policy for each state. 175 | """ 176 | 177 | raise NotImplemented("entropy not implemented.") 178 | 179 | 180 | class GaussianPolicyBase(PolicyNetwork): 181 | """ 182 | Base class for Gaussian policy. 183 | 184 | Desired network needs to be implemented. 185 | """ 186 | 187 | def __init__(self, action_dim): 188 | 189 | super().__init__() 190 | 191 | self.action_dim = action_dim 192 | 193 | def _get_covs(self, log_stds): 194 | batch_size = log_stds.shape[0] 195 | stds = log_stds.exp().reshape(batch_size, 1, 1) 196 | covs = stds * torch.eye(self.action_dim).repeat(batch_size, 1, 1) 197 | return covs 198 | 199 | def sample(self, obs, no_log_prob=False): 200 | mean, log_std = self.forward(obs) 201 | cov = log_std.exp() * torch.eye(self.action_dim) 202 | dist = td.MultivariateNormal(mean, cov) 203 | action = dist.rsample() 204 | return action if no_log_prob else (action, dist.log_prob(action)) 205 | 206 | def log_probs(self, obs, actions): 207 | means, log_stds = self.forward(obs) 208 | covs = self._get_covs(log_stds) 209 | dists = td.MultivariateNormal(means, covs) 210 | return dists.log_prob(actions) 211 | 212 | def entropy(self, obs): 213 | means, log_stds = self.forward(obs) 214 | covs = self._get_covs(log_stds) 215 | dists = td.MultivariateNormal(means, covs) 216 | return dists.entropy() 217 | 218 | 219 | class GaussianPolicy(GaussianPolicyBase): 220 | """ 221 | Gaussian policy using a two-layer, two-headed MLP with ReLU activation. 222 | """ 223 | 224 | def __init__(self, obs_dim, action_dim, 225 | min_action_val=-20.0 * np.array([1, 1]), 226 | max_action_val=20.0 * np.array([1, 1]), 227 | hidden_layer1_size=64, 228 | hidden_layer2_size=64): 229 | 230 | super().__init__(action_dim) 231 | 232 | self.base_net = nn.Sequential( 233 | nn.Linear(obs_dim, hidden_layer1_size), 234 | nn.ReLU(), 235 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 236 | nn.ReLU(), 237 | ) 238 | 239 | self.mean_net = nn.Sequential( 240 | nn.Linear(hidden_layer2_size, action_dim), 241 | nn.Hardtanh(min_action_val[0], max_action_val[0]), 242 | nn.Hardtanh(min_action_val[1], max_action_val[1]) 243 | ) 244 | 245 | self.log_std_net = nn.Sequential( 246 | nn.Linear(hidden_layer2_size, 1), 247 | ) 248 | 249 | def forward(self, obs): 250 | x = self.base_net(obs) 251 | mean = self.mean_net(x) 252 | log_std = self.log_std_net(x) 253 | return mean, log_std 254 | 255 | 256 | # class TruncatedNormalPolicyBase(PolicyNetwork): 257 | # """ 258 | # Base class for TruncatedNormal policy. Action dimension must be 1. 259 | # Uses a function (e.g., a CBF) to convert observations into bounds for 260 | # the TruncatedNormal distribution. 261 | 262 | # Desired network needs to be implemented. 263 | # """ 264 | 265 | # def __init__(self, constraint_fn): 266 | 267 | # super().__init__() 268 | 269 | # self.constraint_fn = self._vectorize_f(constraint_fn) 270 | 271 | # def _get_dist(self, obs): 272 | # mean, log_std = self.forward(obs) 273 | # std = log_std.exp() 274 | # lb, ub = self.constraint_fn(obs) 275 | # dist = tn(mean, std, lb, ub) 276 | 277 | # return dist 278 | 279 | # def sample(self, obs, no_log_prob=False): 280 | # dist = self._get_dist(obs) 281 | # lb = dist.scale * dist.a + dist.loc 282 | # ub = dist.scale * dist.b + dist.loc 283 | # action = dist.rsample() 284 | 285 | # for _ in range(100): 286 | # if (action > lb) and (action < ub): 287 | # break 288 | # if (action < lb) or (action > ub): 289 | # print("OOPS! Resampling...") 290 | # action = lb + (ub - lb) * torch.rand(size=(1, 1), requires_grad=True) 291 | 292 | # try: 293 | # log_prob = dist.log_prob(action) 294 | # except ValueError: 295 | # import pdb; pdb.set_trace() 296 | 297 | # return action if no_log_prob else (action, log_prob) 298 | 299 | # def log_probs(self, obs, actions): 300 | # dists = self._get_dist(obs) 301 | # return dists.log_prob(actions).flatten() 302 | 303 | # def entropy(self, obs): 304 | # dists = self._get_dist(obs) 305 | # return dists.entropy 306 | 307 | # def _vectorize_f(self, f): 308 | # """ 309 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 310 | # scalars into a vectorized function accepting batches of 311 | # torch tensorized arrays and output pairs of torch tensors. 312 | # """ 313 | 314 | # def vectorized_cbf(obs): 315 | 316 | # obs = obs.cpu().detach().numpy() 317 | # lbs, ubs = [], [] 318 | 319 | # if len(obs.shape) == 1: 320 | # batch_size = 1 321 | # lb, ub = f(obs) 322 | # lbs.append(lb) 323 | # ubs.append(ub) 324 | 325 | # else: 326 | # batch_size = obs.shape[0] 327 | # for i in range(batch_size): 328 | # lb, ub = f(obs[i]) 329 | # lbs.append(lb) 330 | # ubs.append(ub) 331 | 332 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) 333 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) 334 | 335 | # return lbs, ubs 336 | 337 | # return vectorized_cbf 338 | 339 | 340 | # class TruncatedNormalPolicy(TruncatedNormalPolicyBase): 341 | # """ 342 | # TruncatedNormal policy using a two-layer, two-headed MLP with ReLU 343 | # activation. Action dimension must be 1. 344 | # """ 345 | 346 | # def __init__(self, obs_dim, constraint_fn, 347 | # hidden_layer1_size=64, 348 | # hidden_layer2_size=64, 349 | # mean_min=-np.inf, 350 | # mean_max=np.inf, 351 | # log_std_lb=-10, 352 | # log_std_ub=3): 353 | 354 | # super().__init__(constraint_fn) 355 | 356 | # self.base_net = nn.Sequential( 357 | # nn.Linear(obs_dim, hidden_layer1_size), 358 | # nn.ReLU(), 359 | # nn.Linear(hidden_layer1_size, hidden_layer2_size), 360 | # nn.ReLU(), 361 | # ) 362 | 363 | # self.mean_net = nn.Sequential( 364 | # nn.Linear(hidden_layer2_size, 1), 365 | # # nn.Hardtanh(min_val=mean_min, max_val=mean_max) 366 | # ) 367 | 368 | # self.log_std_net = nn.Sequential( 369 | # nn.Linear(hidden_layer2_size, 1), 370 | # # nn.Hardtanh(min_val=log_std_lb, max_val=log_std_ub) 371 | # ) 372 | 373 | # def init_weights(m): 374 | # if isinstance(m, nn.Linear): 375 | # torch.nn.init.normal_(m.weight, std=1.0) 376 | 377 | # self.base_net.apply(init_weights) 378 | # self.mean_net.apply(init_weights) 379 | # self.log_std_net.apply(init_weights) 380 | 381 | # def forward(self, obs): 382 | # x = self.base_net(obs) 383 | # mean = self.mean_net(x) 384 | # log_std = self.log_std_net(x) 385 | # return mean, log_std 386 | 387 | 388 | class BetaPolicyBase(PolicyNetwork): 389 | """ 390 | Base class for Beta policy. 391 | 392 | Desired network needs to be implemented. 393 | """ 394 | 395 | def __init__(self, constraint_fn, action_dim, enable_cuda=False): 396 | 397 | super().__init__() 398 | 399 | self.device = torch.device( 400 | 'cuda' if torch.cuda.is_available() and enable_cuda \ 401 | else 'cpu') 402 | self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 403 | self.action_dim = action_dim 404 | 405 | def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 406 | """ 407 | Converts a function f defined on 1D numpy arrays and outputting pairs of 408 | scalars into a vectorized function accepting batches of 409 | torch tensorized arrays and output pairs of torch tensors. 410 | """ 411 | 412 | def vectorized_f(obs, action_dim): #--vipul :added action_dim 413 | 414 | obs = obs.cpu().detach().numpy() 415 | 416 | if len(obs.shape) == 1: # check to see if obs is a batch or single obs 417 | batch_size = 1 418 | lbs, ubs = f(obs) 419 | 420 | else: 421 | batch_size = obs.shape[0] 422 | lbs = np.zeros([batch_size, self.action_dim]) 423 | ubs = np.zeros([batch_size, self.action_dim]) 424 | for i in range(batch_size): 425 | lbs[i], ubs[i] = f(obs[i]) 426 | 427 | lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 428 | ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 429 | lbs = lbs.to(self.device) 430 | ubs = ubs.to(self.device) 431 | 432 | return lbs, ubs 433 | 434 | return vectorized_f 435 | 436 | def sample(self, obs, action_dim, no_log_prob=False): 437 | """ 438 | Sample from independent Beta distributions across each action_dim. 439 | """ 440 | 441 | assert len(obs.shape) == 1, 'obs must be a flat array' 442 | 443 | alphas, betas = self.forward(obs) 444 | alphas, betas = torch.flatten(alphas), torch.flatten(betas) 445 | dists = [ 446 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 447 | ] 448 | action_along_dims = [dist.rsample() for dist in dists] 449 | action = torch.tensor(action_along_dims, requires_grad=True) 450 | log_prob = torch.sum(torch.tensor([ 451 | dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 452 | ], requires_grad=True)) 453 | lb, ub = self.constraint_fn(obs, action_dim) 454 | action = lb + (ub - lb) * action 455 | return action if no_log_prob else (action, log_prob) 456 | 457 | def log_probs(self, obs, actions, action_dim): 458 | alphas_arr, betas_arr = self.forward(obs) 459 | dists = [] 460 | #import pdb; pdb.set_trace() 461 | 462 | #08.28.2023 -vipul making last attempts 463 | alphas_arr_1 = alphas_arr[:,0] 464 | alphas_arr_2 = alphas_arr[:,1] 465 | betas_arr_1 = betas_arr[:,0] 466 | betas_arr_2 = betas_arr[:,1] 467 | try: 468 | dists_1 = td.Beta(alphas_arr_1, betas_arr_1) 469 | except: 470 | import pdb; pdb.set_trace() 471 | 472 | try: 473 | dists_2 = td.Beta(alphas_arr_2, betas_arr_2) 474 | except: 475 | import pdb; pdb.set_trace() 476 | #08.23.2023 -vipul done making last attempts 477 | 478 | for i in range(alphas_arr.shape[0]): 479 | alphas = alphas_arr[i] 480 | betas = betas_arr[i] 481 | dists.append([ 482 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 483 | ]) 484 | 485 | lbs, ubs = self.constraint_fn(obs, action_dim) 486 | if lbs.device!=actions.device: 487 | lbs = lbs.to('cuda:0') 488 | ubs = ubs.to('cuda:0') 489 | actions = (actions - lbs) / (ubs - lbs) 490 | actions = actions.clip(0, 1) 491 | 492 | log_probs = [] 493 | for action, action_dists in zip(actions, dists): 494 | log_probs.append( 495 | torch.sum(torch.tensor([ 496 | dim_dist.log_prob(dim_action) \ 497 | for dim_dist, dim_action in zip(action_dists, action) 498 | ], requires_grad=True)) 499 | ) 500 | log_probs = torch.tensor(log_probs, requires_grad=True) 501 | 502 | #08.28.2023 -vipul making last attempts 503 | return_new = dists_1.log_prob(actions[:,0]).flatten() + dists_2.log_prob(actions[:,1]).flatten() 504 | #08.23.2023 -vipul done making last attempts 505 | 506 | #return log_probs #--original return 08.28.2023 507 | 508 | return return_new #--vipul's return 08.28.2023 509 | 510 | def entropy(self, obs): 511 | """ 512 | Returns sum of entropies along each independent action dimension. 513 | """ 514 | alphas_arr, betas_arr = self.forward(obs) 515 | dists = [] 516 | for i in range(alphas_arr.shape[0]): 517 | alphas = alphas_arr[i] 518 | betas = betas_arr[i] 519 | dists.append([ 520 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 521 | ]) 522 | entropies = torch.tensor( 523 | [torch.sum(torch.tensor([dist.entropy() for dist in dist_list])) \ 524 | for dist_list in dists] 525 | ) 526 | return entropies 527 | 528 | # # TODO: make this consistent with sample & log_probs defined above 529 | # alphas, betas = self.forward(obs) 530 | # dists = td.Beta(alphas, betas) 531 | # return dists.entropy() 532 | 533 | 534 | ### NOTE: this is the BetaPolicyBase giving us the Dirichlet error 535 | # class BetaPolicyBase(PolicyNetwork): 536 | # """ 537 | # Base class for Beta policy. 538 | # 539 | # Desired network needs to be implemented. 540 | # """ 541 | # 542 | # def __init__(self, constraint_fn, action_dim): 543 | # 544 | # super().__init__() 545 | # 546 | # self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 547 | # self.action_dim = action_dim 548 | # #assert self.action_dim == 1, 'Action dimension must be 1' #-Line commented out--vipul 549 | # 550 | # def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 551 | # """ 552 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 553 | # scalars into a vectorized function accepting batches of 554 | # torch tensorized arrays and output pairs of torch tensors. 555 | # """ 556 | # 557 | # def vectorized_f(obs, action_dim): #--vipul :added action_dim 558 | # 559 | # obs = obs.cpu().detach().numpy() 560 | # 561 | # if len(obs.shape) == 1: # check to see if obs is a batch or single obs 562 | # batch_size = 1 563 | # lbs, ubs = f(obs) 564 | # 565 | # else: 566 | # batch_size = obs.shape[0] 567 | # lbs = np.zeros([batch_size, self.action_dim]) 568 | # ubs = np.zeros([batch_size, self.action_dim]) 569 | # for i in range(batch_size): 570 | # lbs[i], ubs[i] = f(obs[i]) 571 | # 572 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 573 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 574 | # 575 | # return lbs, ubs 576 | # 577 | # return vectorized_f 578 | # 579 | # def sample(self, obs, action_dim, no_log_prob=False): #sample from interval is taken with a beta distribution -vipul 580 | # #sample definition changed to take in argument action_dim --vipul 581 | # alpha, beta = self.forward(obs) 582 | # dist = td.Beta(alpha, beta) 583 | # action = dist.rsample() 584 | # #log_prob = dist.log_prob(action) #original code 585 | # log_prob_vec = dist.log_prob(action) #vipul 586 | # log_prob = sum(log_prob_vec) 587 | # lb, ub = self.constraint_fn(obs, action_dim) 588 | # action = lb + (ub - lb) * action #need to be changed? --vipul 589 | # #code change nedded! -vipul 590 | # return action if no_log_prob else (action, log_prob) 591 | # 592 | # def log_probs(self, obs, actions, action_dim): #function modified 593 | # alphas, betas = self.forward(obs) 594 | # dists = td.Beta(alphas, betas) 595 | # lbs, ubs = self.constraint_fn(obs, action_dim) 596 | # actions = (actions - lbs) / (ubs - lbs) 597 | # #need to be changed? --vipul 598 | # #yes, change needed! 599 | # return sum(dists.log_prob(actions).flatten()) 600 | # 601 | # def entropy(self, obs): 602 | # alphas, betas = self.forward(obs) 603 | # dists = td.Beta(alphas, betas) 604 | # return dists.entropy() 605 | ### 606 | 607 | 608 | class BetaPolicy(BetaPolicyBase): 609 | """ 610 | Beta policy using a two-layer, two-headed MLP with ReLU activation. 611 | """ 612 | 613 | def __init__(self, obs_dim, constraint_fn, action_dim, #vipul : action_dim=1 replaced by just action_dim 614 | hidden_layer1_size=64, 615 | hidden_layer2_size=64): 616 | 617 | super().__init__(constraint_fn, action_dim=action_dim) 618 | 619 | self.base_net = nn.Sequential( 620 | nn.Linear(obs_dim, hidden_layer1_size), 621 | nn.Tanh(), 622 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 623 | nn.Tanh(), 624 | ) 625 | 626 | self.alpha_net = nn.Sequential( 627 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 628 | nn.Softplus(), 629 | ) 630 | 631 | self.beta_net = nn.Sequential( 632 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 633 | nn.Softplus(), 634 | ) 635 | 636 | def forward(self, obs): 637 | 638 | x = self.base_net(obs) 639 | # if math.isnan(x[0].item()): 640 | # import pdb; pdb.set_trace() 641 | alpha = 1.0 + self.alpha_net(x) #is there any change needed? --vipul 642 | beta = 1.0 + self.beta_net(x) #is there any change needed? --vipul 643 | 644 | return alpha, beta 645 | 646 | 647 | class CategoricalPolicy(PolicyNetwork): 648 | """ 649 | Base class for categorical policy. 650 | 651 | Desired network needs to be implemented. 652 | """ 653 | 654 | def __init__(self, num_actions): 655 | 656 | super().__init__() 657 | 658 | self.num_actions = num_actions 659 | 660 | def sample(self, obs, no_log_prob=False): 661 | logits = self.forward(obs) 662 | dist = td.Categorical(logits=logits) 663 | action = dist.sample(sample_shape=torch.tensor([1])) 664 | return action if no_log_prob else (action, dist.log_prob(action)) 665 | 666 | def log_probs(self, obs, actions): 667 | dists = td.Categorical(logits=self.forward(obs)) 668 | return dists.log_prob(actions.flatten()) 669 | 670 | def entropy(self, obs): 671 | dists = td.Categorical(logits=self.forward(obs)) 672 | return dists.entropy() 673 | 674 | 675 | class CategoricalPolicyTwoLayer(CategoricalPolicy): 676 | """ 677 | Categorical policy using a fully connected two-layer network. 678 | """ 679 | 680 | def __init__(self, state_dim, num_actions, 681 | hidden_layer1_size=64, 682 | hidden_layer2_size=64, 683 | init_std=0.001): 684 | 685 | super().__init__(num_actions) 686 | 687 | self.init_std = init_std 688 | 689 | self.linear1 = nn.Linear(state_dim, hidden_layer1_size) 690 | self.linear2 = nn.Linear(hidden_layer1_size, hidden_layer2_size) 691 | self.linear3 = nn.Linear(hidden_layer2_size, num_actions) 692 | nn.init.normal_(self.linear1.weight, std=init_std) 693 | nn.init.normal_(self.linear2.weight, std=init_std) 694 | nn.init.normal_(self.linear3.weight, std=init_std) 695 | 696 | def forward(self, state): 697 | x = F.relu(self.linear1(state)) 698 | x = F.relu(self.linear2(x)) 699 | output = self.linear3(x) 700 | return output 701 | 702 | 703 | class PPOBase: 704 | def __init__(self, 705 | env, 706 | policy, 707 | value_function, 708 | policy_lr, 709 | value_lr, 710 | entropy_coef=0.0, 711 | clip_range=0.2, 712 | n_epochs=10, 713 | batch_size=64, 714 | weight_decay=0.0, 715 | gamma=0.99, 716 | buffer_size=2048, 717 | enable_cuda=True, 718 | policy_optimizer=torch.optim.Adam, 719 | value_optimizer=torch.optim.Adam, 720 | grad_clip_radius=None): 721 | 722 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 723 | 'returning information about CBF-related safety.') 724 | 725 | self.env = env 726 | self.pi = policy 727 | self.v = value_function 728 | self.entropy_coef = entropy_coef 729 | self.clip_range = clip_range 730 | self.n_epochs = n_epochs 731 | self.batch_size = batch_size 732 | 733 | self.__cuda_enabled = enable_cuda 734 | self.enable_cuda(self.__cuda_enabled, warn=False) 735 | # NOTE: self.device is defined when self.enable_cuda is called! 736 | 737 | self.pi_optim = policy_optimizer(self.pi.parameters(), 738 | lr=policy_lr, 739 | weight_decay=weight_decay) 740 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 741 | self.grad_clip_radius = grad_clip_radius 742 | 743 | self.rollout_buffer = RolloutBuffer( 744 | buffer_size, 745 | env.observation_space, 746 | env.action_space, 747 | device=self.device, 748 | gamma=gamma 749 | ) 750 | 751 | @property 752 | def cuda_enabled(self): 753 | return self.__cuda_enabled 754 | 755 | def enable_cuda(self, enable_cuda=True, warn=True): 756 | """Enable or disable cuda and update models.""" 757 | 758 | if warn: 759 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 760 | "initializing optimizers can give errors when using " 761 | "optimizers other than SGD or Adam!") 762 | 763 | self.__cuda_enabled = enable_cuda 764 | self.device = torch.device( 765 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 766 | else 'cpu') 767 | self.pi.to(self.device) 768 | self.v.to(self.device) 769 | 770 | def load_models(self, filename, enable_cuda=True, continue_training=True): 771 | """ 772 | Load policy and value functions. Copy them to target functions. 773 | 774 | This method is for evaluation only. Use load_checkpoint to continue 775 | training. 776 | """ 777 | 778 | models = torch.load(filename) 779 | 780 | self.pi.load_state_dict(models['pi_state_dict']) 781 | self.v.load_state_dict(models['v_state_dict']) 782 | 783 | self.pi.eval() 784 | self.v.eval() 785 | 786 | self.enable_cuda(enable_cuda, warn=False) 787 | 788 | def save_checkpoint(self, filename): 789 | """Save state_dicts of models and optimizers.""" 790 | 791 | torch.save({ 792 | 'using_cuda': self.__cuda_enabled, 793 | 'pi_state_dict': self.pi.state_dict(), 794 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 795 | 'v_state_dict': self.v.state_dict(), 796 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 797 | }, filename) 798 | 799 | def load_checkpoint(self, filename, continue_training=True): 800 | """Load state_dicts for models and optimizers.""" 801 | 802 | checkpoint = torch.load(filename) 803 | 804 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 805 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 806 | self.v.load_state_dict(models['v_state_dict']) 807 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 808 | 809 | if continue_training: 810 | self.pi.train() 811 | self.v.train() 812 | else: 813 | self.pi.eval() 814 | self.v.eval() 815 | 816 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 817 | 818 | def collect_rollout(self, env, rollout_length): 819 | """ 820 | Perform a rollout and fill the rollout buffer. 821 | """ 822 | 823 | self._last_obs = env.reset() 824 | self._last_episode_start = np.zeros(1) 825 | n_steps = 0 826 | self.rollout_buffer.reset() 827 | 828 | num_unsafe_steps = 0 829 | x_t=[] 830 | y_t=[] 831 | 832 | local_flag_done = False 833 | while n_steps < rollout_length: 834 | 835 | #vipul--- I removed these lines for now! 836 | #as they are pendulum specific 837 | # if env._state[0] < env.theta_safety_bounds[0] - 1e-8 or \ 838 | # env._state[0] > env.theta_safety_bounds[1] + 1e-8: 839 | # num_unsafe_steps += 1 840 | 841 | action_dim=get_action_dim(env.action_space) 842 | #action_dim added --vipul 843 | 844 | with torch.no_grad(): 845 | # Convert to pytorch tensor or to TensorDict 846 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 847 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 848 | value = self.v(obs_tensor) 849 | action = action.cpu().numpy() 850 | 851 | # Rescale and perform action 852 | clipped_action = action 853 | # Clip the actions to avoid out of bound error 854 | if isinstance(self.env.action_space, gym.spaces.Box): 855 | clipped_action = np.clip(action, self.env.action_space.low, 856 | self.env.action_space.high) 857 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 858 | clipped_action = int(clipped_action) 859 | 860 | new_obs, reward, done, info = env.step(clipped_action) 861 | 862 | #adding the animation code here --vipul 863 | x_t.append(new_obs[0]) 864 | y_t.append(new_obs[1]) 865 | if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 866 | print("crash") 867 | #break 868 | 869 | n_steps += 1 870 | 871 | if isinstance(self.env.action_space, gym.spaces.Discrete): 872 | # Reshape in case of discrete action 873 | action = action.reshape(-1, 1) 874 | 875 | self.rollout_buffer.add(self._last_obs, action, reward, 876 | self._last_episode_start, value, log_prob) 877 | self._last_obs = new_obs.flatten() 878 | self._last_episode_start = done 879 | 880 | if n_steps == rollout_length: 881 | env.reset() 882 | # if done == 0 and local_flag_done == False: 883 | # x_t.append(new_obs[0]) 884 | # y_t.append(new_obs[1]) 885 | 886 | # elif done == 1 and n_steps>1: 887 | # local_flag_done = True 888 | 889 | 890 | 891 | # with open('my_array.csv', 'w', newline='') as file: 892 | # writer = csv.writer(file) 893 | # writer.writerow(x_t) 894 | # writer.writerow(y_t) 895 | #code for plotting the quadrotor trajectory in an episode 896 | # plt.xlim(-200,200) 897 | # plt.ylim(-200,200) 898 | # plt.plot(x_t, y_t, color = 'red') 899 | # plt.pause(1) 900 | plt.xlim(np.double(env.min_x),np.double(env.max_x)) 901 | plt.ylim(np.double(env.min_y),np.double(env.max_y)) 902 | plt.xlabel('X axis') 903 | plt.ylabel('Y-axis') 904 | plt.plot(x_t,y_t) 905 | plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 906 | plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 907 | 908 | 909 | def f(x, y, xa, yb, a, b): 910 | return (x - xa)**4/a**4 + (y - yb)**4/b**4 911 | 912 | # Define the point around which to plot 913 | xa, yb = env.obstacle[0], env.obstacle[1] 914 | 915 | # Define the range of x and y values to plot 916 | x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 917 | y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 918 | 919 | # Create a grid of x and y values 920 | X, Y = np.meshgrid(x_vals, y_vals) 921 | 922 | # Evaluate the function at each point in the grid 923 | Z = f(X, Y, xa, yb, env.a_d, env.b_d) 924 | 925 | # Plot the function as a contour plot 926 | 927 | #ADDED - 09.1.23 928 | #plt.savefig(f"{env.date}_run={env.run}_device={env.device_run}_cbf={env.env_cbf}_lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}_roll={rollout_length}.png") 929 | ##Create a folder in the current directory 930 | folder_name_main = f"{{{env.date}}}" 931 | os.makedirs(folder_name_main, exist_ok=True) 932 | ##Change the current working directory to the newly created folder 933 | os.chdir(folder_name_main) 934 | 935 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 936 | os.makedirs(folder_name, exist_ok=True) 937 | ##Change the current working directory to the newly created folder 938 | os.chdir(folder_name) 939 | 940 | #folder_name_0 = f"{{run={env.run}}}" 941 | 942 | 943 | folder_name_1 = f"{{lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}}}" 944 | os.makedirs(folder_name_1, exist_ok=True) 945 | os.chdir(folder_name_1) 946 | #ADDED - 09.1.23 947 | with open(f"episode={env.episodes}.csv", 'w', newline='') as file: 948 | writer = csv.writer(file) 949 | writer.writerow(x_t) 950 | writer.writerow(y_t) 951 | if (env.episodes)%1 == 0: 952 | #plt.text(10, 12, f"{env.episodes}", fontsize=10, color='blue', ha='right') 953 | plt.savefig(f"ep={env.episodes}.png") 954 | plt.contour(X, Y, Z, levels=[env.safety_dist]) 955 | 956 | # Return to the original working directory (optional) 957 | os.chdir('..') 958 | os.chdir('..') 959 | os.chdir('..') 960 | 961 | #quadrotor.plot_step(new_obs, u_hat_acc, state_hist, plot_handle) 962 | 963 | # #creating animation -08.31.23 964 | # fig, ax = plt.subplots() 965 | # line, = ax.plot([], [], lw=2) 966 | # ax.contour(X, Y, Z, levels=[env.safety_dist]) 967 | # ax.plot(env.goal[0],env.goal[1],marker='o',color='red') 968 | # ax.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 969 | # def init(): 970 | # ax.set_xlim(-1, 14) 971 | # ax.set_ylim(-1, 14) 972 | # return line, 973 | # def update(frame): 974 | # line.set_data(x_t[frame], y_t[frame]) 975 | # return line, 976 | # ani = FuncAnimation(fig, update, frames=len(x_t), init_func=init, blit=True) 977 | # ani.save('1st_sep_trajectroy_animation.gif', writer='pillow') 978 | 979 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 980 | done=done) 981 | 982 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 983 | 984 | #return np.sum(self.rollout_buffer.rewards) / np.sum( 985 | #self.rollout_buffer.episode_starts), safety_rate 986 | return np.sum(self.rollout_buffer.rewards), safety_rate 987 | 988 | def train(self): 989 | """ 990 | Train on the current rollout buffer. 991 | """ 992 | #action_dim = get_action_dim(self.action_space) 993 | for epoch in range(self.n_epochs): 994 | 995 | # Do a complete pass on the rollout buffer 996 | for rollout_data in self.rollout_buffer.get(self.batch_size): 997 | 998 | actions = rollout_data.actions 999 | obs = rollout_data.observations 1000 | values = self.v(obs).flatten() 1001 | try: 1002 | log_probs = self.pi.log_probs(obs, actions, actions.shape[1]) 1003 | except: 1004 | print(self.pi.log_probs(obs, actions, actions.shape[1])) 1005 | import pdb; pdb.set_trace() 1006 | 1007 | entropies = self.pi.entropy(obs) 1008 | if log_probs.device!=actions.device: 1009 | log_probs=log_probs.to('cuda:0') 1010 | entropies=entropies.to('cuda:0') 1011 | advantages = rollout_data.advantages 1012 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 1013 | 1014 | # ratio between old and new policy, should be one at the first iteration 1015 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 1016 | 1017 | # if ratio[0]>1.001: 1018 | # print(ratio) 1019 | # import pdb; pdb.set_trace() 1020 | 1021 | 1022 | policy_loss_1 = advantages * ratio 1023 | policy_loss_2 = advantages * torch.clamp(ratio, 1024 | 1 - self.clip_range, 1025 | 1 + self.clip_range) 1026 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 1027 | self.entropy_coef * entropies.mean() 1028 | 1029 | self.pi_optim.zero_grad() 1030 | policy_loss.backward() 1031 | # Clip grad norm 1032 | if self.grad_clip_radius is not None: 1033 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 1034 | self.grad_clip_radius) 1035 | self.pi_optim.step() 1036 | 1037 | value_loss = F.mse_loss(rollout_data.returns, values) 1038 | 1039 | self.v_optim.zero_grad() 1040 | value_loss.backward() 1041 | # Clip grad norm 1042 | if self.grad_clip_radius is not None: 1043 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 1044 | self.grad_clip_radius) 1045 | self.v_optim.step() 1046 | 1047 | # import pdb; pdb.set_trace() 1048 | 1049 | #this definition is copied from stable-baselines3' website and DOES NOT belong to this code 1050 | # def obs_as_tensor( 1051 | # obs: Union[np.ndarray, Dict[Union[str, int], np.ndarray]], device: th.device 1052 | # ) -> Union[th.Tensor, TensorDict]: 1053 | # """ 1054 | # Moves the observation to the given device. 1055 | 1056 | # :param obs: 1057 | # :param device: PyTorch device 1058 | # :return: PyTorch tensor of the observation on a desired device. 1059 | # """ 1060 | # if isinstance(obs, np.ndarray): 1061 | # return th.as_tensor(obs, device=device) 1062 | # elif isinstance(obs, dict): 1063 | # return {key: th.as_tensor(_obs, device=device) for (key, _obs) in obs.items()} 1064 | # else: 1065 | # raise Exception(f"Unrecognized type of observation {type(obs)}") 1066 | --------------------------------------------------------------------------------