├── Pendulum ├── agents │ ├── __init__.py │ ├── TruncatedNormal.py │ ├── agents.py │ └── ppo.py ├── gym_models │ ├── envs │ │ ├── assets │ │ │ └── clockwise.png │ │ ├── __init__.py │ │ ├── double_integrator.py │ │ └── pendulum.py │ └── __init__.py ├── __init__.py ├── setup.py ├── config.yml └── Testing-pendulum.py ├── gym_models ├── envs │ └── __init__.py └── __init__.py ├── experiments ├── sampling_beta │ ├── config.yml │ ├── Testing-beta-sampling.py │ └── ppo.py └── projection_guassian │ ├── config.yml │ ├── Testing-projection.py │ └── ppo_proj.py ├── setup.py ├── README.md └── agents ├── agents.py └── ppo.py /Pendulum/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym_models/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym_models.envs.quad_gym_env import QuadDynamics 2 | from gym_models.envs.quad_gym_env_proj import QuadDynamicsProj 3 | -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/assets/clockwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sharma1256/cbf-constrained_ppo/HEAD/Pendulum/gym_models/envs/assets/clockwise.png -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym_models.envs.double_integrator import DoubleIntegrator 2 | from gym_models.envs.pendulum import InvertedPendulum 3 | -------------------------------------------------------------------------------- /gym_models/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='quad_gym_env', 5 | entry_point='gym_models.envs:QuadDynamics', 6 | ) 7 | 8 | register( 9 | id='proj_quad_gym_env', 10 | entry_point='gym_models.envs:QuadDynamicsProj', 11 | ) 12 | -------------------------------------------------------------------------------- /Pendulum/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='double_integrator-v0', 5 | entry_point='gym_models.envs:DoubleIntegrator', 6 | ) 7 | 8 | register( 9 | id='inverted_pendulum-v0', 10 | entry_point='gym_models.envs:InvertedPendulum', 11 | ) 12 | -------------------------------------------------------------------------------- /Pendulum/gym_models/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='double_integrator-v0', 5 | entry_point='gym_models.envs:DoubleIntegrator', 6 | ) 7 | 8 | register( 9 | id='inverted_pendulum-v0', 10 | entry_point='gym_models.envs:InvertedPendulum', 11 | ) 12 | -------------------------------------------------------------------------------- /Pendulum/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='gym_models', 4 | version='0.0.1', 5 | author="Krishna Chaitanya Kosaraju and Wesley Suttle", 6 | install_requires=['gym', 7 | 'numpy', 8 | 'matplotlib', 9 | 'torch', 10 | 'wesutils', 11 | 'scipy', 12 | 'ray', 13 | 'pandas', 14 | 'seaborn', 15 | 
'stable_baselines3',
16 |                         ]
17 |       )
18 | 
--------------------------------------------------------------------------------
/experiments/sampling_beta/config.yml:
--------------------------------------------------------------------------------
1 | # config file for the sampling-based Beta policy experiment
2 | 
3 | env:
4 |   dt: 0.1
5 |   umin: [-10.0, -10.0]
6 |   umax: [10.0, 10.0]
7 | 
8 | 
9 | experiment:
10 |   n_episodes: 10000
11 |   rollout_length: 180
12 |   buffer_size: 180
13 |   n_epochs: 10
14 |   entropy_coefficient: 0.0
15 |   weight_decay: 0.0
16 |   T_reward: 1
17 |   cbf: True
18 |   enable_cuda: True
19 |   max_steps: 1000
20 | 
21 | 
22 | beta:
23 |   pi_lr: 0.0006
24 |   v_lr: 0.0006
25 |   v_units1: 256
26 |   v_units2: 256
27 |   pi_units1: 256
28 |   pi_units2: 256
29 | 
--------------------------------------------------------------------------------
/Pendulum/config.yml:
--------------------------------------------------------------------------------
1 | # config file for experiment comparing Beta and Gaussian policies
2 | 
3 | env:
4 |   tau: 0.05
5 |   theta_safety_bounds: [-1.0, 1.0]
6 | 
7 | 
8 | experiment:
9 |   n_episodes: 300
10 |   rollout_length: 2048
11 |   n_replications: 5
12 | 
13 | 
14 | beta:
15 |   torque_bounds: [-.inf, .inf]
16 |   pi_lr: 0.01
17 |   v_lr: 0.01
18 |   v_units1: 64
19 |   v_units2: 64
20 |   pi_units1: 64
21 |   pi_units2: 64
22 | 
23 | 
24 | gaussian:
25 |   torque_bounds: [-15.0, 15.0]
26 |   pi_lr: 0.0003
27 |   v_lr: 0.0003
28 |   v_units1: 64
29 |   v_units2: 64
30 |   pi_units1: 64
31 |   pi_units2: 64
32 | 
--------------------------------------------------------------------------------
/experiments/projection_guassian/config.yml:
--------------------------------------------------------------------------------
1 | # config file for the projection-based Gaussian policy experiment
2 | 
3 | env:
4 |   dt: 0.1
5 |   umin: [-15.0, -15.0]
6 |   umax: [15.0, 15.0]
7 | 
8 | 
9 | experiment:
10 |   n_episodes: 10000
11 |   rollout_length: 320
12 |   buffer_size: 320
13 |   n_epochs: 10
14 |   entropy_coefficient: 0.00000001
15 |   weight_decay: 0.0
16 |   T_reward: 1
17 |   cbf: True
18 |   enable_cuda: True
19 |   max_steps: 1000
20 | 
21 | 
22 | gaussian:
23 |   pi_lr: 0.0004
24 |   v_lr: 0.0004
25 |   v_units1: 256
26 |   v_units2: 256
27 |   pi_units1: 256
28 |   pi_units2: 256
29 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(name='gym_models',
4 |       version='0.0.1',
5 |       install_requires=['gym',
6 |                         'numpy',
7 |                         'matplotlib',
8 |                         'torch',
9 |                         'wesutils',
10 |                         'scipy',
11 |                         'ray',
12 |                         'pandas',
13 |                         'seaborn',
14 |                         'stable_baselines3',
15 |                         'cvxpy',
16 |                         'cvxopt',
17 |                         ]
18 |       )
19 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Control Barrier Function-constrained Proximal Policy Optimization
2 | 
3 | This repository provides the framework used to conduct the experiments for our paper "Sampling-Based Safe Reinforcement Learning for Nonlinear Dynamical Systems", appearing in _Proceedings of the 27th International Conference on Artificial Intelligence and Statistics (AISTATS)_, 2024. The paper is available [here](https://arxiv.org/abs/2403.04007).
4 | 
5 | Specifically, this repo contains the following:
6 | 1) Sampling-based safety-constrained PPO
7 | 2) A constrained Beta policy
8 | 3) A projection (safety-filter) benchmark
9 | 
10 | A Beta policy, given in `ppo.py`, is constrained to the safe control set obtained from the `cbf` function defined in `quad_gym_env.py`, which encodes Control Barrier Function (CBF)-based safety constraints, and
11 | this policy is then updated using the proximal policy optimization routine defined in `ppo.py`, which was adapted from [Stable Baselines3](https://stable-baselines3.readthedocs.io/en/master/).
12 | 
13 | In addition, we created a benchmark using projection-based (safety-filter) safe RL policies in `ppo_proj.py`, using the CBFs defined in `quad_gym_env_proj.py` to obtain safety constraints. This essentially yields a projection-based safe RL policy like the one proposed in [Cheng et al., 2019](https://cdn.aaai.org/ojs/4213/4213-13-7267-1-10-20190705.pdf).
14 | 
15 | Some of the dynamical components of our safe quadcopter gym environment are adapted from the repo `https://github.com/hocherie/cbf_quadrotor`.
16 | 
17 | ### Usage
18 | 
19 | 1) To install, first set up your preferred virtual environment, then do `pip install -e .`
20 | 2) For quadcopter experiments:
21 | Go to the `experiments` directory and select the experiment (e.g., `Testing-projection.py` or `Testing-beta-sampling.py`) that you wish to run
22 | 3) For pendulum experiments:
23 | Go to the `Pendulum` directory and run `Testing-pendulum.py`
24 | 4) Plots and reward arrays will be stored in the corresponding experiment folder
25 | 
26 | ### Reference
27 | [Sampling-based safe reinforcement learning for nonlinear dynamical systems](https://proceedings.mlr.press/v238/suttle24a.html)
28 | 
29 | `@inproceedings{suttle2024sampling,
30 | title={Sampling-based safe reinforcement learning for nonlinear dynamical systems},
31 | author={Suttle, Wesley and Sharma, Vipul Kumar and Kosaraju, Krishna Chaitanya and Seetharaman, Sivaranjani and Liu, Ji and Gupta, Vijay and Sadler, Brian M},
32 | booktitle={International Conference on Artificial Intelligence and Statistics},
33 | pages={4420--4428},
34 | year={2024},
35 | organization={PMLR}
36 | }`
37 | 
--------------------------------------------------------------------------------
/Pendulum/Testing-pendulum.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Jan 24 20:16:49 2023
4 | 
5 | @author: VIPUL
6 | """
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 | import argparse
10 | from itertools import product
11 | import pickle
12 | import os
13 | import yaml
14 | import time
15 | import shutil
16 | import torch
17 | #import torch.distributions
18 | import wesutils
19 | #from quad_gym_env import QuadDynamics
20 | #from ppo import BetaPolicy
21 | import agents.ppo
22 | import gym_models
23 | import gym_models.envs.pendulum
24 | #from torch import distributions
25 | 
26 | import pdb
27 | 
28 | 
29 | ### Hyperparameters...
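# NOTE: hyperparameters are hard-coded below; as written, this script does not
# read Pendulum/config.yml, so changes must be made here directly.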
30 | 31 | # ...for the agent 32 | n_episodes = 1000 33 | #rollout_length = 2048 34 | rollout_length = 2048 35 | buffer_size = rollout_length 36 | policy_lr = 0.0001 37 | value_lr = 0.0001 38 | layer_size = 512 39 | enable_cuda = False 40 | n_epochs = 10 41 | batch_size = 64 #modified from 256 42 | entropy_coef = 0.000001 43 | weight_decay = 0.0 44 | T=5 #storing reward per 100 episodes 45 | tau=0.05 46 | theta_safety_bounds=[-1.0, 1.0] 47 | beta_torque_bounds=[-15.0, 15.0] 48 | pi_units1=128 49 | pi_units2=128 50 | v_units1=64 51 | v_units2=64 52 | def train(): 53 | env = gym_models.envs.pendulum.InvertedPendulum( 54 | tau=tau, 55 | theta_safety_bounds=theta_safety_bounds, 56 | torque_bounds=beta_torque_bounds 57 | ) 58 | 59 | pi = agents.ppo.BetaPolicy( 60 | 3, env.cbf, 1, 61 | hidden_layer1_size=pi_units1, 62 | hidden_layer2_size=pi_units2 63 | ) 64 | v = wesutils.two_layer_net( 65 | 3, 1, v_units1, v_units2 66 | ) 67 | agent = agents.ppo.PPOBase( 68 | env, pi, v, 69 | policy_lr, value_lr, 70 | buffer_size=buffer_size, 71 | enable_cuda=enable_cuda, 72 | n_epochs=n_epochs, 73 | batch_size=batch_size, 74 | entropy_coef=entropy_coef, 75 | weight_decay=weight_decay, 76 | ) 77 | 78 | # train and collect data 79 | rewards, safety_rates = [], [] # TODO: get rid of safety_rates 80 | for i in range(n_episodes): 81 | reward, safety_rate = agent.collect_rollout(env, rollout_length) 82 | agent.train() 83 | rewards.append(reward) 84 | safety_rates.append(safety_rate) 85 | if i%T==0: 86 | np.save("rewards_sequence.npy", rewards) 87 | safety_rates.append(safety_rate) 88 | 89 | print(f'Episode {i} return: {reward:.2f}') # does this work? 90 | 91 | return {'rewards': rewards, 92 | 'safety_rates': safety_rates} 93 | 94 | 95 | if __name__ == '__main__': 96 | 97 | start_time = time.time() 98 | train() 99 | total_runtime = time.time() - start_time 100 | 101 | print(f'Total runtime: {total_runtime / 60:.1f}m') 102 | -------------------------------------------------------------------------------- /experiments/sampling_beta/Testing-beta-sampling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import time 6 | import wesutils 7 | from quad_gym_env import QuadDynamics 8 | from ppo import BetaPolicy 9 | import ppo 10 | 11 | from datetime import datetime 12 | 13 | # Get today's date as a datetime object 14 | today_date = datetime.today() 15 | 16 | # Convert the datetime object to a string 17 | today_date_str = today_date.strftime("%Y-%m-%d") 18 | 19 | ### Hyperparameters... 
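# NOTE: the values below mirror experiments/sampling_beta/config.yml, but this
# script does not parse that YAML file; edit the constants here to change a run.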
20 | n_episodes = 10000 #1000 21 | rollout_length = 180 22 | buffer_size = rollout_length 23 | policy_lr = 0.0006 24 | value_lr = 0.0006 25 | layer_size = 256 26 | enable_cuda = True 27 | n_epochs = 10 28 | batch_size = 256 29 | entropy_coef = 0.0 30 | weight_decay = 0.0 31 | T=1 32 | dt = 0.1 33 | max_steps = 1000 34 | umin = -10.0* np.array([1, 1]) 35 | umax = 10.0 * np.array([1, 1]) 36 | episode = 0 37 | cbf = True 38 | device_run = 'sampling' 39 | run = 1 40 | 41 | 42 | def train(): 43 | env = QuadDynamics( 44 | dt=dt, 45 | max_steps=max_steps, 46 | umax=umax, 47 | umin=umin, 48 | env_cbf = cbf, 49 | layer_size = layer_size, 50 | entropy = entropy_coef, 51 | lr = policy_lr, 52 | device_run = device_run, 53 | date = today_date_str, 54 | run = run 55 | ) 56 | 57 | pi = BetaPolicy( 58 | 10, env.cbf, 2, 59 | hidden_layer1_size=layer_size, 60 | hidden_layer2_size=layer_size, 61 | ) 62 | v = wesutils.two_layer_net( 63 | 10, 1, layer_size, layer_size 64 | ) 65 | agent = ppo.PPOBase( 66 | env, pi, v, 67 | policy_lr, value_lr, 68 | buffer_size=buffer_size, 69 | enable_cuda=enable_cuda, 70 | n_epochs=n_epochs, 71 | batch_size=batch_size, 72 | entropy_coef=entropy_coef, 73 | weight_decay=weight_decay, 74 | ) 75 | 76 | # train and collect data 77 | rewards, safety_rates = [], [] # TODO: get rid of safety_rates 78 | for i in range(n_episodes): 79 | env.episodes = i 80 | reward, safety_rate = agent.collect_rollout(env, rollout_length) 81 | agent.train() 82 | rewards.append(reward) 83 | safety_rates.append(safety_rate) 84 | if i%T==0: 85 | folder_name_main = f"{{{env.date}}}" 86 | os.makedirs(folder_name_main, exist_ok=True) 87 | ##Change the current working directory to the newly created folder 88 | os.chdir(folder_name_main) 89 | 90 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 91 | os.makedirs(folder_name, exist_ok=True) 92 | ##Change the current working directory to the newly created folder 93 | os.chdir(folder_name) 94 | np.save(f"lr={policy_lr}_ent={entropy_coef}_lyr=batch={layer_size}_roll={rollout_length}.npy", rewards) 95 | os.chdir('..') 96 | os.chdir('..') 97 | safety_rates.append(safety_rate) 98 | 99 | print(f'Episode {i} return: {reward:.2f}') 100 | 101 | return {'rewards': rewards, 102 | 'safety_rates': safety_rates} 103 | 104 | 105 | if __name__ == '__main__': 106 | 107 | start_time = time.time() 108 | train() 109 | total_runtime = time.time() - start_time 110 | 111 | print(f'Total runtime: {total_runtime / 60:.1f}m') 112 | -------------------------------------------------------------------------------- /experiments/projection_guassian/Testing-projection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import os 4 | import time 5 | import wesutils 6 | from quad_gym_env_proj import QuadDynamicsProj 7 | from ppo_proj import GaussianPolicy 8 | import ppo_proj 9 | from datetime import datetime 10 | 11 | # Get today's date as a datetime object 12 | today_date = datetime.today() 13 | 14 | # Convert the datetime object to a string 15 | today_date_str = today_date.strftime("%Y-%m-%d") 16 | 17 | # Hyperparameters... 
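# NOTE: these constants largely mirror experiments/projection_guassian/config.yml
# (enable_cuda differs: the config sets True, this script uses False); the YAML
# file itself is not read by this script.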
18 | n_episodes = 10000 #1000 19 | rollout_length = 320 20 | buffer_size = rollout_length 21 | policy_lr = 0.0004 22 | value_lr = 0.0004 23 | layer_size = 256 24 | enable_cuda = False 25 | n_epochs = 10 26 | batch_size = 256 #modified from 256 27 | entropy_coef = 0.00000001 28 | weight_decay = 0.0 29 | T=1 30 | # ...for the environment 31 | dt = 0.1 32 | max_steps = 1000 33 | umin = -15.0 * np.array([1, 1]) 34 | umax = 15.0 * np.array([1, 1]) 35 | episode = 0 36 | cbf = True 37 | device_run = 'projection' 38 | run = 1 39 | 40 | def train(): 41 | env = QuadDynamicsProj( 42 | dt=dt, 43 | max_steps=max_steps, 44 | umax=umax, 45 | umin=umin, 46 | env_cbf = cbf, 47 | layer_size = layer_size, 48 | entropy = entropy_coef, 49 | lr = policy_lr, 50 | device_run = device_run, 51 | date = today_date_str, 52 | run = run 53 | ) 54 | pi = GaussianPolicy( 55 | 10, 2, umin, umax, 56 | hidden_layer1_size=layer_size, 57 | hidden_layer2_size=layer_size, 58 | ) 59 | v = wesutils.two_layer_net( 60 | 10, 1, layer_size, layer_size 61 | ) 62 | agent = ppo_proj.PPOBase( 63 | env, pi, v, 64 | policy_lr, value_lr, 65 | buffer_size=buffer_size, 66 | enable_cuda=enable_cuda, 67 | n_epochs=n_epochs, 68 | batch_size=batch_size, 69 | entropy_coef=entropy_coef, 70 | weight_decay=weight_decay, 71 | ) 72 | # train and collect data 73 | rewards, safety_rates = [], [] # TODO: get rid of safety_rates 74 | for i in range(n_episodes): 75 | env.episodes = i 76 | reward, safety_rate = agent.collect_rollout(env, rollout_length) 77 | agent.train() 78 | rewards.append(reward) 79 | safety_rates.append(safety_rate) 80 | if i%T==0: 81 | folder_name_main = f"{{{env.date}}}" 82 | os.makedirs(folder_name_main, exist_ok=True) 83 | ##Change the current working directory to the newly created folder 84 | os.chdir(folder_name_main) 85 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 86 | os.makedirs(folder_name, exist_ok=True) 87 | ##Change the current working directory to the newly created folder 88 | os.chdir(folder_name) 89 | np.save(f"lr={policy_lr}_ent={entropy_coef}_lyr=batch={layer_size}_roll={rollout_length}.npy", rewards) 90 | os.chdir('..') 91 | os.chdir('..') 92 | safety_rates.append(safety_rate) 93 | print(f'Episode {i} return: {reward:.2f}') # does this work? 
94 | return {'rewards': rewards, 95 | 'safety_rates': safety_rates} 96 | 97 | if __name__ == '__main__': 98 | 99 | start_time = time.time() 100 | train() 101 | total_runtime = time.time() - start_time 102 | 103 | print(f'Total runtime: {total_runtime / 60:.1f}m') 104 | -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/double_integrator.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | 7 | 8 | 9 | class DoubleIntegrator(gym.Env): 10 | """ 11 | Description: 12 | """ 13 | 14 | 15 | metadata = {'render.modes': ['console']} 16 | 17 | def __init__(self, 18 | tau: float = 1e-2, 19 | initial_state: Union[float, float] = [9.1, 1], 20 | safety_bounds: Union[float, float] = [9., 11.], 21 | x1_des: float = [10.], 22 | max_steps: int = 1000 23 | ): 24 | super(DoubleIntegrator, self).__init__() 25 | 26 | self._tau = tau 27 | self.initial_state = initial_state 28 | self.safety_bounds = safety_bounds 29 | self.x1_des = x1_des 30 | self.action_space = spaces.Box(-np.inf, np.inf, shape=(1,), dtype=np.float64) 31 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=(2,1), dtype=np.float64) 32 | self._state = np.array(self.initial_state) 33 | self.max_steps = max_steps 34 | self.count = 0 35 | self.seed() 36 | 37 | @property 38 | def tau(self): 39 | return self._tau 40 | 41 | @tau.setter 42 | def tau(self, value: float): 43 | if value>1e-1: 44 | print("discretizing time is too high, consider reducing for better results") 45 | self._tau = value 46 | 47 | @property 48 | def state(self): 49 | return self._state.reshape(2,1) 50 | 51 | def seed(self, seed=None): 52 | # not used 53 | self.np_random, seed = seeding.np_random(seed) 54 | return [seed] 55 | 56 | def reward(self): 57 | x1 = self._state[0] 58 | return (-0.5) * ((x1 - self.x1_des) ** 2) 59 | 60 | def step(self, action: float): 61 | self.count += 1 62 | x1, x2 = self._state[0], self._state[1] 63 | x1_new = x1 + (self._tau * x2) + (0.5 * (self._tau ** 2)) * action 64 | x2_new = x2 + (self._tau) * action 65 | self._state = np.array([x1_new, x2_new], dtype=float) 66 | 67 | x1_min, x1_max = self.safety_bounds[0], self.safety_bounds[1] 68 | done = False 69 | if (x1_new < x1_min) or (x1_new > x1_max) or (self.count>self.max_steps): 70 | done = True 71 | return self.state, self.reward(), done, {} 72 | 73 | def reset(self): 74 | self._state = np.array(self.initial_state) 75 | self.count = 0 76 | return self.state 77 | 78 | def render(self, mode='console'): 79 | if mode != 'console': 80 | raise NotImplementedError() 81 | print("not implemented") 82 | 83 | def close(self): 84 | pass 85 | 86 | def cbf(self, state=None, eta: float = 0.5): 87 | """ 88 | Calculates CBF constraint set at a given state. Default is 89 | the current state. 
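
        Concretely, for the discretized dynamics used in step(),
            x1' = x1 + tau * x2 + 0.5 * tau**2 * u,
        requiring h(x') >= (1 - eta) * h(x) for the barrier candidates
        h = x1 - x1_min and h = x1_max - x1 gives the bounds returned below:
            u_min = (2 / tau**2) * (-tau * x2 - eta * (x1 - x1_min)),
            u_max = (2 / tau**2) * (-tau * x2 + eta * (x1_max - x1)).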
90 | """ 91 | 92 | state = state if state is not None else self._state 93 | 94 | if (eta>1-1e-3) or (eta<1e-5): 95 | raise ValueError("eta should be inside (0, 1)") 96 | x1, x2 = state[0], state[1] 97 | x1_min, x1_max = self.safety_bounds[0], self.safety_bounds[1] 98 | u_min = (2 / (self._tau ** 2) ) * ( - (self._tau * x2) - eta * (x1 - x1_min)) 99 | u_max = (2 / (self._tau ** 2) ) * ( - (self._tau * x2) + eta * (x1_max - x1)) 100 | if u_min>u_max: 101 | raise ValueError("Infeasible") 102 | else: 103 | return [u_min, u_max] 104 | -------------------------------------------------------------------------------- /Pendulum/agents/TruncatedNormal.py: -------------------------------------------------------------------------------- 1 | import math 2 | from numbers import Number 3 | 4 | import torch 5 | from torch.distributions import Distribution, constraints 6 | from torch.distributions.utils import broadcast_all 7 | 8 | CONST_SQRT_2 = math.sqrt(2) 9 | CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi) 10 | CONST_INV_SQRT_2 = 1 / math.sqrt(2) 11 | CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI) 12 | CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e) 13 | 14 | 15 | class TruncatedStandardNormal(Distribution): 16 | """ 17 | Truncated Standard Normal distribution 18 | https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 19 | """ 20 | 21 | arg_constraints = { 22 | 'a': constraints.real, 23 | 'b': constraints.real, 24 | } 25 | has_rsample = True 26 | 27 | def __init__(self, a, b, validate_args=None): 28 | self.a, self.b = broadcast_all(a, b) 29 | if isinstance(a, Number) and isinstance(b, Number): 30 | batch_shape = torch.Size() 31 | else: 32 | batch_shape = self.a.size() 33 | super(TruncatedStandardNormal, self).__init__(batch_shape, validate_args=validate_args) 34 | if self.a.dtype != self.b.dtype: 35 | raise ValueError('Truncation bounds types are different') 36 | if any((self.a >= self.b).view(-1,).tolist()): 37 | raise ValueError('Incorrect truncation range') 38 | eps = torch.finfo(self.a.dtype).eps 39 | self._dtype_min_gt_0 = eps 40 | self._dtype_max_lt_1 = 1 - eps 41 | self._little_phi_a = self._little_phi(self.a) 42 | self._little_phi_b = self._little_phi(self.b) 43 | self._big_phi_a = self._big_phi(self.a) 44 | self._big_phi_b = self._big_phi(self.b) 45 | self._Z = (self._big_phi_b - self._big_phi_a).clamp_min(eps) 46 | self._log_Z = self._Z.log() 47 | little_phi_coeff_a = torch.nan_to_num(self.a, nan=math.nan) 48 | little_phi_coeff_b = torch.nan_to_num(self.b, nan=math.nan) 49 | self._lpbb_m_lpaa_d_Z = (self._little_phi_b * little_phi_coeff_b - self._little_phi_a * little_phi_coeff_a) / self._Z 50 | self._mean = -(self._little_phi_b - self._little_phi_a) / self._Z 51 | self._variance = 1 - self._lpbb_m_lpaa_d_Z - ((self._little_phi_b - self._little_phi_a) / self._Z) ** 2 52 | self._entropy = CONST_LOG_SQRT_2PI_E + self._log_Z - 0.5 * self._lpbb_m_lpaa_d_Z 53 | 54 | @constraints.dependent_property 55 | def support(self): 56 | return constraints.interval(self.a, self.b) 57 | 58 | @property 59 | def mean(self): 60 | return self._mean 61 | 62 | @property 63 | def variance(self): 64 | return self._variance 65 | 66 | @property 67 | def entropy(self): 68 | return self._entropy 69 | 70 | @property 71 | def auc(self): 72 | return self._Z 73 | 74 | @staticmethod 75 | def _little_phi(x): 76 | return (-(x ** 2) * 0.5).exp() * CONST_INV_SQRT_2PI 77 | 78 | @staticmethod 79 | def _big_phi(x): 80 | return 0.5 * (1 + (x * CONST_INV_SQRT_2).erf()) 81 | 82 | @staticmethod 83 | 
def _inv_big_phi(x): 84 | return CONST_SQRT_2 * (2 * x - 1).erfinv() 85 | 86 | def cdf(self, value): 87 | if self._validate_args: 88 | self._validate_sample(value) 89 | return ((self._big_phi(value) - self._big_phi_a) / self._Z).clamp(0, 1) 90 | 91 | def icdf(self, value): 92 | return self._inv_big_phi(self._big_phi_a + value * self._Z) 93 | 94 | def log_prob(self, value): 95 | if self._validate_args: 96 | self._validate_sample(value) 97 | return CONST_LOG_INV_SQRT_2PI - self._log_Z - (value ** 2) * 0.5 98 | 99 | def rsample(self, sample_shape=torch.Size()): 100 | shape = self._extended_shape(sample_shape) 101 | p = torch.empty(shape, device=self.a.device).uniform_(self._dtype_min_gt_0, self._dtype_max_lt_1) 102 | return self.icdf(p) 103 | 104 | 105 | class TruncatedNormal(TruncatedStandardNormal): 106 | """ 107 | Truncated Normal distribution 108 | https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 109 | """ 110 | 111 | has_rsample = True 112 | 113 | def __init__(self, loc, scale, a, b, validate_args=None): 114 | self.loc, self.scale, a, b = broadcast_all(loc, scale, a, b) 115 | a = (a - self.loc) / self.scale 116 | b = (b - self.loc) / self.scale 117 | super(TruncatedNormal, self).__init__(a, b, validate_args=validate_args) 118 | self._log_scale = self.scale.log() 119 | self._mean = self._mean * self.scale + self.loc 120 | self._variance = self._variance * self.scale ** 2 121 | self._entropy += self._log_scale 122 | 123 | def _to_std_rv(self, value): 124 | return (value - self.loc) / self.scale 125 | 126 | def _from_std_rv(self, value): 127 | return value * self.scale + self.loc 128 | 129 | def cdf(self, value): 130 | return super(TruncatedNormal, self).cdf(self._to_std_rv(value)) 131 | 132 | def icdf(self, value): 133 | return self._from_std_rv(super(TruncatedNormal, self).icdf(value)) 134 | 135 | def log_prob(self, value): 136 | return super(TruncatedNormal, self).log_prob(self._to_std_rv(value)) - self._log_scale 137 | -------------------------------------------------------------------------------- /Pendulum/gym_models/envs/pendulum.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | from os import path 7 | from gym.envs.classic_control import rendering 8 | 9 | 10 | 11 | class InvertedPendulum(gym.Env): 12 | """ 13 | Description: 14 | """ 15 | 16 | 17 | # metadata = {'render.modes': ['console']} 18 | metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 30} 19 | 20 | def __init__(self, 21 | tau: float = 5e-2, 22 | m: float = 1, 23 | g: float = 9.8, 24 | l: float = 1, 25 | initial_state: Union[float, float] = [0.0, 0.], 26 | theta_safety_bounds: Union[float, float] = [-1.0, 1.0], 27 | thetadot_safety_bounds: Union[float, float] = [-np.inf, np.inf], 28 | theta_des: float = [0.], 29 | torque_bounds: Union[float, float] = [-15., 15.], 30 | max_steps: int = 200 31 | ): 32 | super(InvertedPendulum, self).__init__() 33 | 34 | self._tau = tau 35 | self.g = g 36 | self.l = l 37 | self.m = m 38 | self.torque_bounds = torque_bounds 39 | self.initial_state = initial_state 40 | self.theta_safety_bounds = theta_safety_bounds 41 | self.thetadot_safety_bounds = thetadot_safety_bounds 42 | self.x1_des = 0. 
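        # NOTE: x1_des is carried over from the double-integrator environment and
        # is not used by the pendulum reward defined below.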
43 | self.action_space = spaces.Box(self.torque_bounds[0], self.torque_bounds[1], shape=(1,), dtype=np.float64) 44 | high = np.array([1.0, 1.0, 8.0]) 45 | self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32) 46 | # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float64) 47 | # self._state = np.array(self.initial_state) 48 | self.seed() 49 | self.max_steps = max_steps 50 | self.count = 0 51 | self.action_dim=1 52 | 53 | # rendering stuff 54 | self.viewer = None 55 | 56 | 57 | @property 58 | def tau(self): 59 | return self._tau 60 | @tau.setter 61 | def tau(self, value: float): 62 | if value>1e-1: 63 | print("discretizing time is too high, consider reducing for better results") 64 | self._tau = value 65 | 66 | @property 67 | def state(self): 68 | # return self._state 69 | return self._get_obs() 70 | 71 | def seed(self, seed=None): 72 | # not used 73 | self.np_random, seed = seeding.np_random(seed) 74 | return [seed] 75 | 76 | ### New reward 77 | def _angle_normalize(self, x): 78 | return ((x + np.pi) % (2 * np.pi)) - np.pi 79 | 80 | def reward(self, action): 81 | cost = self._angle_normalize(self._state[0])**2 + 0.1 * self._state[1]**2 \ 82 | + 0.001 * (action ** 2) 83 | return -float(cost) 84 | ### end new reward 85 | 86 | def _get_obs(self): 87 | theta, thetadot = self._state 88 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 89 | 90 | def step(self, action: float): 91 | self.count += 1 92 | action = np.clip(action, self.torque_bounds[0], self.torque_bounds[1]) 93 | self.last_u = action # for use in rendering 94 | c1 = ((3 * self.g)/(2 * self.l)) 95 | c2 = (3 /(self.m * (self.l ** 2))) 96 | theta, thetadot = self._state[0], self._state[1] 97 | theta_new = theta + (self._tau * thetadot) + (self._tau ** 2) * ( c1 * np.sin(theta) + c2 * action) 98 | thetadot_new = thetadot + (self._tau) * ( c1 * np.sin(theta) + c2 * action) 99 | self._state = np.array([theta_new, thetadot_new], dtype=float) 100 | 101 | theta_min, theta_max = self.theta_safety_bounds[0], self.theta_safety_bounds[1] 102 | thetadot_min, thetadot_max = self.thetadot_safety_bounds 103 | done = False 104 | 105 | # if (theta_new < theta_min) or \ 106 | # (theta_new > theta_max) or \ 107 | # (thetadot_new < thetadot_min) or \ 108 | # (thetadot_new > thetadot_max) or \ 109 | # (self.count > self.max_steps): 110 | # done = True 111 | 112 | if self.count > self.max_steps: 113 | done = True 114 | 115 | return self._get_obs().flatten(), self.reward(action), done, {} 116 | 117 | ### OLD 118 | # return self._state, self.reward(action), done, {} 119 | 120 | ### OLD 121 | # def reset(self): 122 | # self._state = np.array(self.initial_state) 123 | # self.count = 0 124 | # return self._state) 125 | 126 | def reset(self): 127 | self._state = np.array(self.initial_state) 128 | # self._state = np.random.uniform(-np.pi, np.pi, size=(2,)) 129 | self.count = 0 130 | return self._get_obs() 131 | 132 | def render(self, mode="human"): 133 | if self.viewer is None: 134 | self.viewer = rendering.Viewer(500, 500) 135 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2) 136 | rod = rendering.make_capsule(1, 0.2) 137 | rod.set_color(0.8, 0.3, 0.3) 138 | self.pole_transform = rendering.Transform() 139 | rod.add_attr(self.pole_transform) 140 | self.viewer.add_geom(rod) 141 | axle = rendering.make_circle(0.05) 142 | axle.set_color(0, 0, 0) 143 | self.viewer.add_geom(axle) 144 | fname = path.join(path.dirname(__file__), "assets/clockwise.png") 145 | self.img = rendering.Image(fname, 1.0, 1.0) 146 | 
self.imgtrans = rendering.Transform() 147 | self.img.add_attr(self.imgtrans) 148 | 149 | self.viewer.add_onetime(self.img) 150 | self.pole_transform.set_rotation(self._state[0] + np.pi / 2) 151 | if self.last_u: 152 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2) 153 | 154 | return self.viewer.render(return_rgb_array=mode == "rgb_array") 155 | 156 | def close(self): 157 | if self.viewer: 158 | self.viewer.close() 159 | self.viewer = None 160 | 161 | ### OLD 162 | # def render(self, mode='console'): 163 | # if mode != 'console'metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 30}: 164 | # raise NotImplementedError() 165 | # print("not implemented") 166 | 167 | # def close(self): 168 | # pass 169 | 170 | def cbf(self, state=None, eta: float = 0.99): 171 | """ 172 | Calculates CBF constraint set at a given state. Default is 173 | the current state. 174 | """ 175 | 176 | state = state if state is not None else self._state 177 | 178 | if (eta>1-1e-3) or (eta<1e-5): 179 | raise ValueError("eta should be inside (0, 1)") 180 | c1 = ((3 * self.g)/(2 * self.l)) 181 | c2 = (3 /(self.m * (self.l ** 2))) 182 | #theta, thetadot = np.arcsin(obs[i][1]), obs[i][2] 183 | #theta, thetadot = np.arcsin(state[1]), state[2] 184 | #the above line can replace line 185 185 | theta, thetadot = state[0], state[1] 186 | theta_min, theta_max = self.theta_safety_bounds[0], self.theta_safety_bounds[1] 187 | thetadot_min, thetadot_max = self.thetadot_safety_bounds[0], self.thetadot_safety_bounds[1] 188 | u_min1 = (1/c2) * (((1 / (self._tau **2)) * (-eta * (theta - theta_min) - self._tau * thetadot)) - c1 * np.sin(theta) ) 189 | u_max1 = (1/c2) * (((1 / (self._tau **2)) * ( eta * (theta_max - theta) - self._tau * thetadot)) - c1 * np.sin(theta) ) 190 | 191 | 192 | u_min2 = (1/c2) * (((1 / (self._tau)) * (-eta * (thetadot - thetadot_min))) - c1 * np.sin(theta) ) 193 | u_max2 = (1/c2) * (((1 / (self._tau)) * ( eta * (thetadot_max - thetadot))) - c1 * np.sin(theta) ) 194 | 195 | u_min = max(u_min1, u_min2, self.torque_bounds[0]) 196 | u_max = min(u_max1, u_max2, self.torque_bounds[1]) 197 | 198 | u_min=self.torque_bounds[0] 199 | u_max=self.torque_bounds[1] 200 | if u_min>u_max: 201 | raise ValueError("Infeasible") 202 | else: 203 | return [u_min, u_max] -------------------------------------------------------------------------------- /experiments/projection_guassian/ppo_proj.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributions as td 7 | from torch.nn import functional as F 8 | import gym 9 | from gym import spaces 10 | import numpy as np 11 | from typing import NamedTuple 12 | import warnings 13 | from matplotlib import pyplot as plt 14 | import os 15 | import csv 16 | 17 | from stable_baselines3.common.utils import obs_as_tensor 18 | from stable_baselines3.common.preprocessing import ( 19 | get_obs_shape, get_action_dim 20 | ) 21 | 22 | 23 | class RolloutBufferSamples(NamedTuple): 24 | observations: torch.Tensor 25 | actions: torch.Tensor 26 | old_values: torch.Tensor 27 | old_log_prob: torch.Tensor 28 | advantages: torch.Tensor 29 | returns: torch.Tensor 30 | 31 | 32 | class RolloutBuffer: 33 | 34 | def __init__(self, 35 | buffer_size, 36 | observation_space, 37 | action_space, 38 | gamma=0.90, 39 | device='cpu'): 40 | 41 | self.buffer_size = buffer_size 42 | self.observation_space = observation_space 43 | self.action_space = 
action_space 44 | self.gamma = gamma 45 | self.device = device 46 | self.obs_shape = get_obs_shape(self.observation_space) 47 | self.action_dim = get_action_dim(self.action_space) 48 | 49 | self.reset() 50 | 51 | def reset(self): 52 | 53 | self.observations = np.zeros( 54 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 55 | ) 56 | self.actions = np.zeros( 57 | (self.buffer_size, self.action_dim), dtype=np.float32 58 | ) 59 | self.rewards = np.zeros( 60 | (self.buffer_size,), dtype=np.float32 61 | ) 62 | self.episode_starts = np.zeros( 63 | (self.buffer_size,), dtype=np.float32 64 | ) 65 | self.values = np.zeros( 66 | (self.buffer_size,), dtype=np.float32 67 | ) 68 | self.log_probs = np.zeros( 69 | (self.buffer_size,), dtype=np.float32 70 | ) 71 | self.advantages = np.zeros( 72 | (self.buffer_size,), dtype=np.float32 73 | ) 74 | 75 | self.full = False 76 | self.pos = 0 77 | 78 | def compute_returns_and_advantage(self, last_value, done): 79 | 80 | last_value = last_value.clone().cpu().numpy().flatten() 81 | 82 | discounted_reward = 0 83 | for step in reversed(range(self.buffer_size)): 84 | if step == self.buffer_size - 1: 85 | next_non_terminal = 1.0 - done 86 | next_value = last_value 87 | else: 88 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 89 | next_value = self.values[step + 1] 90 | discounted_reward = self.rewards[step] + \ 91 | self.gamma * discounted_reward * next_non_terminal 92 | self.advantages[step] = discounted_reward - self.values[step] 93 | self.returns = self.advantages + self.values 94 | 95 | def add(self, obs, action, reward, episode_start, value, log_prob): 96 | 97 | if len(log_prob.shape) == 0: 98 | log_prob = log_prob.reshape(-1, 1) 99 | 100 | if isinstance(self.observation_space, spaces.Discrete): 101 | obs = obs.reshape((1,) + self.obs_shape) 102 | 103 | self.observations[self.pos] = np.array(obs).copy() 104 | self.actions[self.pos] = np.array(action).copy() 105 | self.rewards[self.pos] = np.array(reward).copy() 106 | self.episode_starts[self.pos] = np.array(episode_start).copy() 107 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 108 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 109 | self.pos += 1 110 | if self.pos == self.buffer_size: 111 | self.full = True 112 | 113 | def get(self, batch_size=None): 114 | assert self.full, "" 115 | indices = np.random.permutation(self.buffer_size) 116 | 117 | # Return everything, don't create minibatches 118 | if batch_size is None: 119 | batch_size = self.buffer_size 120 | 121 | start_idx = 0 122 | while start_idx < self.buffer_size: 123 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 124 | start_idx += batch_size 125 | 126 | def _get_samples(self, batch_inds): 127 | data = ( 128 | self.observations[batch_inds], 129 | self.actions[batch_inds], 130 | self.values[batch_inds].flatten(), 131 | self.log_probs[batch_inds].flatten(), 132 | self.advantages[batch_inds].flatten(), 133 | self.returns[batch_inds].flatten(), 134 | ) 135 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 136 | 137 | def to_torch(self, array, copy=True): 138 | if copy: 139 | return torch.tensor(array).to(self.device) 140 | return torch.as_tensor(array).to(self.device) 141 | 142 | 143 | class PolicyNetwork(nn.Module): 144 | """Base class for stochastic policy networks.""" 145 | 146 | def __init__(self): 147 | super().__init__() 148 | 149 | def forward(self, state): 150 | """Take state as input, then output the parameters of the policy.""" 151 | 152 | raise 
NotImplemented("forward not implemented.") 153 | 154 | def sample(self, state): 155 | """ 156 | Sample an action based on the model parameters given the current state. 157 | """ 158 | 159 | raise NotImplemented("sample not implemented.") 160 | 161 | def log_probs(self, obs, actions): 162 | """ 163 | Return log probabilities for each state-action pair. 164 | """ 165 | 166 | raise NotImplemented("log_probs not implemented.") 167 | 168 | def entropy(self, obs): 169 | """ 170 | Return entropy of the policy for each state. 171 | """ 172 | 173 | raise NotImplemented("entropy not implemented.") 174 | 175 | 176 | class GaussianPolicyBase(PolicyNetwork): 177 | """ 178 | Base class for Gaussian policy. 179 | 180 | Desired network needs to be implemented. 181 | """ 182 | 183 | def __init__(self, action_dim): 184 | 185 | super().__init__() 186 | 187 | self.action_dim = action_dim 188 | 189 | def _get_covs(self, log_stds): 190 | batch_size = log_stds.shape[0] 191 | stds = log_stds.exp().reshape(batch_size, 1, 1) 192 | covs = stds * torch.eye(self.action_dim).repeat(batch_size, 1, 1) 193 | return covs 194 | 195 | def sample(self, obs, no_log_prob=False): 196 | mean, log_std = self.forward(obs) 197 | cov = log_std.exp() * torch.eye(self.action_dim) 198 | dist = td.MultivariateNormal(mean, cov) 199 | action = dist.rsample() 200 | return action if no_log_prob else (action, dist.log_prob(action)) 201 | 202 | def log_probs(self, obs, actions): 203 | means, log_stds = self.forward(obs) 204 | covs = self._get_covs(log_stds) 205 | dists = td.MultivariateNormal(means, covs) 206 | return dists.log_prob(actions) 207 | 208 | def entropy(self, obs): 209 | means, log_stds = self.forward(obs) 210 | covs = self._get_covs(log_stds) 211 | dists = td.MultivariateNormal(means, covs) 212 | return dists.entropy() 213 | 214 | 215 | class GaussianPolicy(GaussianPolicyBase): 216 | """ 217 | Gaussian policy using a two-layer, two-headed MLP with ReLU activation. 
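
    The mean head is a linear layer whose output is clamped to the action bounds
    via Hardtanh; the log-std head is a single linear unit whose exponential is
    used as an isotropic standard deviation shared across action dimensions.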
218 | """ 219 | 220 | def __init__(self, obs_dim, action_dim, 221 | min_action_val=-20.0 * np.array([1, 1]), 222 | max_action_val=20.0 * np.array([1, 1]), 223 | hidden_layer1_size=64, 224 | hidden_layer2_size=64): 225 | 226 | super().__init__(action_dim) 227 | 228 | self.base_net = nn.Sequential( 229 | nn.Linear(obs_dim, hidden_layer1_size), 230 | nn.ReLU(), 231 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 232 | nn.ReLU(), 233 | ) 234 | 235 | self.mean_net = nn.Sequential( 236 | nn.Linear(hidden_layer2_size, action_dim), 237 | nn.Hardtanh(min_action_val[0], max_action_val[0]), 238 | nn.Hardtanh(min_action_val[1], max_action_val[1]) 239 | ) 240 | 241 | self.log_std_net = nn.Sequential( 242 | nn.Linear(hidden_layer2_size, 1), 243 | ) 244 | 245 | def forward(self, obs): 246 | x = self.base_net(obs) 247 | mean = self.mean_net(x) 248 | log_std = self.log_std_net(x) 249 | return mean, log_std 250 | 251 | 252 | class PPOBase: 253 | def __init__(self, 254 | env, 255 | policy, 256 | value_function, 257 | policy_lr, 258 | value_lr, 259 | entropy_coef=0.0, 260 | clip_range=0.2, 261 | n_epochs=10, 262 | batch_size=64, 263 | weight_decay=0.0, 264 | gamma=0.99, 265 | buffer_size=2048, 266 | enable_cuda=True, 267 | policy_optimizer=torch.optim.Adam, 268 | value_optimizer=torch.optim.Adam, 269 | grad_clip_radius=None): 270 | 271 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 272 | 'returning information about CBF-related safety.') 273 | 274 | self.env = env 275 | self.pi = policy 276 | self.v = value_function 277 | self.entropy_coef = entropy_coef 278 | self.clip_range = clip_range 279 | self.n_epochs = n_epochs 280 | self.batch_size = batch_size 281 | 282 | self.__cuda_enabled = enable_cuda 283 | self.enable_cuda(self.__cuda_enabled, warn=False) 284 | # NOTE: self.device is defined when self.enable_cuda is called! 285 | 286 | self.pi_optim = policy_optimizer(self.pi.parameters(), 287 | lr=policy_lr, 288 | weight_decay=weight_decay) 289 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 290 | self.grad_clip_radius = grad_clip_radius 291 | 292 | self.rollout_buffer = RolloutBuffer( 293 | buffer_size, 294 | env.observation_space, 295 | env.action_space, 296 | device=self.device, 297 | gamma=gamma 298 | ) 299 | 300 | @property 301 | def cuda_enabled(self): 302 | return self.__cuda_enabled 303 | 304 | def enable_cuda(self, enable_cuda=True, warn=True): 305 | """Enable or disable cuda and update models.""" 306 | 307 | if warn: 308 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 309 | "initializing optimizers can give errors when using " 310 | "optimizers other than SGD or Adam!") 311 | 312 | self.__cuda_enabled = enable_cuda 313 | self.device = torch.device( 314 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 315 | else 'cpu') 316 | self.pi.to(self.device) 317 | self.v.to(self.device) 318 | 319 | def load_models(self, filename, enable_cuda=True, continue_training=True): 320 | """ 321 | Load policy and value functions. Copy them to target functions. 322 | 323 | This method is for evaluation only. Use load_checkpoint to continue 324 | training. 
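
        (No separate target networks are maintained in this implementation; the
        loaded policy and value networks are simply placed in eval mode.)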
325 | """ 326 | 327 | models = torch.load(filename) 328 | 329 | self.pi.load_state_dict(models['pi_state_dict']) 330 | self.v.load_state_dict(models['v_state_dict']) 331 | 332 | self.pi.eval() 333 | self.v.eval() 334 | 335 | self.enable_cuda(enable_cuda, warn=False) 336 | 337 | def save_checkpoint(self, filename): 338 | """Save state_dicts of models and optimizers.""" 339 | 340 | torch.save({ 341 | 'using_cuda': self.__cuda_enabled, 342 | 'pi_state_dict': self.pi.state_dict(), 343 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 344 | 'v_state_dict': self.v.state_dict(), 345 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 346 | }, filename) 347 | 348 | def load_checkpoint(self, filename, continue_training=True): 349 | """Load state_dicts for models and optimizers.""" 350 | 351 | checkpoint = torch.load(filename) 352 | 353 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 354 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 355 | self.v.load_state_dict(models['v_state_dict']) 356 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 357 | 358 | if continue_training: 359 | self.pi.train() 360 | self.v.train() 361 | else: 362 | self.pi.eval() 363 | self.v.eval() 364 | 365 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 366 | 367 | def collect_rollout(self, env, rollout_length): 368 | """ 369 | Perform a rollout and fill the rollout buffer. 370 | """ 371 | 372 | self._last_obs = env.reset() 373 | self._last_episode_start = np.zeros(1) 374 | n_steps = 0 375 | self.rollout_buffer.reset() 376 | 377 | num_unsafe_steps = 0 378 | x_t=[] 379 | y_t=[] 380 | 381 | local_flag_done = False 382 | while n_steps < rollout_length: 383 | action_dim=get_action_dim(env.action_space) 384 | 385 | 386 | with torch.no_grad(): 387 | # Convert to pytorch tensor or to TensorDict 388 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 389 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 390 | value = self.v(obs_tensor) 391 | action = action.cpu().numpy() 392 | 393 | # Rescale and perform action 394 | clipped_action = action 395 | # Clip the actions to avoid out of bound error 396 | if isinstance(self.env.action_space, gym.spaces.Box): 397 | clipped_action = np.clip(action, self.env.action_space.low, 398 | self.env.action_space.high) 399 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 400 | clipped_action = int(clipped_action) 401 | 402 | new_obs, reward, done, info = env.step(clipped_action) 403 | 404 | 405 | if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 406 | print("crash") 407 | 408 | n_steps += 1 409 | 410 | if isinstance(self.env.action_space, gym.spaces.Discrete): 411 | # Reshape in case of discrete action 412 | action = action.reshape(-1, 1) 413 | 414 | self.rollout_buffer.add(self._last_obs, action, reward, 415 | self._last_episode_start, value, log_prob) 416 | self._last_obs = new_obs.flatten() 417 | self._last_episode_start = done 418 | 419 | if done == 0 and local_flag_done == False: 420 | x_t.append(new_obs[0]) 421 | y_t.append(new_obs[1]) 422 | 423 | elif done == 1 and n_steps>1: 424 | local_flag_done = True 425 | 426 | plt.xlim(np.double(env.min_x),np.double(env.max_x)) 427 | plt.ylim(np.double(env.min_y),np.double(env.max_y)) 428 | plt.xlabel('X axis') 429 | plt.ylabel('Y-axis') 430 | plt.plot(x_t,y_t) 431 | plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 432 | plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 433 | 434 | def f(x, y, xa, yb, a, 
b): 435 | return (x - xa)**4/a**4 + (y - yb)**4/b**4 436 | 437 | # Define the point around which to plot 438 | xa, yb = env.obstacle[0], env.obstacle[1] 439 | 440 | # Define the range of x and y values to plot 441 | x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 442 | y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 443 | 444 | # Create a grid of x and y values 445 | X, Y = np.meshgrid(x_vals, y_vals) 446 | 447 | # Evaluate the function at each point in the grid 448 | Z = f(X, Y, xa, yb, env.a_d, env.b_d) 449 | 450 | # Plot the function as a contour plot 451 | 452 | 453 | folder_name_main = f"{{{env.date}}}" 454 | os.makedirs(folder_name_main, exist_ok=True) 455 | ##Change the current working directory to the newly created folder 456 | os.chdir(folder_name_main) 457 | 458 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 459 | os.makedirs(folder_name, exist_ok=True) 460 | ##Change the current working directory to the newly created folder 461 | os.chdir(folder_name) 462 | 463 | 464 | folder_name_1 = f"{{lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}}}" 465 | os.makedirs(folder_name_1, exist_ok=True) 466 | os.chdir(folder_name_1) 467 | 468 | if (env.episodes)%1 == 0: 469 | plt.savefig(f"ep={env.episodes}.png") 470 | with open(f"episode={env.episodes}.csv", 'w', newline='') as file: 471 | writer = csv.writer(file) 472 | writer.writerow(x_t) 473 | writer.writerow(y_t) 474 | plt.contour(X, Y, Z, levels=[env.safety_dist]) 475 | 476 | 477 | # Return to the original working directory (optional) 478 | os.chdir('..') 479 | os.chdir('..') 480 | os.chdir('..') 481 | 482 | 483 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 484 | done=done) 485 | 486 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 487 | 488 | return np.sum(self.rollout_buffer.rewards), safety_rate 489 | 490 | def train(self): 491 | """ 492 | Train on the current rollout buffer. 
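
        Each epoch iterates over minibatches drawn from the rollout buffer and
        takes gradient steps on the clipped PPO surrogate

            L(theta) = -E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ]
                       - entropy_coef * H[pi_theta],

        where r_t = exp(log pi_theta(a_t | s_t) - log pi_old(a_t | s_t)) is the
        probability ratio, A_t is the advantage normalized to zero mean and unit
        variance, and eps = clip_range. The value network is fit to the empirical
        returns with an MSE loss.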
493 | """ 494 | for epoch in range(self.n_epochs): 495 | 496 | # Do a complete pass on the rollout buffer 497 | for rollout_data in self.rollout_buffer.get(self.batch_size): 498 | 499 | actions = rollout_data.actions 500 | obs = rollout_data.observations 501 | values = self.v(obs).flatten() 502 | try: 503 | log_probs = self.pi.log_probs(obs, actions) 504 | except: 505 | print(self.pi.log_probs(obs, actions)) 506 | import pdb; pdb.set_trace() 507 | 508 | entropies = self.pi.entropy(obs) 509 | if log_probs.device!=actions.device: 510 | log_probs=log_probs.to('cuda:0') 511 | entropies=entropies.to('cuda:0') 512 | advantages = rollout_data.advantages 513 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 514 | 515 | # ratio between old and new policy, should be one at the first iteration 516 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 517 | 518 | 519 | policy_loss_1 = advantages * ratio 520 | policy_loss_2 = advantages * torch.clamp(ratio, 521 | 1 - self.clip_range, 522 | 1 + self.clip_range) 523 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 524 | self.entropy_coef * entropies.mean() 525 | 526 | self.pi_optim.zero_grad() 527 | policy_loss.backward() 528 | # Clip grad norm 529 | if self.grad_clip_radius is not None: 530 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 531 | self.grad_clip_radius) 532 | self.pi_optim.step() 533 | 534 | value_loss = F.mse_loss(rollout_data.returns, values) 535 | 536 | self.v_optim.zero_grad() 537 | value_loss.backward() 538 | # Clip grad norm 539 | if self.grad_clip_radius is not None: 540 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 541 | self.grad_clip_radius) 542 | self.v_optim.step() 543 | -------------------------------------------------------------------------------- /experiments/sampling_beta/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributions as td 4 | from torch.nn import functional as F 5 | import gym 6 | from gym import spaces 7 | import numpy as np 8 | from typing import NamedTuple 9 | import warnings 10 | from matplotlib import pyplot as plt 11 | import os 12 | import csv 13 | 14 | from stable_baselines3.common.utils import obs_as_tensor 15 | from stable_baselines3.common.preprocessing import ( 16 | get_obs_shape, get_action_dim 17 | ) 18 | 19 | 20 | #from agents.TruncatedNormal import TruncatedNormal as tn 21 | 22 | from wesutils import two_layer_net 23 | 24 | class RolloutBufferSamples(NamedTuple): 25 | observations: torch.Tensor 26 | actions: torch.Tensor 27 | old_values: torch.Tensor 28 | old_log_prob: torch.Tensor 29 | advantages: torch.Tensor 30 | returns: torch.Tensor 31 | 32 | 33 | class RolloutBuffer: 34 | 35 | def __init__(self, 36 | buffer_size, 37 | observation_space, 38 | action_space, 39 | gamma=0.90, 40 | device='cpu'): 41 | 42 | self.buffer_size = buffer_size 43 | self.observation_space = observation_space 44 | self.action_space = action_space 45 | self.gamma = gamma 46 | self.device = device 47 | self.obs_shape = get_obs_shape(self.observation_space) 48 | self.action_dim = get_action_dim(self.action_space) 49 | 50 | self.reset() 51 | 52 | def reset(self): 53 | 54 | self.observations = np.zeros( 55 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 56 | ) 57 | self.actions = np.zeros( 58 | (self.buffer_size, self.action_dim), dtype=np.float32 59 | ) 60 | self.rewards = np.zeros( 61 | (self.buffer_size,), dtype=np.float32 62 | ) 63 | 
self.episode_starts = np.zeros( 64 | (self.buffer_size,), dtype=np.float32 65 | ) 66 | self.values = np.zeros( 67 | (self.buffer_size,), dtype=np.float32 68 | ) 69 | self.log_probs = np.zeros( 70 | (self.buffer_size,), dtype=np.float32 71 | ) 72 | self.advantages = np.zeros( 73 | (self.buffer_size,), dtype=np.float32 74 | ) 75 | 76 | self.full = False 77 | self.pos = 0 78 | 79 | def compute_returns_and_advantage(self, last_value, done): 80 | 81 | last_value = last_value.clone().cpu().numpy().flatten() 82 | 83 | discounted_reward = 0 84 | for step in reversed(range(self.buffer_size)): 85 | if step == self.buffer_size - 1: 86 | next_non_terminal = 1.0 - done 87 | next_value = last_value 88 | else: 89 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 90 | next_value = self.values[step + 1] 91 | discounted_reward = self.rewards[step] + \ 92 | self.gamma * discounted_reward * next_non_terminal 93 | self.advantages[step] = discounted_reward - self.values[step] 94 | self.returns = self.advantages + self.values 95 | 96 | def add(self, obs, action, reward, episode_start, value, log_prob): 97 | 98 | if len(log_prob.shape) == 0: 99 | log_prob = log_prob.reshape(-1, 1) 100 | 101 | if isinstance(self.observation_space, spaces.Discrete): 102 | obs = obs.reshape((1,) + self.obs_shape) 103 | 104 | self.observations[self.pos] = np.array(obs).copy() 105 | self.actions[self.pos] = np.array(action).copy() 106 | self.rewards[self.pos] = np.array(reward).copy() 107 | self.episode_starts[self.pos] = np.array(episode_start).copy() 108 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 109 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 110 | self.pos += 1 111 | if self.pos == self.buffer_size: 112 | self.full = True 113 | 114 | def get(self, batch_size=None): 115 | assert self.full, "" 116 | indices = np.random.permutation(self.buffer_size) 117 | 118 | # Return everything, don't create minibatches 119 | if batch_size is None: 120 | batch_size = self.buffer_size 121 | 122 | start_idx = 0 123 | while start_idx < self.buffer_size: 124 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 125 | start_idx += batch_size 126 | 127 | def _get_samples(self, batch_inds): 128 | data = ( 129 | self.observations[batch_inds], 130 | self.actions[batch_inds], 131 | self.values[batch_inds].flatten(), 132 | self.log_probs[batch_inds].flatten(), 133 | self.advantages[batch_inds].flatten(), 134 | self.returns[batch_inds].flatten(), 135 | ) 136 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 137 | 138 | def to_torch(self, array, copy=True): 139 | if copy: 140 | return torch.tensor(array).to(self.device) 141 | return torch.as_tensor(array).to(self.device) 142 | 143 | 144 | class PolicyNetwork(nn.Module): 145 | """Base class for stochastic policy networks.""" 146 | 147 | def __init__(self): 148 | super().__init__() 149 | 150 | def forward(self, state): 151 | """Take state as input, then output the parameters of the policy.""" 152 | 153 | raise NotImplemented("forward not implemented.") 154 | 155 | def sample(self, state): 156 | """ 157 | Sample an action based on the model parameters given the current state. 158 | """ 159 | 160 | raise NotImplemented("sample not implemented.") 161 | 162 | def log_probs(self, obs, actions): 163 | """ 164 | Return log probabilities for each state-action pair. 165 | """ 166 | 167 | raise NotImplemented("log_probs not implemented.") 168 | 169 | def entropy(self, obs): 170 | """ 171 | Return entropy of the policy for each state. 
172 | """ 173 | 174 | raise NotImplemented("entropy not implemented.") 175 | 176 | 177 | class BetaPolicyBase(PolicyNetwork): 178 | """ 179 | Base class for Beta policy. 180 | 181 | Desired network needs to be implemented. 182 | """ 183 | 184 | def __init__(self, constraint_fn, action_dim, enable_cuda=False): 185 | 186 | super().__init__() 187 | 188 | self.device = torch.device( 189 | 'cuda' if torch.cuda.is_available() and enable_cuda \ 190 | else 'cpu') 191 | self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 192 | self.action_dim = action_dim 193 | 194 | def _vectorize_f(self, f, action_dim): 195 | """ 196 | Converts a function f defined on 1D numpy arrays and outputting pairs of 197 | scalars into a vectorized function accepting batches of 198 | torch tensorized arrays and output pairs of torch tensors. 199 | """ 200 | 201 | def vectorized_f(obs, action_dim): 202 | 203 | obs = obs.cpu().detach().numpy() 204 | 205 | if len(obs.shape) == 1: # check to see if obs is a batch or single obs 206 | batch_size = 1 207 | lbs, ubs = f(obs) 208 | 209 | else: 210 | batch_size = obs.shape[0] 211 | lbs = np.zeros([batch_size, self.action_dim]) 212 | ubs = np.zeros([batch_size, self.action_dim]) 213 | for i in range(batch_size): 214 | lbs[i], ubs[i] = f(obs[i]) 215 | 216 | lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 217 | ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 218 | lbs = lbs.to(self.device) 219 | ubs = ubs.to(self.device) 220 | 221 | return lbs, ubs 222 | 223 | return vectorized_f 224 | 225 | def sample(self, obs, action_dim, no_log_prob=False): 226 | """ 227 | Sample from independent Beta distributions across each action_dim. 228 | """ 229 | 230 | assert len(obs.shape) == 1, 'obs must be a flat array' 231 | 232 | alphas, betas = self.forward(obs) 233 | alphas, betas = torch.flatten(alphas), torch.flatten(betas) 234 | dists = [ 235 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 236 | ] 237 | action_along_dims = [dist.rsample() for dist in dists] 238 | action = torch.tensor(action_along_dims, requires_grad=True) 239 | log_prob = torch.sum(torch.tensor([ 240 | dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 241 | ], requires_grad=True)) 242 | lb, ub = self.constraint_fn(obs, action_dim) 243 | action = lb + (ub - lb) * action 244 | return action if no_log_prob else (action, log_prob) 245 | 246 | def log_probs(self, obs, actions, action_dim): 247 | alphas_arr, betas_arr = self.forward(obs) 248 | dists = [] 249 | 250 | alphas_arr_1 = alphas_arr[:,0] 251 | alphas_arr_2 = alphas_arr[:,1] 252 | betas_arr_1 = betas_arr[:,0] 253 | betas_arr_2 = betas_arr[:,1] 254 | try: 255 | dists_1 = td.Beta(alphas_arr_1, betas_arr_1) 256 | except: 257 | import pdb; pdb.set_trace() 258 | 259 | try: 260 | dists_2 = td.Beta(alphas_arr_2, betas_arr_2) 261 | except: 262 | import pdb; pdb.set_trace() 263 | 264 | for i in range(alphas_arr.shape[0]): 265 | alphas = alphas_arr[i] 266 | betas = betas_arr[i] 267 | dists.append([ 268 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 269 | ]) 270 | 271 | lbs, ubs = self.constraint_fn(obs, action_dim) 272 | if lbs.device!=actions.device: 273 | lbs = lbs.to('cuda:0') 274 | ubs = ubs.to('cuda:0') 275 | actions = (actions - lbs) / (ubs - lbs) 276 | actions = actions.clip(0, 1) 277 | 278 | log_probs = [] 279 | for action, action_dists in zip(actions, dists): 280 | log_probs.append( 281 | torch.sum(torch.tensor([ 282 | dim_dist.log_prob(dim_action) \ 283 | for dim_dist, dim_action in 
zip(action_dists, action) 284 | ], requires_grad=True)) 285 | ) 286 | log_probs = torch.tensor(log_probs, requires_grad=True) 287 | 288 | return_new = dists_1.log_prob(actions[:,0]).flatten() + dists_2.log_prob(actions[:,1]).flatten() 289 | 290 | 291 | return return_new 292 | 293 | def entropy(self, obs): 294 | """ 295 | Returns sum of entropies along each independent action dimension. 296 | """ 297 | alphas_arr, betas_arr = self.forward(obs) 298 | dists = [] 299 | for i in range(alphas_arr.shape[0]): 300 | alphas = alphas_arr[i] 301 | betas = betas_arr[i] 302 | dists.append([ 303 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 304 | ]) 305 | entropies = torch.tensor( 306 | [torch.sum(torch.tensor([dist.entropy() for dist in dist_list])) \ 307 | for dist_list in dists] 308 | ) 309 | return entropies 310 | 311 | 312 | 313 | class BetaPolicy(BetaPolicyBase): 314 | """ 315 | Beta policy using a two-layer, two-headed MLP with ReLU activation. 316 | """ 317 | 318 | def __init__(self, obs_dim, constraint_fn, action_dim, 319 | hidden_layer1_size=64, 320 | hidden_layer2_size=64): 321 | 322 | super().__init__(constraint_fn, action_dim=action_dim) 323 | 324 | self.base_net = nn.Sequential( 325 | nn.Linear(obs_dim, hidden_layer1_size), 326 | nn.Tanh(), 327 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 328 | nn.Tanh(), 329 | ) 330 | 331 | self.alpha_net = nn.Sequential( 332 | nn.Linear(hidden_layer2_size, action_dim), 333 | nn.Softplus(), 334 | ) 335 | 336 | self.beta_net = nn.Sequential( 337 | nn.Linear(hidden_layer2_size, action_dim), 338 | nn.Softplus(), 339 | ) 340 | 341 | def forward(self, obs): 342 | 343 | x = self.base_net(obs) 344 | alpha = 1.0 + self.alpha_net(x) 345 | beta = 1.0 + self.beta_net(x) 346 | 347 | return alpha, beta 348 | 349 | 350 | class CategoricalPolicy(PolicyNetwork): 351 | """ 352 | Base class for categorical policy. 353 | 354 | Desired network needs to be implemented. 355 | """ 356 | 357 | def __init__(self, num_actions): 358 | 359 | super().__init__() 360 | 361 | self.num_actions = num_actions 362 | 363 | def sample(self, obs, no_log_prob=False): 364 | logits = self.forward(obs) 365 | dist = td.Categorical(logits=logits) 366 | action = dist.sample(sample_shape=torch.tensor([1])) 367 | return action if no_log_prob else (action, dist.log_prob(action)) 368 | 369 | def log_probs(self, obs, actions): 370 | dists = td.Categorical(logits=self.forward(obs)) 371 | return dists.log_prob(actions.flatten()) 372 | 373 | def entropy(self, obs): 374 | dists = td.Categorical(logits=self.forward(obs)) 375 | return dists.entropy() 376 | 377 | 378 | class CategoricalPolicyTwoLayer(CategoricalPolicy): 379 | """ 380 | Categorical policy using a fully connected two-layer network. 
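The Beta policy above keeps every action inside the state-dependent interval returned by the CBF: each dimension is sampled from a Beta distribution on (0, 1) and affinely rescaled into [lb, ub], and `log_probs` applies the inverse map before evaluating the Beta density. The `1.0 +` shift after the Softplus heads keeps alpha, beta >= 1, so each per-dimension Beta is unimodal. A small self-contained sketch of that forward and inverse mapping; the bounds and the alpha, beta values are made-up numbers rather than network outputs:

```python
import torch
import torch.distributions as td

# Hypothetical per-dimension CBF bounds for a 2-D action (illustrative numbers).
lb = torch.tensor([-1.0, -2.0])
ub = torch.tensor([ 1.0,  3.0])

# These would normally come from BetaPolicy.forward (Softplus output + 1.0).
alpha = torch.tensor([2.0, 5.0])
beta  = torch.tensor([2.0, 1.5])

dist = td.Beta(alpha, beta)          # independent Beta per action dimension
u = dist.rsample()                   # sample on (0, 1)^2, reparameterized
action = lb + (ub - lb) * u          # affine map into the CBF interval [lb, ub]

# Inverse map used at training time before evaluating log-probabilities,
# mirroring BetaPolicyBase.log_probs: normalize, clip to [0, 1], sum over dims.
u_back = ((action - lb) / (ub - lb)).clip(0, 1)
log_prob = dist.log_prob(u_back).sum()

print(action, log_prob)
```

As in `BetaPolicyBase.log_probs`, the density is evaluated on the normalized action and summed over the independent action dimensions.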
381 | """ 382 | 383 | def __init__(self, state_dim, num_actions, 384 | hidden_layer1_size=64, 385 | hidden_layer2_size=64, 386 | init_std=0.001): 387 | 388 | super().__init__(num_actions) 389 | 390 | self.init_std = init_std 391 | 392 | self.linear1 = nn.Linear(state_dim, hidden_layer1_size) 393 | self.linear2 = nn.Linear(hidden_layer1_size, hidden_layer2_size) 394 | self.linear3 = nn.Linear(hidden_layer2_size, num_actions) 395 | nn.init.normal_(self.linear1.weight, std=init_std) 396 | nn.init.normal_(self.linear2.weight, std=init_std) 397 | nn.init.normal_(self.linear3.weight, std=init_std) 398 | 399 | def forward(self, state): 400 | x = F.relu(self.linear1(state)) 401 | x = F.relu(self.linear2(x)) 402 | output = self.linear3(x) 403 | return output 404 | 405 | 406 | class PPOBase: 407 | def __init__(self, 408 | env, 409 | policy, 410 | value_function, 411 | policy_lr, 412 | value_lr, 413 | entropy_coef=0.0, 414 | clip_range=0.2, 415 | n_epochs=10, 416 | batch_size=64, 417 | weight_decay=0.0, 418 | gamma=0.99, 419 | buffer_size=2048, 420 | enable_cuda=True, 421 | policy_optimizer=torch.optim.Adam, 422 | value_optimizer=torch.optim.Adam, 423 | grad_clip_radius=None): 424 | 425 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 426 | 'returning information about CBF-related safety.') 427 | 428 | self.env = env 429 | self.pi = policy 430 | self.v = value_function 431 | self.entropy_coef = entropy_coef 432 | self.clip_range = clip_range 433 | self.n_epochs = n_epochs 434 | self.batch_size = batch_size 435 | 436 | self.__cuda_enabled = enable_cuda 437 | self.enable_cuda(self.__cuda_enabled, warn=False) 438 | # NOTE: self.device is defined when self.enable_cuda is called! 439 | 440 | self.pi_optim = policy_optimizer(self.pi.parameters(), 441 | lr=policy_lr, 442 | weight_decay=weight_decay) 443 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 444 | self.grad_clip_radius = grad_clip_radius 445 | 446 | self.rollout_buffer = RolloutBuffer( 447 | buffer_size, 448 | env.observation_space, 449 | env.action_space, 450 | device=self.device, 451 | gamma=gamma 452 | ) 453 | 454 | @property 455 | def cuda_enabled(self): 456 | return self.__cuda_enabled 457 | 458 | def enable_cuda(self, enable_cuda=True, warn=True): 459 | """Enable or disable cuda and update models.""" 460 | 461 | if warn: 462 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 463 | "initializing optimizers can give errors when using " 464 | "optimizers other than SGD or Adam!") 465 | 466 | self.__cuda_enabled = enable_cuda 467 | self.device = torch.device( 468 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 469 | else 'cpu') 470 | self.pi.to(self.device) 471 | self.v.to(self.device) 472 | 473 | def load_models(self, filename, enable_cuda=True, continue_training=True): 474 | """ 475 | Load policy and value functions. Copy them to target functions. 476 | 477 | This method is for evaluation only. Use load_checkpoint to continue 478 | training. 
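For orientation, this is roughly how the pieces defined above fit together: `PPOBase` takes an environment, a policy such as `BetaPolicy`, and a value network, and builds its own rollout buffer and optimizers. This is a hedged construction sketch only; the dummy environment, the `constraint_fn`, the hyperparameter values, and the import path are placeholders rather than anything prescribed by the repository:

```python
import gym
import numpy as np
from types import SimpleNamespace
from wesutils import two_layer_net
from agents.ppo import BetaPolicy, PPOBase   # module path assumed; adjust to your checkout

# Stand-in environment exposing only what PPOBase touches at construction time.
obs_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
act_space = gym.spaces.Box(low=-10.0, high=10.0, shape=(2,), dtype=np.float32)
dummy_env = SimpleNamespace(observation_space=obs_space, action_space=act_space)

def constraint_fn(obs):
    """Hypothetical CBF: per-dimension lower and upper action bounds for this state."""
    return np.array([-10.0, -10.0]), np.array([10.0, 10.0])

policy = BetaPolicy(obs_dim=4, constraint_fn=constraint_fn, action_dim=2,
                    hidden_layer1_size=256, hidden_layer2_size=256)
value_fn = two_layer_net(input_dim=4, output_dim=1,
                         hidden_layer1_size=256, hidden_layer2_size=256,
                         activation='ReLU')

agent = PPOBase(dummy_env, policy, value_fn,
                policy_lr=6e-4, value_lr=6e-4,
                entropy_coef=0.0, clip_range=0.2,
                n_epochs=10, batch_size=64,
                gamma=0.99, buffer_size=180,
                enable_cuda=False)
```

A driver script would then typically alternate `collect_rollout` and `train`, the methods defined below.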
479 | """ 480 | 481 | models = torch.load(filename) 482 | 483 | self.pi.load_state_dict(models['pi_state_dict']) 484 | self.v.load_state_dict(models['v_state_dict']) 485 | 486 | self.pi.eval() 487 | self.v.eval() 488 | 489 | self.enable_cuda(enable_cuda, warn=False) 490 | 491 | def save_checkpoint(self, filename): 492 | """Save state_dicts of models and optimizers.""" 493 | 494 | torch.save({ 495 | 'using_cuda': self.__cuda_enabled, 496 | 'pi_state_dict': self.pi.state_dict(), 497 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 498 | 'v_state_dict': self.v.state_dict(), 499 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 500 | }, filename) 501 | 502 | def load_checkpoint(self, filename, continue_training=True): 503 | """Load state_dicts for models and optimizers.""" 504 | 505 | checkpoint = torch.load(filename) 506 | 507 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 508 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 509 | self.v.load_state_dict(models['v_state_dict']) 510 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 511 | 512 | if continue_training: 513 | self.pi.train() 514 | self.v.train() 515 | else: 516 | self.pi.eval() 517 | self.v.eval() 518 | 519 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 520 | 521 | def collect_rollout(self, env, rollout_length): 522 | """ 523 | Perform a rollout and fill the rollout buffer. 524 | """ 525 | 526 | self._last_obs = env.reset() 527 | self._last_episode_start = np.zeros(1) 528 | n_steps = 0 529 | self.rollout_buffer.reset() 530 | 531 | num_unsafe_steps = 0 532 | x_t=[] 533 | y_t=[] 534 | 535 | local_flag_done = False 536 | while n_steps < rollout_length: 537 | 538 | action_dim=get_action_dim(env.action_space) 539 | 540 | with torch.no_grad(): 541 | # Convert to pytorch tensor or to TensorDict 542 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 543 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 544 | value = self.v(obs_tensor) 545 | action = action.cpu().numpy() 546 | 547 | # Rescale and perform action 548 | clipped_action = action 549 | # Clip the actions to avoid out of bound error 550 | if isinstance(self.env.action_space, gym.spaces.Box): 551 | clipped_action = np.clip(action, self.env.action_space.low, 552 | self.env.action_space.high) 553 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 554 | clipped_action = int(clipped_action) 555 | 556 | new_obs, reward, done, info = env.step(clipped_action) 557 | 558 | x_t.append(new_obs[0]) 559 | y_t.append(new_obs[1]) 560 | if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 561 | print("crash") 562 | 563 | n_steps += 1 564 | 565 | if isinstance(self.env.action_space, gym.spaces.Discrete): 566 | # Reshape in case of discrete action 567 | action = action.reshape(-1, 1) 568 | 569 | self.rollout_buffer.add(self._last_obs, action, reward, 570 | self._last_episode_start, value, log_prob) 571 | self._last_obs = new_obs.flatten() 572 | self._last_episode_start = done 573 | 574 | if n_steps == rollout_length: 575 | env.reset() 576 | 577 | 578 | plt.xlim(np.double(env.min_x),np.double(env.max_x)) 579 | plt.ylim(np.double(env.min_y),np.double(env.max_y)) 580 | plt.xlabel('X axis') 581 | plt.ylabel('Y-axis') 582 | plt.plot(x_t,y_t) 583 | plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 584 | plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 585 | 586 | 587 | def f(x, y, xa, yb, a, b): 588 | return (x - xa)**4/a**4 + (y - yb)**4/b**4 589 | 
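The nested helper `f` defined just above is a quartic "super-ellipse" level function; a few lines further down its `env.safety_dist` contour is drawn around the obstacle on top of the trajectory plot. A standalone sketch of the same function, used here to flag points that fall inside that level set (the obstacle position, half-widths, and threshold are made-up stand-ins for `env.obstacle`, `env.a_d`, `env.b_d`, and `env.safety_dist`):

```python
import numpy as np

def quartic_level(x, y, xa, yb, a, b):
    """Same form as f above: the curve quartic_level == c is a rounded box
    centred on (xa, yb) with half-widths a * c**0.25 and b * c**0.25."""
    return (x - xa) ** 4 / a ** 4 + (y - yb) ** 4 / b ** 4

# Hypothetical obstacle and a short trajectory passing by it.
xa, yb, a_d, b_d, safety_dist = 2.0, 2.0, 0.5, 0.5, 1.0
x_t = np.array([0.0, 1.0, 1.8, 2.2, 3.0])
y_t = np.array([0.0, 1.5, 2.0, 2.1, 2.5])

levels = quartic_level(x_t, y_t, xa, yb, a_d, b_d)
print(levels)
print(levels < safety_dist)   # True where a point lies inside the plotted contour
```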
590 | # Define the point around which to plot 591 | xa, yb = env.obstacle[0], env.obstacle[1] 592 | 593 | # Define the range of x and y values to plot 594 | x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 595 | y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 596 | 597 | # Create a grid of x and y values 598 | X, Y = np.meshgrid(x_vals, y_vals) 599 | 600 | # Evaluate the function at each point in the grid 601 | Z = f(X, Y, xa, yb, env.a_d, env.b_d) 602 | 603 | # Plot the function as a contour plot 604 | ##Create a folder in the current directory 605 | folder_name_main = f"{{{env.date}}}" 606 | os.makedirs(folder_name_main, exist_ok=True) 607 | ##Change the current working directory to the newly created folder 608 | os.chdir(folder_name_main) 609 | 610 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 611 | os.makedirs(folder_name, exist_ok=True) 612 | ##Change the current working directory to the newly created folder 613 | os.chdir(folder_name) 614 | 615 | folder_name_1 = f"{{lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}}}" 616 | os.makedirs(folder_name_1, exist_ok=True) 617 | os.chdir(folder_name_1) 618 | 619 | with open(f"episode={env.episodes}.csv", 'w', newline='') as file: 620 | writer = csv.writer(file) 621 | writer.writerow(x_t) 622 | writer.writerow(y_t) 623 | if (env.episodes)%1 == 0: 624 | plt.savefig(f"ep={env.episodes}.png") 625 | plt.contour(X, Y, Z, levels=[env.safety_dist]) 626 | 627 | os.chdir('..') 628 | os.chdir('..') 629 | os.chdir('..') 630 | 631 | 632 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 633 | done=done) 634 | 635 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 636 | 637 | return np.sum(self.rollout_buffer.rewards), safety_rate 638 | 639 | def train(self): 640 | """ 641 | Train on the current rollout buffer. 
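Each rollout is also logged to disk as a two-row CSV, x coordinates first and y coordinates second, inside a results folder whose name encodes the run settings. A small helper, not part of the repository, for loading one of those files back for offline analysis:

```python
import csv
import matplotlib.pyplot as plt

def load_trajectory(path):
    """Read back one per-episode CSV written above: row 0 is x_t, row 1 is y_t."""
    with open(path, newline="") as f:
        rows = list(csv.reader(f))
    x_t = [float(v) for v in rows[0]]
    y_t = [float(v) for v in rows[1]]
    return x_t, y_t

# Usage (the path is illustrative; the real folders are built from env.date, the
# run settings, and the hyperparameters exactly as in collect_rollout above):
# x_t, y_t = load_trajectory("episode=100.csv")
# plt.plot(x_t, y_t); plt.xlabel("x"); plt.ylabel("y"); plt.show()
```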
642 | """ 643 | #action_dim = get_action_dim(self.action_space) 644 | for epoch in range(self.n_epochs): 645 | 646 | # Do a complete pass on the rollout buffer 647 | for rollout_data in self.rollout_buffer.get(self.batch_size): 648 | 649 | actions = rollout_data.actions 650 | obs = rollout_data.observations 651 | values = self.v(obs).flatten() 652 | try: 653 | log_probs = self.pi.log_probs(obs, actions, actions.shape[1]) 654 | except: 655 | print(self.pi.log_probs(obs, actions, actions.shape[1])) 656 | import pdb; pdb.set_trace() 657 | 658 | entropies = self.pi.entropy(obs) 659 | if log_probs.device!=actions.device: 660 | log_probs=log_probs.to('cuda:0') 661 | entropies=entropies.to('cuda:0') 662 | advantages = rollout_data.advantages 663 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 664 | 665 | # ratio between old and new policy, should be one at the first iteration 666 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 667 | 668 | policy_loss_1 = advantages * ratio 669 | policy_loss_2 = advantages * torch.clamp(ratio, 670 | 1 - self.clip_range, 671 | 1 + self.clip_range) 672 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 673 | self.entropy_coef * entropies.mean() 674 | 675 | self.pi_optim.zero_grad() 676 | policy_loss.backward() 677 | # Clip grad norm 678 | if self.grad_clip_radius is not None: 679 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 680 | self.grad_clip_radius) 681 | self.pi_optim.step() 682 | 683 | value_loss = F.mse_loss(rollout_data.returns, values) 684 | 685 | self.v_optim.zero_grad() 686 | value_loss.backward() 687 | # Clip grad norm 688 | if self.grad_clip_radius is not None: 689 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 690 | self.grad_clip_radius) 691 | self.v_optim.step() 692 | -------------------------------------------------------------------------------- /Pendulum/agents/agents.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | import yaml 5 | from scipy import integrate 6 | from wesutils.utils import GaussianPolicyTwoLayer, two_layer_net 7 | from functools import reduce 8 | from operator import mul 9 | 10 | 11 | class GaussianPolicyCBF(GaussianPolicyTwoLayer): 12 | """ 13 | Modified version of the standard Gaussian policy for use in CBF- 14 | constrained settings. Provides access to the pdf of the policy at 15 | a given state as well as utilities for directly manipulating the 16 | parameters of the policy. Requires a CBF upon initilization. The 17 | CBF is assumed to accept torch tensor representations of the state. 18 | 19 | NOTE: This version of the policy assumes that action_dim=1 and that 20 | the CBF returns intervals. 
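Returning to the PPO update a few lines above: the policy loss is the standard clipped surrogate, minus an optional entropy bonus, and the critic is fit with a plain MSE against the buffer's returns. A compact standalone version of the policy-loss arithmetic with a toy check (the tensor values are illustrative only):

```python
import torch

def ppo_clipped_policy_loss(log_probs, old_log_probs, advantages,
                            clip_range=0.2, entropy=None, entropy_coef=0.0):
    """Clipped PPO surrogate, mirroring the loss assembled in PPOBase.train."""
    ratio = torch.exp(log_probs - old_log_probs)
    unclipped = advantages * ratio
    clipped = advantages * torch.clamp(ratio, 1 - clip_range, 1 + clip_range)
    loss = -torch.min(unclipped, clipped).mean()
    if entropy is not None:
        loss = loss - entropy_coef * entropy.mean()
    return loss

# At the first epoch the new and old log-probs coincide, the ratio is one, and
# the loss reduces to minus the mean advantage.
adv = torch.tensor([1.0, -0.5, 2.0])
lp = torch.tensor([-1.0, -2.0, -0.3])
print(ppo_clipped_policy_loss(lp, lp, adv))   # tensor(-0.8333)
```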
21 | """ 22 | 23 | def __init__(self, cbf, state_dim, action_dim=1, 24 | simple_cov=True, 25 | hidden_layer1_size=32, 26 | hidden_layer2_size=32, 27 | activation='sigmoid', 28 | log_std_min=-20, log_std_max=3, 29 | weight_init_std=0.0001): 30 | 31 | assert action_dim == 1, "Action dimension must be 1" 32 | 33 | super().__init__( 34 | state_dim, action_dim, 35 | simple_cov=simple_cov, 36 | hidden_layer1_size=hidden_layer1_size, 37 | hidden_layer2_size=hidden_layer2_size, 38 | activation=activation, 39 | log_std_min=log_std_min, log_std_max=log_std_max, 40 | weight_init_std=weight_init_std 41 | ) 42 | 43 | self.cbf = cbf 44 | self.param_shape = tuple(p.shape for p in self.parameters()) 45 | self.param_size = sum(reduce(mul, shape) for shape in self.param_shape) 46 | 47 | def to(self, device): 48 | super().to(device) 49 | self.device = device 50 | 51 | @property 52 | def params(self): 53 | """Get policy model parameters.""" 54 | return torch.cat([p.data.reshape(-1) for p in self.parameters()]) 55 | 56 | @params.setter 57 | def params(self, new_values): 58 | """Set policy model parameters.""" 59 | assert new_values.size()[0] == self.param_size, "Error" 60 | 61 | index = 0 62 | 63 | for param in self.parameters(): 64 | size = reduce(mul, param.shape) 65 | block = new_values[index:index+size].reshape(param.shape) 66 | param.data.copy_(block) 67 | index += size 68 | 69 | def _numpy_original_pdf(self, state): 70 | """ 71 | Return numpy version of the pdf of the untruncated policy at the 72 | state provided. 73 | """ 74 | 75 | mean, cov = self.forward(state) 76 | mean, cov = mean.detach().numpy(), cov.detach().numpy() 77 | K = np.float_power((2 * np.pi)**len(mean) * np.linalg.det(cov), -0.5) 78 | inv = np.linalg.inv(cov) 79 | 80 | def pdf(action): 81 | return K * np.exp( 82 | -0.5 * (action - mean).dot(inv.dot(action - mean)) 83 | ).flatten() 84 | 85 | return pdf 86 | 87 | def _torch_original_pdf(self, state): 88 | """ 89 | Return torch version of the pdf of the untruncated policy at the 90 | state provided. Detach is not called, so all computations herein 91 | are reflected in the computation graph. 92 | """ 93 | 94 | # import pdb; pdb.set_trace() 95 | 96 | mean, cov = self.forward(state) 97 | K = torch.float_power((2 * np.pi)**len(mean) * torch.linalg.det(cov), -0.5) 98 | inv = torch.linalg.inv(cov).squeeze(dim=0) 99 | 100 | def pdf(action): 101 | return K * torch.exp(-0.5 * torch.matmul( 102 | action - mean, torch.matmul(inv, action - mean))) 103 | 104 | return pdf 105 | 106 | def get_numpy_pdf(self, state): 107 | """ 108 | Return the pdf of the original Gaussian pdf truncated to the set C(x). 109 | """ 110 | 111 | lb, ub = self.cbf(state=state) 112 | original_pdf = self._numpy_original_pdf(state) 113 | normalization = integrate.quad(original_pdf, lb, ub)[0] 114 | 115 | def pdf(action): 116 | return original_pdf(action) / normalization if lb <= action <= ub \ 117 | else 0 118 | 119 | return pdf 120 | 121 | def sample(self, state, sample_cutoff=100, 122 | no_log_prob=False, num_log_prob_samples=1000): 123 | """ 124 | Repeatedly sample from the original Gaussian policy until an action 125 | lying within the CBF constraint set is generated. 126 | 127 | sample_cutoff specifies the number of times to sample using the 128 | original Gaussian policy before simply uniformly generating an 129 | action from the CBF constraint set. 
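`get_numpy_pdf` and `sample` above implement a CBF-truncated Gaussian: the base density is renormalized over the interval C(x) supplied by the CBF, and actions are drawn by rejection sampling (with a uniform fallback once `sample_cutoff` redraws have failed). A one-dimensional sketch of the same idea; the mean, scale, and bounds are made-up numbers:

```python
import numpy as np
from scipy import integrate
from scipy.stats import norm

# Restrict a Gaussian density to the CBF interval C(x) = [lb, ub] and renormalize,
# as GaussianPolicyCBF.get_numpy_pdf does with the network's mean and covariance.
mu, sigma = 0.0, 1.0
lb, ub = -0.5, 1.5          # hypothetical CBF bounds for the current state

base_pdf = lambda a: norm.pdf(a, loc=mu, scale=sigma)
Z = integrate.quad(base_pdf, lb, ub)[0]          # probability mass of C(x)

def truncated_pdf(a):
    return base_pdf(a) / Z if lb <= a <= ub else 0.0

# Rejection sampling, the same idea as GaussianPolicyCBF.sample: redraw from the
# untruncated Gaussian until the action lands inside [lb, ub].
rng = np.random.default_rng(0)
def sample_truncated():
    while True:
        a = rng.normal(mu, sigma)
        if lb <= a <= ub:
            return a

print(Z, truncated_pdf(0.0), sample_truncated())
```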
130 | """ 131 | 132 | lb, ub = self.cbf(state=state) 133 | lb = float(lb) 134 | ub = float(ub) 135 | state = torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 136 | orig_pdf = self._torch_original_pdf(state) 137 | 138 | if not no_log_prob: 139 | actions = lb + (ub - lb) * torch.rand( 140 | num_log_prob_samples, self.action_dim, 1, device=self.device 141 | ) 142 | log_prob = orig_pdf(actions).sum().log() # log pi_theta (C(x) | x) 143 | 144 | for _ in range(sample_cutoff): 145 | 146 | action, orig_log_prob = super().sample(state) 147 | 148 | if lb <= action <= ub: 149 | log_prob = orig_log_prob - log_prob # log pi^C_theta 150 | # = log pi_theta - log pi_theta(C(x) | x) 151 | return action, log_prob if not no_log_prob else action 152 | 153 | action = lb + (ub - lb) * torch.rand(1, 1) 154 | orig_log_prob = orig_pdf(action).log() 155 | return action, (orig_log_prob - log_prob) if not no_log_prob else action 156 | 157 | 158 | class CBFREINFORCEAgent: 159 | """ 160 | Agent for training a CBF-constrained version of the classic REINFORCE 161 | algorithm. 162 | """ 163 | 164 | def __init__(self, 165 | ### agent parameters 166 | state_dim, action_dim, cbf, 167 | policy_lr, discount_factor, 168 | num_log_prob_samples=1000, 169 | enable_cuda=False, 170 | optimizer=torch.optim.Adam, 171 | grad_clip_radius=None, 172 | ### policy parameters 173 | simple_cov=True, 174 | hidden_layer1_size=32, 175 | hidden_layer2_size=32, 176 | activation='relu', 177 | log_std_min=-20, log_std_max=3, 178 | weight_init_std=0.0001): 179 | 180 | assert action_dim == 1, "Action dimension must be 1 in this version" 181 | 182 | self.pi = GaussianPolicyCBF( 183 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 184 | simple_cov=simple_cov, 185 | hidden_layer1_size=hidden_layer1_size, 186 | hidden_layer2_size=hidden_layer2_size, 187 | activation=activation, 188 | log_std_min=log_std_min, log_std_max=log_std_max, 189 | weight_init_std=weight_init_std 190 | ) 191 | 192 | self.gamma = discount_factor 193 | 194 | self.pi_optim = optimizer(self.pi.parameters(), lr=policy_lr) 195 | self.grad_clip_radius = grad_clip_radius 196 | 197 | self.__cuda_enabled = enable_cuda 198 | self.enable_cuda(self.__cuda_enabled, warn=False) 199 | # NOTE: self.device is defined when self.enable_cuda is called! 200 | 201 | @property 202 | def cuda_enabled(self): 203 | return self.__cuda_enabled 204 | 205 | def enable_cuda(self, enable_cuda=True, warn=True): 206 | """Enable or disable cuda and update models.""" 207 | 208 | if warn: 209 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 210 | "initializing optimizers can give errors when using " 211 | "optimizers other than SGD or Adam!") 212 | 213 | self.__cuda_enabled = enable_cuda 214 | self.device = torch.device( 215 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 216 | else 'cpu') 217 | self.pi.to(self.device) 218 | 219 | def load_models(self, filename, enable_cuda=True, continue_training=True): 220 | """Load policy and value functions. 
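The log-probability returned by `sample` is corrected by an estimate of log pi_theta(C(x) | x), the log of the probability mass the untruncated Gaussian places on the CBF interval, which the class estimates from uniform draws over [lb, ub]. A sketch of that Monte Carlo estimate against the closed form available in this 1-D Gaussian toy case (the numbers are illustrative; the summed form used in the class differs from the normalized estimate below only by terms that carry no dependence on the policy parameters):

```python
import numpy as np
from scipy.stats import norm

# Monte Carlo estimate of pi_theta(C(x) | x). The truncated policy's log-prob is
# then  log pi^C = log pi_theta(a | x) - log pi_theta(C(x) | x), which is the
# correction applied at the end of GaussianPolicyCBF.sample.
mu, sigma = 0.0, 1.0
lb, ub = -0.5, 1.5                     # hypothetical CBF bounds
num_samples = 1000

rng = np.random.default_rng(0)
draws = rng.uniform(lb, ub, size=num_samples)
mass_mc = (ub - lb) * norm.pdf(draws, mu, sigma).mean()

# Closed form for comparison (available only because this toy case is 1-D Gaussian).
mass_exact = norm.cdf(ub, mu, sigma) - norm.cdf(lb, mu, sigma)
print(mass_mc, mass_exact)             # both roughly 0.62
```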
Copy them to target functions.""" 221 | 222 | models = torch.load(filename) 223 | 224 | self.pi.load_state_dict(models['pi_state_dict']) 225 | 226 | if continue_training: 227 | self.pi.train() 228 | else: 229 | self.pi.eval() 230 | 231 | self.enable_cuda(enable_cuda, warn=False) 232 | 233 | def save_checkpoint(self, filename): 234 | """Save state_dicts of models and optimizers.""" 235 | 236 | torch.save({ 237 | 'using_cuda': self.__cuda_enabled, 238 | 'pi_state_dict': self.pi.state_dict(), 239 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 240 | }, filename) 241 | 242 | def load_checkpoint(self, filename, continue_training=True): 243 | """Load state_dicts for models and optimizers.""" 244 | 245 | checkpoint = torch.load(filename) 246 | 247 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 248 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 249 | 250 | if continue_training: 251 | self.pi.train() 252 | 253 | else: 254 | self.pi.eval() 255 | 256 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 257 | 258 | def update(self, env, rollout_length, sample_cutoff=100): 259 | """ 260 | Perform a single rollout and corresponding gradient update. 261 | Return the total reward accumulated during the rollout. 262 | """ 263 | 264 | rewards, log_probs = [], [] 265 | num_steps = 0 266 | 267 | state = env.state 268 | 269 | for _ in range(rollout_length): 270 | action, log_prob = self.pi.sample(state, 271 | sample_cutoff=sample_cutoff) 272 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 273 | rewards.append(float(reward)) 274 | log_probs.append(log_prob) 275 | 276 | if done: 277 | break 278 | 279 | num_steps += 1 280 | 281 | G = 0 282 | pi_loss = 0 283 | 284 | for i in range(len(rewards) - 1, -1, -1): 285 | G = rewards[i] + self.gamma * G 286 | pi_loss = pi_loss + (self.gamma ** i) * G * log_probs[i] 287 | 288 | pi_loss = -pi_loss 289 | 290 | self.pi_optim.zero_grad() 291 | pi_loss.backward() 292 | if self.grad_clip_radius is not None: 293 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 294 | self.grad_clip_radius) 295 | self.pi_optim.step() 296 | 297 | return np.mean(rewards) 298 | 299 | def train(self, env, num_episodes, rollout_length, 300 | output_dir, args_list, 301 | reset_env=True, 302 | sample_cutoff=100): 303 | """ 304 | Train on the environment. 305 | """ 306 | 307 | episode_mean_rewards = [] 308 | 309 | for i in range(num_episodes): 310 | if reset_env: 311 | env.reset() 312 | mean_reward = self.update(env, rollout_length, 313 | sample_cutoff=sample_cutoff) 314 | cbf = [float(elem) for elem in env.cbf(env.state)] 315 | print( 316 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 317 | episode_mean_rewards.append(mean_reward) 318 | 319 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 320 | np.save(rewards_filename, episode_mean_rewards) 321 | 322 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 323 | with open(hyperparams_filename, 'w') as f: 324 | yaml.dump(args_list, f) 325 | 326 | 327 | 328 | class CBFACAgent: 329 | """ 330 | Agent for training a CBF-constrained version of the classic actor-critic 331 | algorithm. 
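The gradient step in `CBFREINFORCEAgent.update` above is the classic discounted REINFORCE objective: reward-to-go returns computed backwards through the rollout, with each log-probability term weighted by gamma**i. A compact standalone version with a toy check (the rewards and log-probabilities are made up):

```python
import torch

def reinforce_loss(rewards, log_probs, gamma):
    """Discounted REINFORCE objective as assembled in CBFREINFORCEAgent.update:
    G is the reward-to-go from step i, and each term is weighted by gamma**i."""
    G = 0.0
    loss = 0.0
    for i in range(len(rewards) - 1, -1, -1):
        G = rewards[i] + gamma * G
        loss = loss + (gamma ** i) * G * log_probs[i]
    return -loss

# Toy check with gamma = 1: G = [3, 2, 2], so the loss is -sum(G * (-1)) = 7.
log_probs = [torch.tensor(-1.0, requires_grad=True) for _ in range(3)]
rewards = [1.0, 0.0, 2.0]
loss = reinforce_loss(rewards, log_probs, gamma=1.0)
loss.backward()                  # each log_prob receives gradient -G_i
print(loss)                      # tensor(7.)
```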
332 | """ 333 | 334 | def __init__(self, 335 | ### agent parameters 336 | state_dim, action_dim, 337 | policy_lr, value_lr, discount_factor, 338 | cbf=None, 339 | num_log_prob_samples=1000, 340 | enable_cuda=False, 341 | policy_optimizer=torch.optim.Adam, 342 | value_optimizer=torch.optim.Adam, 343 | grad_clip_radius=None, 344 | ### policy parameters 345 | simple_cov=True, 346 | policy_hidden_layer1_size=32, 347 | policy_hidden_layer2_size=32, 348 | policy_activation='relu', 349 | log_std_min=-20, log_std_max=3, 350 | weight_init_std=0.0001, 351 | # value function parameters 352 | value_hidden_layer1_size=32, 353 | value_hidden_layer2_size=32, 354 | value_activation='ReLU'): 355 | 356 | assert action_dim == 1, "Action dimension must be 1 in this version" 357 | 358 | self.pi = GaussianPolicyCBF( 359 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 360 | simple_cov=simple_cov, 361 | hidden_layer1_size=policy_hidden_layer1_size, 362 | hidden_layer2_size=policy_hidden_layer2_size, 363 | activation=policy_activation, 364 | log_std_min=log_std_min, log_std_max=log_std_max, 365 | weight_init_std=weight_init_std 366 | ) 367 | 368 | self.v = two_layer_net( 369 | input_dim=state_dim, output_dim=1, 370 | hidden_layer1_size=value_hidden_layer1_size, 371 | hidden_layer2_size=value_hidden_layer2_size, 372 | activation=value_activation, 373 | ) 374 | 375 | self.gamma = discount_factor 376 | 377 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 378 | self.grad_clip_radius = grad_clip_radius 379 | 380 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 381 | self.grad_clip_radius = grad_clip_radius 382 | 383 | self.__cuda_enabled = enable_cuda 384 | self.enable_cuda(self.__cuda_enabled, warn=False) 385 | # NOTE: self.device is defined when self.enable_cuda is called! 386 | 387 | @property 388 | def cuda_enabled(self): 389 | return self.__cuda_enabled 390 | 391 | def enable_cuda(self, enable_cuda=True, warn=True): 392 | """Enable or disable cuda and update models.""" 393 | 394 | if warn: 395 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 396 | "initializing optimizers can give errors when using " 397 | "optimizers other than SGD or Adam!") 398 | 399 | self.__cuda_enabled = enable_cuda 400 | self.device = torch.device( 401 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 402 | else 'cpu') 403 | self.pi.to(self.device) 404 | self.v.to(self.device) 405 | 406 | def load_models(self, filename, enable_cuda=True, continue_training=True): 407 | """Load policy and value functions. 
Copy them to target functions.""" 408 | 409 | models = torch.load(filename) 410 | 411 | self.pi.load_state_dict(models['pi_state_dict']) 412 | self.v.load_state_dict(models['v_state_dict']) 413 | 414 | if continue_training: 415 | self.pi.train() 416 | self.v.train() 417 | else: 418 | self.pi.eval() 419 | self.v.eval() 420 | 421 | self.enable_cuda(enable_cuda, warn=False) 422 | 423 | def save_checkpoint(self, filename): 424 | """Save state_dicts of models and optimizers.""" 425 | 426 | torch.save({ 427 | 'using_cuda': self.__cuda_enabled, 428 | 'pi_state_dict': self.pi.state_dict(), 429 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 430 | 'v_state_dict': self.v.state_dict(), 431 | 'v_optimizer_state_dict': self.v_optim.state_dict() 432 | }, filename) 433 | 434 | def load_checkpoint(self, filename, continue_training=True): 435 | """Load state_dicts for models and optimizers.""" 436 | 437 | checkpoint = torch.load(filename) 438 | 439 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 440 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 441 | self.v.load_state_dict(checkpoint['v_state_dict']) 442 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 443 | 444 | if continue_training: 445 | self.pi.train() 446 | self.v.train() 447 | 448 | else: 449 | self.pi.eval() 450 | self.v.eval() 451 | 452 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 453 | 454 | def update(self, env, episode_length, sample_cutoff=100): 455 | """ 456 | Perform a single episode and corresponding gradient update. 457 | Return the total reward accumulated during the rollout. 458 | """ 459 | 460 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 461 | 462 | state = env.state 463 | 464 | for _ in range(episode_length): 465 | states.append(state) 466 | 467 | action, log_prob = self.pi.sample(state, 468 | sample_cutoff=sample_cutoff) 469 | actions.append(action) 470 | log_probs.append(log_prob) 471 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 472 | rewards.append(reward) 473 | next_states.append(state) 474 | 475 | if done: 476 | break 477 | 478 | next_states.append(env.state) 479 | 480 | v_loss = 0 481 | pi_loss = 0 482 | 483 | for state, action, reward, next_state, log_prob in zip( 484 | states, actions, rewards, next_states, log_probs): 485 | state = torch.FloatTensor(state).reshape(1, len(state)) 486 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 487 | with torch.no_grad(): 488 | v_target = float(reward) + self.gamma * self.v(next_state) 489 | td_error = v_target - self.v(state) 490 | v_loss += (v_target - self.v(state))**2 491 | pi_loss = pi_loss + td_error * log_prob 492 | 493 | v_loss = v_loss / len(states) 494 | pi_loss = pi_loss / len(states) 495 | pi_loss = -pi_loss 496 | 497 | self.pi_optim.zero_grad() 498 | self.v_optim.zero_grad() 499 | 500 | if self.grad_clip_radius is not None: 501 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 502 | self.grad_clip_radius) 503 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 504 | self.grad_clip_radius) 505 | 506 | self.v_optim.step() 507 | self.pi_optim.step() 508 | 509 | return np.mean(rewards) 510 | 511 | def train(self, env, num_episodes, rollout_length, 512 | output_dir, args_list, 513 | reset_env=True, 514 | sample_cutoff=100): 515 | """ 516 | Train on the environment. 
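`CBFACAgent.update` uses a bootstrapped TD(0) target r + gamma * V(s'), the resulting TD error as the advantage for the actor, and the squared TD error for the critic. A self-contained single-transition sketch of those targets and losses, including the `backward()` calls that populate gradients before the optimizer steps; the linear value net and the fixed log-probability are placeholders, and the TD error is detached in the actor loss so only the stand-in policy parameter receives its gradient:

```python
import torch
from torch import nn

value_net = nn.Linear(3, 1)                          # stand-in for two_layer_net
v_optim = torch.optim.Adam(value_net.parameters(), lr=1e-3)

state = torch.randn(1, 3)
next_state = torch.randn(1, 3)
reward, gamma = 1.0, 0.99
log_prob = torch.tensor(-0.7, requires_grad=True)    # stand-in for pi.sample's log-prob

with torch.no_grad():
    v_target = reward + gamma * value_net(next_state)   # bootstrapped TD(0) target

td_error = v_target - value_net(state)                  # advantage estimate
v_loss = td_error.pow(2).mean()                         # critic: squared TD error
pi_loss = -(td_error.detach() * log_prob).mean()        # actor: TD error * log-prob

v_optim.zero_grad()
v_loss.backward()            # gradients must be populated before .step()
v_optim.step()
pi_loss.backward()           # would feed the policy optimizer in the full agent
print(float(td_error), float(v_loss), float(pi_loss))
```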
517 | """ 518 | 519 | episode_mean_rewards = [] 520 | 521 | for i in range(num_episodes): 522 | if reset_env: 523 | env.reset() 524 | mean_reward = self.update(env, rollout_length, 525 | sample_cutoff=sample_cutoff) 526 | episode_mean_rewards.append(mean_reward) 527 | cbf = [float(elem) for elem in env.cbf(env.state)] 528 | print( 529 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 530 | 531 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 532 | np.save(rewards_filename, episode_mean_rewards) 533 | 534 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 535 | with open(hyperparams_filename, 'w') as f: 536 | yaml.dump(args_list, f) 537 | 538 | 539 | class VanillaACAgent: 540 | """ 541 | Agent for training the classic actor-critic algorithm. 542 | """ 543 | 544 | def __init__(self, 545 | ### agent parameters 546 | state_dim, action_dim, 547 | policy_lr, value_lr, discount_factor, 548 | enable_cuda=False, 549 | policy_optimizer=torch.optim.Adam, 550 | value_optimizer=torch.optim.Adam, 551 | grad_clip_radius=None, 552 | ### policy parameters 553 | simple_cov=True, 554 | policy_hidden_layer1_size=32, 555 | policy_hidden_layer2_size=32, 556 | policy_activation='relu', 557 | log_std_min=-20, log_std_max=3, 558 | weight_init_std=0.0001, 559 | # value function parameters 560 | value_hidden_layer1_size=32, 561 | value_hidden_layer2_size=32, 562 | value_activation='ReLU', 563 | cbf=None, 564 | num_log_prob_samples=None): 565 | 566 | assert action_dim == 1, "Action dimension must be 1 in this version" 567 | 568 | self.pi = GaussianPolicyTwoLayer( 569 | state_dim=state_dim, action_dim=action_dim, 570 | simple_cov=simple_cov, 571 | hidden_layer1_size=policy_hidden_layer1_size, 572 | hidden_layer2_size=policy_hidden_layer2_size, 573 | activation=policy_activation, 574 | log_std_min=log_std_min, log_std_max=log_std_max, 575 | weight_init_std=weight_init_std 576 | ) 577 | 578 | self.v = two_layer_net( 579 | input_dim=state_dim, output_dim=1, 580 | hidden_layer1_size=value_hidden_layer1_size, 581 | hidden_layer2_size=value_hidden_layer2_size, 582 | activation=value_activation, 583 | ) 584 | 585 | self.gamma = discount_factor 586 | 587 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 588 | self.grad_clip_radius = grad_clip_radius 589 | 590 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 591 | self.grad_clip_radius = grad_clip_radius 592 | 593 | self.__cuda_enabled = enable_cuda 594 | self.enable_cuda(self.__cuda_enabled, warn=False) 595 | # NOTE: self.device is defined when self.enable_cuda is called! 596 | 597 | @property 598 | def cuda_enabled(self): 599 | return self.__cuda_enabled 600 | 601 | def enable_cuda(self, enable_cuda=True, warn=True): 602 | """Enable or disable cuda and update models.""" 603 | 604 | if warn: 605 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 606 | "initializing optimizers can give errors when using " 607 | "optimizers other than SGD or Adam!") 608 | 609 | self.__cuda_enabled = enable_cuda 610 | self.device = torch.device( 611 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 612 | else 'cpu') 613 | self.pi.to(self.device) 614 | self.v.to(self.device) 615 | 616 | def load_models(self, filename, enable_cuda=True, continue_training=True): 617 | """Load policy and value functions. 
Copy them to target functions.""" 618 | 619 | models = torch.load(filename) 620 | 621 | self.pi.load_state_dict(models['pi_state_dict']) 622 | self.v.load_state_dict(models['v_state_dict']) 623 | 624 | if continue_training: 625 | self.pi.train() 626 | self.v.train() 627 | else: 628 | self.pi.eval() 629 | self.v.eval() 630 | 631 | self.enable_cuda(enable_cuda, warn=False) 632 | 633 | def save_checkpoint(self, filename): 634 | """Save state_dicts of models and optimizers.""" 635 | 636 | torch.save({ 637 | 'using_cuda': self.__cuda_enabled, 638 | 'pi_state_dict': self.pi.state_dict(), 639 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 640 | 'v_state_dict': self.v.state_dict(), 641 | 'v_optimizer_state_dict': self.v_optim.state_dict() 642 | }, filename) 643 | 644 | def load_checkpoint(self, filename, continue_training=True): 645 | """Load state_dicts for models and optimizers.""" 646 | 647 | checkpoint = torch.load(filename) 648 | 649 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 650 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 651 | self.v.load_state_dict(checkpoint['v_state_dict']) 652 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 653 | 654 | if continue_training: 655 | self.pi.train() 656 | self.v.train() 657 | 658 | else: 659 | self.pi.eval() 660 | self.v.eval() 661 | 662 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 663 | 664 | def update(self, env, episode_length, sample_cutoff=100): 665 | """ 666 | Perform a single episode and corresponding gradient update. 667 | Return the mean reward accumulated during the rollout. 668 | """ 669 | 670 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 671 | 672 | state = env.reset() 673 | 674 | for _ in range(episode_length): 675 | states.append(state) 676 | 677 | action, log_prob = self.pi.sample( 678 | torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 679 | ) 680 | actions.append(action) 681 | log_probs.append(log_prob) 682 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 683 | rewards.append(reward) 684 | next_states.append(state) 685 | if done: 686 | break 687 | 688 | next_states.append(env.state) 689 | 690 | v_loss = 0 691 | pi_loss = 0 692 | 693 | for state, action, reward, next_state, log_prob in zip( 694 | states, actions, rewards, next_states, log_probs): 695 | state = torch.FloatTensor(state).reshape(1, len(state)) 696 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 697 | with torch.no_grad(): 698 | v_target = float(reward) + self.gamma * self.v(next_state) 699 | td_error = v_target - self.v(state) 700 | v_loss += (v_target - self.v(state))**2 701 | pi_loss = pi_loss + td_error * log_prob 702 | 703 | pi_loss = pi_loss / len(states) 704 | v_loss = v_loss / len(states) 705 | pi_loss = -pi_loss 706 | 707 | self.pi_optim.zero_grad() 708 | self.v_optim.zero_grad() 709 | 710 | if self.grad_clip_radius is not None: 711 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 712 | self.grad_clip_radius) 713 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 714 | self.grad_clip_radius) 715 | 716 | self.v_optim.step() 717 | self.pi_optim.step() 718 | 719 | return np.mean(rewards) 720 | 721 | def train(self, env, num_episodes, rollout_length, 722 | output_dir, args_list, 723 | reset_env=True, 724 | sample_cutoff=100): 725 | """ 726 | Train on the environment. 
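The `train()` methods of the CBF agents persist two artifacts per run: the per-episode mean rewards as a NumPy array and the argument list as YAML. A small, hypothetical helper for reading those back and reproducing the 20-episode moving average that the training loop prints (it assumes the dumped hyperparameters are plain YAML values):

```python
import os
import numpy as np
import yaml
import matplotlib.pyplot as plt

def plot_learning_curve(output_dir, window=20):
    """Load the artifacts written by train() and plot the per-episode mean reward
    together with the same moving average printed during training."""
    rewards = np.load(os.path.join(output_dir, "episode_rewards.npy"))
    with open(os.path.join(output_dir, "hyperparams.yml")) as f:
        print(yaml.safe_load(f))                  # run settings saved alongside

    moving_avg = np.convolve(rewards, np.ones(window) / window, mode="valid")
    plt.plot(rewards, alpha=0.4, label="episode mean reward")
    plt.plot(np.arange(window - 1, len(rewards)), moving_avg,
             label=f"{window}-episode average")
    plt.xlabel("episode")
    plt.ylabel("mean reward")
    plt.legend()
    plt.show()

# plot_learning_curve("path/to/output_dir")   # directory is whatever was passed to train()
```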
727 | """ 728 | 729 | episode_mean_rewards = [] 730 | 731 | for i in range(num_episodes): 732 | if reset_env: 733 | env.reset() 734 | mean_reward = self.update(env, rollout_length, 735 | sample_cutoff=sample_cutoff) 736 | episode_mean_rewards.append(mean_reward) 737 | print( 738 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}') 739 | 740 | # rewards_filename = os.path.join(output_dir, 'episode_rewards') 741 | # np.save(rewards_filename, episode_mean_rewards) 742 | 743 | # hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 744 | # with open(hyperparams_filename, 'w') as f: 745 | # yaml.dump(args_list, f) 746 | -------------------------------------------------------------------------------- /agents/agents.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | import yaml 5 | from scipy import integrate 6 | from wesutils.utils import GaussianPolicyTwoLayer, two_layer_net 7 | from functools import reduce 8 | from operator import mul 9 | import warnings 10 | 11 | class GaussianPolicyCBF(GaussianPolicyTwoLayer): 12 | """ 13 | Modified version of the standard Gaussian policy for use in CBF- 14 | constrained settings. Provides access to the pdf of the policy at 15 | a given state as well as utilities for directly manipulating the 16 | parameters of the policy. Requires a CBF upon initilization. The 17 | CBF is assumed to accept torch tensor representations of the state. 18 | 19 | NOTE: This version of the policy assumes that action_dim=1 and that 20 | the CBF returns intervals. 21 | """ 22 | 23 | def __init__(self, cbf, state_dim, action_dim=1, 24 | simple_cov=True, 25 | hidden_layer1_size=32, 26 | hidden_layer2_size=32, 27 | activation='sigmoid', 28 | log_std_min=-20, log_std_max=3, 29 | weight_init_std=0.0001): 30 | 31 | assert action_dim == 1, "Action dimension must be 1" 32 | 33 | super().__init__( 34 | state_dim, action_dim, 35 | simple_cov=simple_cov, 36 | hidden_layer1_size=hidden_layer1_size, 37 | hidden_layer2_size=hidden_layer2_size, 38 | activation=activation, 39 | log_std_min=log_std_min, log_std_max=log_std_max, 40 | weight_init_std=weight_init_std 41 | ) 42 | 43 | self.cbf = cbf 44 | self.param_shape = tuple(p.shape for p in self.parameters()) 45 | self.param_size = sum(reduce(mul, shape) for shape in self.param_shape) 46 | 47 | def to(self, device): 48 | super().to(device) 49 | self.device = device 50 | 51 | @property 52 | def params(self): 53 | """Get policy model parameters.""" 54 | return torch.cat([p.data.reshape(-1) for p in self.parameters()]) 55 | 56 | @params.setter 57 | def params(self, new_values): 58 | """Set policy model parameters.""" 59 | assert new_values.size()[0] == self.param_size, "Error" 60 | 61 | index = 0 62 | 63 | for param in self.parameters(): 64 | size = reduce(mul, param.shape) 65 | block = new_values[index:index+size].reshape(param.shape) 66 | param.data.copy_(block) 67 | index += size 68 | 69 | def _numpy_original_pdf(self, state): 70 | """ 71 | Return numpy version of the pdf of the untruncated policy at the 72 | state provided. 
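The `params` property pair in this class flattens every parameter tensor into a single vector and, on assignment, slices a vector of the same total size back into per-parameter blocks, which is what lets calling code manipulate the policy's weights directly. A round-trip sketch of that flatten/unflatten logic on a throwaway two-layer network (not the repository's policy class):

```python
import torch
from torch import nn
from functools import reduce
from operator import mul

net = nn.Sequential(nn.Linear(3, 4), nn.Linear(4, 1))   # small stand-in network

# Flatten: concatenate every parameter, as the params getter does.
shapes = tuple(p.shape for p in net.parameters())
total = sum(reduce(mul, s) for s in shapes)
flat = torch.cat([p.data.reshape(-1) for p in net.parameters()])
assert flat.numel() == total

# Unflatten: write a perturbed copy back block by block, as the params setter does.
new_values = flat + 0.01
index = 0
for p in net.parameters():
    size = reduce(mul, p.shape)
    p.data.copy_(new_values[index:index + size].reshape(p.shape))
    index += size

print(total, torch.allclose(
    torch.cat([p.data.reshape(-1) for p in net.parameters()]), new_values))
```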
73 | """ 74 | 75 | mean, cov = self.forward(state) 76 | mean, cov = mean.detach().numpy(), cov.detach().numpy() 77 | K = np.float_power((2 * np.pi)**len(mean) * np.linalg.det(cov), -0.5) 78 | inv = np.linalg.inv(cov) 79 | 80 | def pdf(action): 81 | return K * np.exp( 82 | -0.5 * (action - mean).dot(inv.dot(action - mean)) 83 | ).flatten() 84 | 85 | return pdf 86 | 87 | def _torch_original_pdf(self, state): 88 | """ 89 | Return torch version of the pdf of the untruncated policy at the 90 | state provided. Detach is not called, so all computations herein 91 | are reflected in the computation graph. 92 | """ 93 | 94 | # import pdb; pdb.set_trace() 95 | 96 | mean, cov = self.forward(state) 97 | K = torch.float_power((2 * np.pi)**len(mean) * torch.linalg.det(cov), -0.5) 98 | inv = torch.linalg.inv(cov).squeeze(dim=0) 99 | 100 | def pdf(action): 101 | return K * torch.exp(-0.5 * torch.matmul( 102 | action - mean, torch.matmul(inv, action - mean))) 103 | 104 | return pdf 105 | 106 | def get_numpy_pdf(self, state): 107 | """ 108 | Return the pdf of the original Gaussian pdf truncated to the set C(x). 109 | """ 110 | 111 | lb, ub = self.cbf(state=state) 112 | original_pdf = self._numpy_original_pdf(state) 113 | normalization = integrate.quad(original_pdf, lb, ub)[0] 114 | 115 | def pdf(action): 116 | return original_pdf(action) / normalization if lb <= action <= ub \ 117 | else 0 118 | 119 | return pdf 120 | 121 | def sample(self, state, sample_cutoff=100, 122 | no_log_prob=False, num_log_prob_samples=1000): 123 | """ 124 | Repeatedly sample from the original Gaussian policy until an action 125 | lying within the CBF constraint set is generated. 126 | 127 | sample_cutoff specifies the number of times to sample using the 128 | original Gaussian policy before simply uniformly generating an 129 | action from the CBF constraint set. 130 | """ 131 | 132 | lb, ub = self.cbf(state=state) 133 | lb = float(lb) 134 | ub = float(ub) 135 | state = torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 136 | orig_pdf = self._torch_original_pdf(state) 137 | 138 | if not no_log_prob: 139 | actions = lb + (ub - lb) * torch.rand( 140 | num_log_prob_samples, self.action_dim, 1, device=self.device 141 | ) 142 | log_prob = orig_pdf(actions).sum().log() # log pi_theta (C(x) | x) 143 | 144 | for _ in range(sample_cutoff): 145 | 146 | action, orig_log_prob = super().sample(state) 147 | 148 | if lb <= action <= ub: 149 | log_prob = orig_log_prob - log_prob # log pi^C_theta 150 | # = log pi_theta - log pi_theta(C(x) | x) 151 | return action, log_prob if not no_log_prob else action 152 | 153 | action = lb + (ub - lb) * torch.rand(1, 1) 154 | orig_log_prob = orig_pdf(action).log() 155 | return action, (orig_log_prob - log_prob) if not no_log_prob else action 156 | 157 | 158 | class CBFREINFORCEAgent: 159 | """ 160 | Agent for training a CBF-constrained version of the classic REINFORCE 161 | algorithm. 
162 | """ 163 | 164 | def __init__(self, 165 | ### agent parameters 166 | state_dim, action_dim, cbf, 167 | policy_lr, discount_factor, 168 | num_log_prob_samples=1000, 169 | enable_cuda=False, 170 | optimizer=torch.optim.Adam, 171 | grad_clip_radius=None, 172 | ### policy parameters 173 | simple_cov=True, 174 | hidden_layer1_size=32, 175 | hidden_layer2_size=32, 176 | activation='relu', 177 | log_std_min=-20, log_std_max=3, 178 | weight_init_std=0.0001): 179 | 180 | assert action_dim == 1, "Action dimension must be 1 in this version" 181 | 182 | self.pi = GaussianPolicyCBF( 183 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 184 | simple_cov=simple_cov, 185 | hidden_layer1_size=hidden_layer1_size, 186 | hidden_layer2_size=hidden_layer2_size, 187 | activation=activation, 188 | log_std_min=log_std_min, log_std_max=log_std_max, 189 | weight_init_std=weight_init_std 190 | ) 191 | 192 | self.gamma = discount_factor 193 | 194 | self.pi_optim = optimizer(self.pi.parameters(), lr=policy_lr) 195 | self.grad_clip_radius = grad_clip_radius 196 | 197 | self.__cuda_enabled = enable_cuda 198 | self.enable_cuda(self.__cuda_enabled, warn=False) 199 | # NOTE: self.device is defined when self.enable_cuda is called! 200 | 201 | @property 202 | def cuda_enabled(self): 203 | return self.__cuda_enabled 204 | 205 | def enable_cuda(self, enable_cuda=True, warn=True): 206 | """Enable or disable cuda and update models.""" 207 | 208 | if warn: 209 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 210 | "initializing optimizers can give errors when using " 211 | "optimizers other than SGD or Adam!") 212 | 213 | self.__cuda_enabled = enable_cuda 214 | self.device = torch.device( 215 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 216 | else 'cpu') 217 | self.pi.to(self.device) 218 | 219 | def load_models(self, filename, enable_cuda=True, continue_training=True): 220 | """Load policy and value functions. Copy them to target functions.""" 221 | 222 | models = torch.load(filename) 223 | 224 | self.pi.load_state_dict(models['pi_state_dict']) 225 | 226 | if continue_training: 227 | self.pi.train() 228 | else: 229 | self.pi.eval() 230 | 231 | self.enable_cuda(enable_cuda, warn=False) 232 | 233 | def save_checkpoint(self, filename): 234 | """Save state_dicts of models and optimizers.""" 235 | 236 | torch.save({ 237 | 'using_cuda': self.__cuda_enabled, 238 | 'pi_state_dict': self.pi.state_dict(), 239 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 240 | }, filename) 241 | 242 | def load_checkpoint(self, filename, continue_training=True): 243 | """Load state_dicts for models and optimizers.""" 244 | 245 | checkpoint = torch.load(filename) 246 | 247 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 248 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 249 | 250 | if continue_training: 251 | self.pi.train() 252 | 253 | else: 254 | self.pi.eval() 255 | 256 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 257 | 258 | def update(self, env, rollout_length, sample_cutoff=100): 259 | """ 260 | Perform a single rollout and corresponding gradient update. 261 | Return the total reward accumulated during the rollout. 
262 | """ 263 | 264 | rewards, log_probs = [], [] 265 | num_steps = 0 266 | 267 | state = env.state 268 | 269 | for _ in range(rollout_length): 270 | action, log_prob = self.pi.sample(state, 271 | sample_cutoff=sample_cutoff) 272 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 273 | rewards.append(float(reward)) 274 | log_probs.append(log_prob) 275 | 276 | if done: 277 | break 278 | 279 | num_steps += 1 280 | 281 | G = 0 282 | pi_loss = 0 283 | 284 | for i in range(len(rewards) - 1, -1, -1): 285 | G = rewards[i] + self.gamma * G 286 | pi_loss = pi_loss + (self.gamma ** i) * G * log_probs[i] 287 | 288 | pi_loss = -pi_loss 289 | 290 | self.pi_optim.zero_grad() 291 | pi_loss.backward() 292 | if self.grad_clip_radius is not None: 293 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 294 | self.grad_clip_radius) 295 | self.pi_optim.step() 296 | 297 | return np.mean(rewards) 298 | 299 | def train(self, env, num_episodes, rollout_length, 300 | output_dir, args_list, 301 | reset_env=True, 302 | sample_cutoff=100): 303 | """ 304 | Train on the environment. 305 | """ 306 | 307 | episode_mean_rewards = [] 308 | 309 | for i in range(num_episodes): 310 | if reset_env: 311 | env.reset() 312 | mean_reward = self.update(env, rollout_length, 313 | sample_cutoff=sample_cutoff) 314 | cbf = [float(elem) for elem in env.cbf(env.state)] 315 | print( 316 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 317 | episode_mean_rewards.append(mean_reward) 318 | 319 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 320 | np.save(rewards_filename, episode_mean_rewards) 321 | 322 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 323 | with open(hyperparams_filename, 'w') as f: 324 | yaml.dump(args_list, f) 325 | 326 | 327 | 328 | class CBFACAgent: 329 | """ 330 | Agent for training a CBF-constrained version of the classic actor-critic 331 | algorithm. 
332 | """ 333 | 334 | def __init__(self, 335 | ### agent parameters 336 | state_dim, action_dim, 337 | policy_lr, value_lr, discount_factor, 338 | cbf=None, 339 | num_log_prob_samples=1000, 340 | enable_cuda=False, 341 | policy_optimizer=torch.optim.Adam, 342 | value_optimizer=torch.optim.Adam, 343 | grad_clip_radius=None, 344 | ### policy parameters 345 | simple_cov=True, 346 | policy_hidden_layer1_size=32, 347 | policy_hidden_layer2_size=32, 348 | policy_activation='relu', 349 | log_std_min=-20, log_std_max=3, 350 | weight_init_std=0.0001, 351 | # value function parameters 352 | value_hidden_layer1_size=32, 353 | value_hidden_layer2_size=32, 354 | value_activation='ReLU'): 355 | 356 | assert action_dim == 1, "Action dimension must be 1 in this version" 357 | 358 | self.pi = GaussianPolicyCBF( 359 | cbf=cbf, state_dim=state_dim, action_dim=action_dim, 360 | simple_cov=simple_cov, 361 | hidden_layer1_size=policy_hidden_layer1_size, 362 | hidden_layer2_size=policy_hidden_layer2_size, 363 | activation=policy_activation, 364 | log_std_min=log_std_min, log_std_max=log_std_max, 365 | weight_init_std=weight_init_std 366 | ) 367 | 368 | self.v = two_layer_net( 369 | input_dim=state_dim, output_dim=1, 370 | hidden_layer1_size=value_hidden_layer1_size, 371 | hidden_layer2_size=value_hidden_layer2_size, 372 | activation=value_activation, 373 | ) 374 | 375 | self.gamma = discount_factor 376 | 377 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 378 | self.grad_clip_radius = grad_clip_radius 379 | 380 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 381 | self.grad_clip_radius = grad_clip_radius 382 | 383 | self.__cuda_enabled = enable_cuda 384 | self.enable_cuda(self.__cuda_enabled, warn=False) 385 | # NOTE: self.device is defined when self.enable_cuda is called! 386 | 387 | @property 388 | def cuda_enabled(self): 389 | return self.__cuda_enabled 390 | 391 | def enable_cuda(self, enable_cuda=True, warn=True): 392 | """Enable or disable cuda and update models.""" 393 | 394 | if warn: 395 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 396 | "initializing optimizers can give errors when using " 397 | "optimizers other than SGD or Adam!") 398 | 399 | self.__cuda_enabled = enable_cuda 400 | self.device = torch.device( 401 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 402 | else 'cpu') 403 | self.pi.to(self.device) 404 | self.v.to(self.device) 405 | 406 | def load_models(self, filename, enable_cuda=True, continue_training=True): 407 | """Load policy and value functions. 
Copy them to target functions.""" 408 | 409 | models = torch.load(filename) 410 | 411 | self.pi.load_state_dict(models['pi_state_dict']) 412 | self.v.load_state_dict(models['v_state_dict']) 413 | 414 | if continue_training: 415 | self.pi.train() 416 | self.v.train() 417 | else: 418 | self.pi.eval() 419 | self.v.eval() 420 | 421 | self.enable_cuda(enable_cuda, warn=False) 422 | 423 | def save_checkpoint(self, filename): 424 | """Save state_dicts of models and optimizers.""" 425 | 426 | torch.save({ 427 | 'using_cuda': self.__cuda_enabled, 428 | 'pi_state_dict': self.pi.state_dict(), 429 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 430 | 'v_state_dict': self.v.state_dict(), 431 | 'v_optimizer_state_dict': self.v_optim.state_dict() 432 | }, filename) 433 | 434 | def load_checkpoint(self, filename, continue_training=True): 435 | """Load state_dicts for models and optimizers.""" 436 | 437 | checkpoint = torch.load(filename) 438 | 439 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 440 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 441 | self.v.load_state_dict(checkpoint['v_state_dict']) 442 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 443 | 444 | if continue_training: 445 | self.pi.train() 446 | self.v.train() 447 | 448 | else: 449 | self.pi.eval() 450 | self.v.eval() 451 | 452 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 453 | 454 | def update(self, env, episode_length, sample_cutoff=100): 455 | """ 456 | Perform a single episode and corresponding gradient update. 457 | Return the total reward accumulated during the rollout. 458 | """ 459 | 460 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 461 | 462 | state = env.state 463 | 464 | for _ in range(episode_length): 465 | states.append(state) 466 | 467 | action, log_prob = self.pi.sample(state, 468 | sample_cutoff=sample_cutoff) 469 | actions.append(action) 470 | log_probs.append(log_prob) 471 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 472 | rewards.append(reward) 473 | next_states.append(state) 474 | 475 | if done: 476 | break 477 | 478 | next_states.append(env.state) 479 | 480 | v_loss = 0 481 | pi_loss = 0 482 | 483 | for state, action, reward, next_state, log_prob in zip( 484 | states, actions, rewards, next_states, log_probs): 485 | state = torch.FloatTensor(state).reshape(1, len(state)) 486 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 487 | with torch.no_grad(): 488 | v_target = float(reward) + self.gamma * self.v(next_state) 489 | td_error = v_target - self.v(state) 490 | v_loss += (v_target - self.v(state))**2 491 | pi_loss = pi_loss + td_error * log_prob 492 | 493 | v_loss = v_loss / len(states) 494 | pi_loss = pi_loss / len(states) 495 | pi_loss = -pi_loss 496 | 497 | self.pi_optim.zero_grad() 498 | self.v_optim.zero_grad() 499 | 500 | if self.grad_clip_radius is not None: 501 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 502 | self.grad_clip_radius) 503 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 504 | self.grad_clip_radius) 505 | 506 | self.v_optim.step() 507 | self.pi_optim.step() 508 | 509 | return np.mean(rewards) 510 | 511 | def train(self, env, num_episodes, rollout_length, 512 | output_dir, args_list, 513 | reset_env=True, 514 | sample_cutoff=100): 515 | """ 516 | Train on the environment. 
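`save_checkpoint` and `load_checkpoint` above store everything needed to resume training in a single file: a cuda flag plus the state_dicts of both networks and both optimizers. A minimal round-trip sketch of that format; the linear layers stand in for the actual policy and value function, and `checkpoint.pt` is an arbitrary filename:

```python
import torch
from torch import nn

pi = nn.Linear(3, 1)                 # placeholder policy network
v = nn.Linear(3, 1)                  # placeholder value network
pi_optim = torch.optim.Adam(pi.parameters(), lr=1e-3)
v_optim = torch.optim.Adam(v.parameters(), lr=1e-3)

# Write the checkpoint with the same keys used by save_checkpoint.
torch.save({
    'using_cuda': False,
    'pi_state_dict': pi.state_dict(),
    'pi_optimizer_state_dict': pi_optim.state_dict(),
    'v_state_dict': v.state_dict(),
    'v_optimizer_state_dict': v_optim.state_dict(),
}, 'checkpoint.pt')

# Read it back and restore every component, as load_checkpoint does.
checkpoint = torch.load('checkpoint.pt')
pi.load_state_dict(checkpoint['pi_state_dict'])
pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict'])
v.load_state_dict(checkpoint['v_state_dict'])
v_optim.load_state_dict(checkpoint['v_optimizer_state_dict'])
print(sorted(checkpoint.keys()))
```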
517 | """ 518 | 519 | episode_mean_rewards = [] 520 | 521 | for i in range(num_episodes): 522 | if reset_env: 523 | env.reset() 524 | mean_reward = self.update(env, rollout_length, 525 | sample_cutoff=sample_cutoff) 526 | episode_mean_rewards.append(mean_reward) 527 | cbf = [float(elem) for elem in env.cbf(env.state)] 528 | print( 529 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}, C(x) = [{cbf[0]:.2f}, {cbf[1]:.2f}]') 530 | 531 | rewards_filename = os.path.join(output_dir, 'episode_rewards') 532 | np.save(rewards_filename, episode_mean_rewards) 533 | 534 | hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 535 | with open(hyperparams_filename, 'w') as f: 536 | yaml.dump(args_list, f) 537 | 538 | 539 | class VanillaACAgent: 540 | """ 541 | Agent for training the classic actor-critic algorithm. 542 | """ 543 | 544 | def __init__(self, 545 | ### agent parameters 546 | state_dim, action_dim, 547 | policy_lr, value_lr, discount_factor, 548 | enable_cuda=False, 549 | policy_optimizer=torch.optim.Adam, 550 | value_optimizer=torch.optim.Adam, 551 | grad_clip_radius=None, 552 | ### policy parameters 553 | simple_cov=True, 554 | policy_hidden_layer1_size=32, 555 | policy_hidden_layer2_size=32, 556 | policy_activation='relu', 557 | log_std_min=-20, log_std_max=3, 558 | weight_init_std=0.0001, 559 | # value function parameters 560 | value_hidden_layer1_size=32, 561 | value_hidden_layer2_size=32, 562 | value_activation='ReLU', 563 | cbf=None, 564 | num_log_prob_samples=None): 565 | 566 | assert action_dim == 1, "Action dimension must be 1 in this version" 567 | 568 | self.pi = GaussianPolicyTwoLayer( 569 | state_dim=state_dim, action_dim=action_dim, 570 | simple_cov=simple_cov, 571 | hidden_layer1_size=policy_hidden_layer1_size, 572 | hidden_layer2_size=policy_hidden_layer2_size, 573 | activation=policy_activation, 574 | log_std_min=log_std_min, log_std_max=log_std_max, 575 | weight_init_std=weight_init_std 576 | ) 577 | 578 | self.v = two_layer_net( 579 | input_dim=state_dim, output_dim=1, 580 | hidden_layer1_size=value_hidden_layer1_size, 581 | hidden_layer2_size=value_hidden_layer2_size, 582 | activation=value_activation, 583 | ) 584 | 585 | self.gamma = discount_factor 586 | 587 | self.pi_optim = policy_optimizer(self.pi.parameters(), lr=policy_lr) 588 | self.grad_clip_radius = grad_clip_radius 589 | 590 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 591 | self.grad_clip_radius = grad_clip_radius 592 | 593 | self.__cuda_enabled = enable_cuda 594 | self.enable_cuda(self.__cuda_enabled, warn=False) 595 | # NOTE: self.device is defined when self.enable_cuda is called! 596 | 597 | @property 598 | def cuda_enabled(self): 599 | return self.__cuda_enabled 600 | 601 | def enable_cuda(self, enable_cuda=True, warn=True): 602 | """Enable or disable cuda and update models.""" 603 | 604 | if warn: 605 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 606 | "initializing optimizers can give errors when using " 607 | "optimizers other than SGD or Adam!") 608 | 609 | self.__cuda_enabled = enable_cuda 610 | self.device = torch.device( 611 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 612 | else 'cpu') 613 | self.pi.to(self.device) 614 | self.v.to(self.device) 615 | 616 | def load_models(self, filename, enable_cuda=True, continue_training=True): 617 | """Load policy and value functions. 
Copy them to target functions.""" 618 | 619 | models = torch.load(filename) 620 | 621 | self.pi.load_state_dict(models['pi_state_dict']) 622 | self.v.load_state_dict(models['v_state_dict']) 623 | 624 | if continue_training: 625 | self.pi.train() 626 | self.v.train() 627 | else: 628 | self.pi.eval() 629 | self.v.eval() 630 | 631 | self.enable_cuda(enable_cuda, warn=False) 632 | 633 | def save_checkpoint(self, filename): 634 | """Save state_dicts of models and optimizers.""" 635 | 636 | torch.save({ 637 | 'using_cuda': self.__cuda_enabled, 638 | 'pi_state_dict': self.pi.state_dict(), 639 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 640 | 'v_state_dict': self.v.state_dict(), 641 | 'v_optimizer_state_dict': self.v_optim.state_dict() 642 | }, filename) 643 | 644 | def load_checkpoint(self, filename, continue_training=True): 645 | """Load state_dicts for models and optimizers.""" 646 | 647 | checkpoint = torch.load(filename) 648 | 649 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 650 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 651 | self.v.load_state_dict(checkpoint['v_state_dict']) 652 | self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict']) 653 | 654 | if continue_training: 655 | self.pi.train() 656 | self.v.train() 657 | 658 | else: 659 | self.pi.eval() 660 | self.v.eval() 661 | 662 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 663 | 664 | def update(self, env, episode_length, sample_cutoff=100): 665 | """ 666 | Perform a single episode and corresponding gradient update. 667 | Return the mean reward accumulated during the rollout. 668 | """ 669 | 670 | states, actions, rewards, next_states, log_probs = [], [], [], [], [] 671 | 672 | state = env.reset() 673 | 674 | for _ in range(episode_length): 675 | states.append(state) 676 | 677 | action, log_prob = self.pi.sample( 678 | torch.FloatTensor(state).reshape(1, len(state)).to(self.device) 679 | ) 680 | actions.append(action) 681 | log_probs.append(log_prob) 682 | state, reward, done, _ = env.step(action.cpu().detach().numpy()) 683 | rewards.append(reward) 684 | next_states.append(state) 685 | if done: 686 | break 687 | 688 | next_states.append(env.state) 689 | 690 | v_loss = 0 691 | pi_loss = 0 692 | 693 | for state, action, reward, next_state, log_prob in zip( 694 | states, actions, rewards, next_states, log_probs): 695 | state = torch.FloatTensor(state).reshape(1, len(state)) 696 | next_state = torch.FloatTensor(next_state).reshape(1, len(next_state)) 697 | with torch.no_grad(): 698 | v_target = float(reward) + self.gamma * self.v(next_state) 699 | td_error = v_target - self.v(state) 700 | v_loss += (v_target - self.v(state))**2 701 | pi_loss = pi_loss + td_error * log_prob 702 | 703 | pi_loss = pi_loss / len(states) 704 | v_loss = v_loss / len(states) 705 | pi_loss = -pi_loss 706 | 707 | self.pi_optim.zero_grad() 708 | self.v_optim.zero_grad() 709 | 710 | if self.grad_clip_radius is not None: 711 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 712 | self.grad_clip_radius) 713 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 714 | self.grad_clip_radius) 715 | 716 | self.v_optim.step() 717 | self.pi_optim.step() 718 | 719 | return np.mean(rewards) 720 | 721 | def train(self, env, num_episodes, rollout_length, 722 | output_dir, args_list, 723 | reset_env=True, 724 | sample_cutoff=100): 725 | """ 726 | Train on the environment. 
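        Note: unlike the agent above, this implementation currently only
        prints per-episode rewards; the code that would save
        'episode_rewards.npy' and 'hyperparams.yml' to output_dir is
        commented out below, so output_dir and args_list are effectively
        unused.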
727 | """ 728 | 729 | episode_mean_rewards = [] 730 | 731 | for i in range(num_episodes): 732 | if reset_env: 733 | env.reset() 734 | mean_reward = self.update(env, rollout_length, 735 | sample_cutoff=sample_cutoff) 736 | episode_mean_rewards.append(mean_reward) 737 | print( 738 | f'Episode {i}: moving ave reward {np.mean(episode_mean_rewards[-20:]):.8f}, mean reward {mean_reward:.8f}') 739 | 740 | # rewards_filename = os.path.join(output_dir, 'episode_rewards') 741 | # np.save(rewards_filename, episode_mean_rewards) 742 | 743 | # hyperparams_filename = os.path.join(output_dir, 'hyperparams.yml') 744 | # with open(hyperparams_filename, 'w') as f: 745 | # yaml.dump(args_list, f) 746 | -------------------------------------------------------------------------------- /Pendulum/agents/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributions as td 4 | from torch.nn import functional as F 5 | import gym 6 | from gym import spaces 7 | import numpy as np 8 | from typing import NamedTuple 9 | import warnings 10 | from matplotlib import pyplot as plt 11 | 12 | from stable_baselines3.common.utils import obs_as_tensor 13 | from stable_baselines3.common.preprocessing import ( 14 | get_obs_shape, get_action_dim 15 | ) 16 | 17 | 18 | #from agents.TruncatedNormal import TruncatedNormal as tn 19 | 20 | from wesutils import two_layer_net 21 | 22 | class RolloutBufferSamples(NamedTuple): 23 | observations: torch.Tensor 24 | actions: torch.Tensor 25 | old_values: torch.Tensor 26 | old_log_prob: torch.Tensor 27 | advantages: torch.Tensor 28 | returns: torch.Tensor 29 | 30 | 31 | class RolloutBuffer: 32 | 33 | def __init__(self, 34 | buffer_size, 35 | observation_space, 36 | action_space, 37 | gamma=0.99, 38 | device='cpu'): 39 | 40 | self.buffer_size = buffer_size 41 | self.observation_space = observation_space 42 | self.action_space = action_space 43 | self.gamma = gamma 44 | self.device = device 45 | self.obs_shape = get_obs_shape(self.observation_space) 46 | self.action_dim = get_action_dim(self.action_space) 47 | 48 | self.reset() 49 | 50 | def reset(self): 51 | 52 | self.observations = np.zeros( 53 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 54 | ) 55 | self.actions = np.zeros( 56 | (self.buffer_size, self.action_dim), dtype=np.float32 57 | ) 58 | self.rewards = np.zeros( 59 | (self.buffer_size,), dtype=np.float32 60 | ) 61 | self.episode_starts = np.zeros( 62 | (self.buffer_size,), dtype=np.float32 63 | ) 64 | self.values = np.zeros( 65 | (self.buffer_size,), dtype=np.float32 66 | ) 67 | self.log_probs = np.zeros( 68 | (self.buffer_size,), dtype=np.float32 69 | ) 70 | self.advantages = np.zeros( 71 | (self.buffer_size,), dtype=np.float32 72 | ) 73 | 74 | self.full = False 75 | self.pos = 0 76 | 77 | def compute_returns_and_advantage(self, last_value, done): 78 | 79 | last_value = last_value.clone().cpu().numpy().flatten() 80 | 81 | discounted_reward = 0 82 | for step in reversed(range(self.buffer_size)): 83 | if step == self.buffer_size - 1: 84 | next_non_terminal = 1.0 - done 85 | next_value = last_value 86 | else: 87 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 88 | next_value = self.values[step + 1] 89 | discounted_reward = self.rewards[step] + \ 90 | self.gamma * discounted_reward * next_non_terminal 91 | self.advantages[step] = discounted_reward - self.values[step] 92 | self.returns = self.advantages + self.values 93 | 94 | def add(self, obs, action, reward, 
episode_start, value, log_prob): 95 | 96 | if len(log_prob.shape) == 0: 97 | log_prob = log_prob.reshape(-1, 1) 98 | 99 | if isinstance(self.observation_space, spaces.Discrete): 100 | obs = obs.reshape((1,) + self.obs_shape) 101 | 102 | self.observations[self.pos] = np.array(obs).copy() 103 | self.actions[self.pos] = np.array(action).copy() 104 | self.rewards[self.pos] = np.array(reward).copy() 105 | self.episode_starts[self.pos] = np.array(episode_start).copy() 106 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 107 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 108 | self.pos += 1 109 | if self.pos == self.buffer_size: 110 | self.full = True 111 | 112 | def get(self, batch_size=None): 113 | assert self.full, "" 114 | indices = np.random.permutation(self.buffer_size) 115 | 116 | # Return everything, don't create minibatches 117 | if batch_size is None: 118 | batch_size = self.buffer_size 119 | 120 | start_idx = 0 121 | while start_idx < self.buffer_size: 122 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 123 | start_idx += batch_size 124 | 125 | def _get_samples(self, batch_inds): 126 | data = ( 127 | self.observations[batch_inds], 128 | self.actions[batch_inds], 129 | self.values[batch_inds].flatten(), 130 | self.log_probs[batch_inds].flatten(), 131 | self.advantages[batch_inds].flatten(), 132 | self.returns[batch_inds].flatten(), 133 | ) 134 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 135 | 136 | def to_torch(self, array, copy=True): 137 | if copy: 138 | return torch.tensor(array).to(self.device) 139 | return torch.as_tensor(array).to(self.device) 140 | 141 | 142 | class PolicyNetwork(nn.Module): 143 | """Base class for stochastic policy networks.""" 144 | 145 | def __init__(self): 146 | super().__init__() 147 | 148 | def forward(self, state): 149 | """Take state as input, then output the parameters of the policy.""" 150 | 151 | raise NotImplemented("forward not implemented.") 152 | 153 | def sample(self, state): 154 | """ 155 | Sample an action based on the model parameters given the current state. 156 | """ 157 | 158 | raise NotImplemented("sample not implemented.") 159 | 160 | def log_probs(self, obs, actions): 161 | """ 162 | Return log probabilities for each state-action pair. 163 | """ 164 | 165 | raise NotImplemented("log_probs not implemented.") 166 | 167 | def entropy(self, obs): 168 | """ 169 | Return entropy of the policy for each state. 170 | """ 171 | 172 | raise NotImplemented("entropy not implemented.") 173 | 174 | 175 | class GaussianPolicyBase(PolicyNetwork): 176 | """ 177 | Base class for Gaussian policy. 178 | 179 | Desired network needs to be implemented. 
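    Subclasses implement forward(obs) returning a (mean, log_std) pair; the
    methods below then build a diagonal-covariance MultivariateNormal with
    covariance exp(log_std) * I over the action dimensions for sampling,
    log-probabilities, and entropy.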
180 | """ 181 | 182 | def __init__(self, action_dim): 183 | 184 | super().__init__() 185 | 186 | self.action_dim = action_dim 187 | 188 | def _get_covs(self, log_stds): 189 | batch_size = log_stds.shape[0] 190 | stds = log_stds.exp().reshape(batch_size, 1, 1) 191 | covs = stds * torch.eye(self.action_dim).repeat(batch_size, 1, 1) 192 | return covs 193 | 194 | def sample(self, obs, no_log_prob=False): 195 | mean, log_std = self.forward(obs) 196 | cov = log_std.exp() * torch.eye(self.action_dim) 197 | dist = td.MultivariateNormal(mean, cov) 198 | action = dist.rsample() 199 | return action if no_log_prob else (action, dist.log_prob(action)) 200 | 201 | def log_probs(self, obs, actions): 202 | means, log_stds = self.forward(obs) 203 | covs = self._get_covs(log_stds) 204 | dists = td.MultivariateNormal(means, covs) 205 | return dists.log_prob(actions) 206 | 207 | def entropy(self, obs): 208 | means, log_stds = self.forward(obs) 209 | covs = self._get_covs(log_stds) 210 | dists = td.MultivariateNormal(means, covs) 211 | return dists.entropy() 212 | 213 | 214 | class GaussianPolicy(GaussianPolicyBase): 215 | """ 216 | Gaussian policy using a two-layer, two-headed MLP with ReLU activation. 217 | """ 218 | 219 | def __init__(self, obs_dim, action_dim, 220 | min_action_val=-1.0, 221 | max_action_val=1.0, 222 | hidden_layer1_size=64, 223 | hidden_layer2_size=64): 224 | 225 | super().__init__(action_dim) 226 | 227 | self.base_net = nn.Sequential( 228 | nn.Linear(obs_dim, hidden_layer1_size), 229 | nn.ReLU(), 230 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 231 | nn.ReLU(), 232 | ) 233 | 234 | self.mean_net = nn.Sequential( 235 | nn.Linear(hidden_layer2_size, action_dim), 236 | nn.Hardtanh(min_action_val, max_action_val), 237 | ) 238 | 239 | self.log_std_net = nn.Sequential( 240 | nn.Linear(hidden_layer2_size, 1), 241 | ) 242 | 243 | def forward(self, obs): 244 | x = self.base_net(obs) 245 | mean = self.mean_net(x) 246 | log_std = self.log_std_net(x) 247 | return mean, log_std 248 | 249 | 250 | # class TruncatedNormalPolicyBase(PolicyNetwork): 251 | # """ 252 | # Base class for TruncatedNormal policy. Action dimension must be 1. 253 | # Uses a function (e.g., a CBF) to convert observations into bounds for 254 | # the TruncatedNormal distribution. 255 | 256 | # Desired network needs to be implemented. 257 | # """ 258 | 259 | # def __init__(self, constraint_fn): 260 | 261 | # super().__init__() 262 | 263 | # self.constraint_fn = self._vectorize_f(constraint_fn) 264 | 265 | # def _get_dist(self, obs): 266 | # mean, log_std = self.forward(obs) 267 | # std = log_std.exp() 268 | # lb, ub = self.constraint_fn(obs) 269 | # dist = tn(mean, std, lb, ub) 270 | 271 | # return dist 272 | 273 | # def sample(self, obs, no_log_prob=False): 274 | # dist = self._get_dist(obs) 275 | # lb = dist.scale * dist.a + dist.loc 276 | # ub = dist.scale * dist.b + dist.loc 277 | # action = dist.rsample() 278 | 279 | # for _ in range(100): 280 | # if (action > lb) and (action < ub): 281 | # break 282 | # if (action < lb) or (action > ub): 283 | # print("OOPS! 
Resampling...") 284 | # action = lb + (ub - lb) * torch.rand(size=(1, 1), requires_grad=True) 285 | 286 | # try: 287 | # log_prob = dist.log_prob(action) 288 | # except ValueError: 289 | # import pdb; pdb.set_trace() 290 | 291 | # return action if no_log_prob else (action, log_prob) 292 | 293 | # def log_probs(self, obs, actions): 294 | # dists = self._get_dist(obs) 295 | # return dists.log_prob(actions).flatten() 296 | 297 | # def entropy(self, obs): 298 | # dists = self._get_dist(obs) 299 | # return dists.entropy 300 | 301 | # def _vectorize_f(self, f): 302 | # """ 303 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 304 | # scalars into a vectorized function accepting batches of 305 | # torch tensorized arrays and output pairs of torch tensors. 306 | # """ 307 | 308 | # def vectorized_cbf(obs): 309 | 310 | # obs = obs.cpu().detach().numpy() 311 | # lbs, ubs = [], [] 312 | 313 | # if len(obs.shape) == 1: 314 | # batch_size = 1 315 | # lb, ub = f(obs) 316 | # lbs.append(lb) 317 | # ubs.append(ub) 318 | 319 | # else: 320 | # batch_size = obs.shape[0] 321 | # for i in range(batch_size): 322 | # lb, ub = f(obs[i]) 323 | # lbs.append(lb) 324 | # ubs.append(ub) 325 | 326 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) 327 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) 328 | 329 | # return lbs, ubs 330 | 331 | # return vectorized_cbf 332 | 333 | 334 | # class TruncatedNormalPolicy(TruncatedNormalPolicyBase): 335 | # """ 336 | # TruncatedNormal policy using a two-layer, two-headed MLP with ReLU 337 | # activation. Action dimension must be 1. 338 | # """ 339 | 340 | # def __init__(self, obs_dim, constraint_fn, 341 | # hidden_layer1_size=64, 342 | # hidden_layer2_size=64, 343 | # mean_min=-np.inf, 344 | # mean_max=np.inf, 345 | # log_std_lb=-10, 346 | # log_std_ub=3): 347 | 348 | # super().__init__(constraint_fn) 349 | 350 | # self.base_net = nn.Sequential( 351 | # nn.Linear(obs_dim, hidden_layer1_size), 352 | # nn.ReLU(), 353 | # nn.Linear(hidden_layer1_size, hidden_layer2_size), 354 | # nn.ReLU(), 355 | # ) 356 | 357 | # self.mean_net = nn.Sequential( 358 | # nn.Linear(hidden_layer2_size, 1), 359 | # # nn.Hardtanh(min_val=mean_min, max_val=mean_max) 360 | # ) 361 | 362 | # self.log_std_net = nn.Sequential( 363 | # nn.Linear(hidden_layer2_size, 1), 364 | # # nn.Hardtanh(min_val=log_std_lb, max_val=log_std_ub) 365 | # ) 366 | 367 | # def init_weights(m): 368 | # if isinstance(m, nn.Linear): 369 | # torch.nn.init.normal_(m.weight, std=1.0) 370 | 371 | # self.base_net.apply(init_weights) 372 | # self.mean_net.apply(init_weights) 373 | # self.log_std_net.apply(init_weights) 374 | 375 | # def forward(self, obs): 376 | # x = self.base_net(obs) 377 | # mean = self.mean_net(x) 378 | # log_std = self.log_std_net(x) 379 | # return mean, log_std 380 | 381 | 382 | class BetaPolicyBase(PolicyNetwork): 383 | """ 384 | Base class for Beta policy. 385 | 386 | Desired network needs to be implemented. 
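    constraint_fn maps a single 1D (numpy) observation to a pair
    (lower_bounds, upper_bounds) over the action dimensions; Beta samples on
    [0, 1] are then rescaled as action = lb + (ub - lb) * x. A toy constraint
    function with fixed bounds (purely illustrative, not one used in the
    experiments) could look like:

        def constraint_fn(obs):
            # fixed bounds for a 2-D action space
            return [-1.0, -1.0], [1.0, 1.0]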
387 | """ 388 | 389 | def __init__(self, constraint_fn, action_dim, enable_cuda=False): 390 | 391 | super().__init__() 392 | 393 | self.device = torch.device( 394 | 'cuda' if torch.cuda.is_available() and enable_cuda \ 395 | else 'cpu') 396 | self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 397 | self.action_dim = action_dim 398 | 399 | def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 400 | """ 401 | Converts a function f defined on 1D numpy arrays and outputting pairs of 402 | scalars into a vectorized function accepting batches of 403 | torch tensorized arrays and output pairs of torch tensors. 404 | """ 405 | 406 | def vectorized_f(obs, action_dim): #--vipul :added action_dim 407 | 408 | obs = obs.cpu().detach().numpy() 409 | 410 | if len(obs.shape) == 1: # check to see if obs is a batch or single obs 411 | batch_size = 1 412 | lbs, ubs = f(obs) 413 | lbs=np.array(lbs) 414 | ubs=np.array(ubs) 415 | #lbs = -5 416 | #ubs = 5 417 | 418 | else: 419 | batch_size = obs.shape[0] 420 | lbs = np.zeros([batch_size, self.action_dim]) 421 | ubs = np.zeros([batch_size, self.action_dim]) 422 | for i in range(batch_size): 423 | lbs[i], ubs[i] = f(obs[i]) 424 | #lbs[i] = -5 425 | #ubs[i] = 5 426 | 427 | lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 428 | ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 429 | lbs = lbs.to(self.device) 430 | ubs = ubs.to(self.device) 431 | 432 | return lbs, ubs 433 | 434 | return vectorized_f 435 | 436 | def sample(self, obs, action_dim, no_log_prob=False): 437 | """ 438 | Sample from independent Beta distributions across each action_dim. 439 | """ 440 | 441 | assert len(obs.shape) == 1, 'obs must be a flat array' 442 | 443 | alphas, betas = self.forward(obs) 444 | alphas, betas = torch.flatten(alphas), torch.flatten(betas) 445 | dists = [ 446 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 447 | ] 448 | 449 | #from original implementation 450 | #to see if everything works well! 
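        # Note: the vectorized Beta constructed below (dist_orig) is what is
        # actually sampled and returned at the end of this method; the
        # per-dimension list of distributions above appears to be kept only
        # for reference.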
451 | dist_orig = td.Beta(alphas, betas) 452 | action_orig = dist_orig.rsample() 453 | log_prob_orig = dist_orig.log_prob(action_orig) 454 | #from original implementation 455 | 456 | action_along_dims = [dist.rsample() for dist in dists] 457 | action = torch.tensor(action_along_dims, requires_grad=True) 458 | #action = torch.tensor(action_along_dims) 459 | log_prob = torch.sum(torch.tensor([ 460 | dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 461 | ], requires_grad=True)) 462 | # log_prob = torch.sum(torch.tensor([ 463 | # dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 464 | # ])) 465 | 466 | lb, ub = self.constraint_fn(obs, action_dim) 467 | action = lb + (ub - lb) * action 468 | 469 | #below line commented out 470 | #return action if no_log_prob else (action, log_prob) 471 | 472 | 473 | #new lines added - 08.29.2023 474 | action_orig = lb + (ub - lb) * action_orig 475 | return action_orig if no_log_prob else (action_orig, log_prob_orig) 476 | 477 | def log_probs(self, obs, actions, action_dim): 478 | alphas_arr, betas_arr = self.forward(obs) 479 | #adding one line 480 | dists = td.Beta(alphas_arr, betas_arr) 481 | 482 | #commenting the below lines 08.26.2023 483 | dists_new = [] 484 | for i in range(alphas_arr.shape[0]): 485 | alphas = alphas_arr[i] 486 | betas = betas_arr[i] 487 | dists_new.append([ 488 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 489 | ]) 490 | 491 | lbs, ubs = self.constraint_fn(obs, action_dim) 492 | # if lbs.device!=actions.device: 493 | # lbs = lbs.to('cuda:0') 494 | # ubs = ubs.to('cuda:0') 495 | actions = (actions - lbs) / (ubs - lbs) 496 | actions = actions.clip(0, 1) 497 | 498 | #commenting the below lines 08.26.2023 499 | log_probs = [] 500 | for action, action_dists in zip(actions, dists_new): 501 | # log_probs.append( 502 | # torch.sum(torch.tensor([ 503 | # dim_dist.log_prob(dim_action) \ 504 | # for dim_dist, dim_action in zip(action_dists, action) 505 | # ], requires_grad=True)) 506 | # ) 507 | log_probs.append( 508 | torch.sum(torch.tensor([ 509 | dim_dist.log_prob(dim_action) \ 510 | for dim_dist, dim_action in zip(action_dists, action) 511 | ])) 512 | ) 513 | log_probs = torch.tensor(log_probs, requires_grad=True) 514 | # #log_probs = torch.tensor(log_probs) 515 | 516 | #return log_probs #commenting 08/25/23 517 | return dists.log_prob(actions).flatten() 518 | 519 | def entropy(self, obs): 520 | """ 521 | Returns sum of entropies along each independent action dimension. 522 | """ 523 | alphas_arr, betas_arr = self.forward(obs) 524 | dists = [] 525 | for i in range(alphas_arr.shape[0]): 526 | alphas = alphas_arr[i] 527 | betas = betas_arr[i] 528 | dists.append([ 529 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 530 | ]) 531 | entropies = torch.tensor( 532 | [torch.sum(torch.tensor([dist.entropy() for dist in dist_list])) \ 533 | for dist_list in dists] 534 | ) 535 | return entropies 536 | 537 | # # TODO: make this consistent with sample & log_probs defined above 538 | # alphas, betas = self.forward(obs) 539 | # dists = td.Beta(alphas, betas) 540 | # return dists.entropy() 541 | 542 | 543 | ### NOTE: this is the BetaPolicyBase giving us the Dirichlet error 544 | # class BetaPolicyBase(PolicyNetwork): 545 | # """ 546 | # Base class for Beta policy. 547 | # 548 | # Desired network needs to be implemented. 
549 | # """ 550 | # 551 | # def __init__(self, constraint_fn, action_dim): 552 | # 553 | # super().__init__() 554 | # 555 | # self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 556 | # self.action_dim = action_dim 557 | # #assert self.action_dim == 1, 'Action dimension must be 1' #-Line commented out--vipul 558 | # 559 | # def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 560 | # """ 561 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 562 | # scalars into a vectorized function accepting batches of 563 | # torch tensorized arrays and output pairs of torch tensors. 564 | # """ 565 | # 566 | # def vectorized_f(obs, action_dim): #--vipul :added action_dim 567 | # 568 | # obs = obs.cpu().detach().numpy() 569 | # 570 | # if len(obs.shape) == 1: # check to see if obs is a batch or single obs 571 | # batch_size = 1 572 | # lbs, ubs = f(obs) 573 | # 574 | # else: 575 | # batch_size = obs.shape[0] 576 | # lbs = np.zeros([batch_size, self.action_dim]) 577 | # ubs = np.zeros([batch_size, self.action_dim]) 578 | # for i in range(batch_size): 579 | # lbs[i], ubs[i] = f(obs[i]) 580 | # 581 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 582 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 583 | # 584 | # return lbs, ubs 585 | # 586 | # return vectorized_f 587 | # 588 | # def sample(self, obs, action_dim, no_log_prob=False): #sample from interval is taken with a beta distribution -vipul 589 | # #sample definition changed to take in argument action_dim --vipul 590 | # alpha, beta = self.forward(obs) 591 | # dist = td.Beta(alpha, beta) 592 | # action = dist.rsample() 593 | # #log_prob = dist.log_prob(action) #original code 594 | # log_prob_vec = dist.log_prob(action) #vipul 595 | # log_prob = sum(log_prob_vec) 596 | # lb, ub = self.constraint_fn(obs, action_dim) 597 | # action = lb + (ub - lb) * action #need to be changed? --vipul 598 | # #code change nedded! -vipul 599 | # return action if no_log_prob else (action, log_prob) 600 | # 601 | # def log_probs(self, obs, actions, action_dim): #function modified 602 | # alphas, betas = self.forward(obs) 603 | # dists = td.Beta(alphas, betas) 604 | # lbs, ubs = self.constraint_fn(obs, action_dim) 605 | # actions = (actions - lbs) / (ubs - lbs) 606 | # #need to be changed? --vipul 607 | # #yes, change needed! 608 | # return sum(dists.log_prob(actions).flatten()) 609 | # 610 | # def entropy(self, obs): 611 | # alphas, betas = self.forward(obs) 612 | # dists = td.Beta(alphas, betas) 613 | # return dists.entropy() 614 | ### 615 | 616 | 617 | class BetaPolicy(BetaPolicyBase): 618 | """ 619 | Beta policy using a two-layer, two-headed MLP with ReLU activation. 
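    (Note: the hidden layers defined below actually use Tanh activations, and
    the alpha/beta heads return 1.0 + Softplus(x), so both concentration
    parameters are strictly greater than 1.)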
620 | """ 621 | 622 | def __init__(self, obs_dim, constraint_fn, action_dim, #vipul : action_dim=1 replaced by just action_dim 623 | hidden_layer1_size=64, 624 | hidden_layer2_size=64): 625 | 626 | super().__init__(constraint_fn, action_dim=action_dim) 627 | 628 | self.base_net = nn.Sequential( 629 | nn.Linear(obs_dim, hidden_layer1_size), 630 | nn.Tanh(), 631 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 632 | nn.Tanh(), 633 | ) 634 | 635 | self.alpha_net = nn.Sequential( 636 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 637 | nn.Softplus(), 638 | ) 639 | 640 | self.beta_net = nn.Sequential( 641 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 642 | nn.Softplus(), 643 | ) 644 | 645 | def forward(self, obs): 646 | 647 | x = self.base_net(obs) 648 | alpha = 1.0 + self.alpha_net(x) #is there any change needed? --vipul 649 | beta = 1.0 + self.beta_net(x) #is there any change needed? --vipul 650 | 651 | return alpha, beta 652 | 653 | 654 | class CategoricalPolicy(PolicyNetwork): 655 | """ 656 | Base class for categorical policy. 657 | 658 | Desired network needs to be implemented. 659 | """ 660 | 661 | def __init__(self, num_actions): 662 | 663 | super().__init__() 664 | 665 | self.num_actions = num_actions 666 | 667 | def sample(self, obs, no_log_prob=False): 668 | logits = self.forward(obs) 669 | dist = td.Categorical(logits=logits) 670 | action = dist.sample(sample_shape=torch.tensor([1])) 671 | return action if no_log_prob else (action, dist.log_prob(action)) 672 | 673 | def log_probs(self, obs, actions): 674 | dists = td.Categorical(logits=self.forward(obs)) 675 | return dists.log_prob(actions.flatten()) 676 | 677 | def entropy(self, obs): 678 | dists = td.Categorical(logits=self.forward(obs)) 679 | return dists.entropy() 680 | 681 | 682 | class CategoricalPolicyTwoLayer(CategoricalPolicy): 683 | """ 684 | Categorical policy using a fully connected two-layer network. 
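    forward() returns unnormalized logits over num_actions; the base class
    wraps them in torch.distributions.Categorical for sampling,
    log-probabilities, and entropy. Weights are initialized from a normal
    distribution with standard deviation init_std.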
685 | """ 686 | 687 | def __init__(self, state_dim, num_actions, 688 | hidden_layer1_size=64, 689 | hidden_layer2_size=64, 690 | init_std=0.001): 691 | 692 | super().__init__(num_actions) 693 | 694 | self.init_std = init_std 695 | 696 | self.linear1 = nn.Linear(state_dim, hidden_layer1_size) 697 | self.linear2 = nn.Linear(hidden_layer1_size, hidden_layer2_size) 698 | self.linear3 = nn.Linear(hidden_layer2_size, num_actions) 699 | nn.init.normal_(self.linear1.weight, std=init_std) 700 | nn.init.normal_(self.linear2.weight, std=init_std) 701 | nn.init.normal_(self.linear3.weight, std=init_std) 702 | 703 | def forward(self, state): 704 | x = F.relu(self.linear1(state)) 705 | x = F.relu(self.linear2(x)) 706 | output = self.linear3(x) 707 | return output 708 | 709 | 710 | class PPOBase: 711 | def __init__(self, 712 | env, 713 | policy, 714 | value_function, 715 | policy_lr, 716 | value_lr, 717 | entropy_coef=0.0, 718 | clip_range=0.2, 719 | n_epochs=10, 720 | batch_size=64, 721 | weight_decay=0.0, 722 | gamma=0.99, 723 | buffer_size=2048, 724 | enable_cuda=True, 725 | policy_optimizer=torch.optim.Adam, 726 | value_optimizer=torch.optim.Adam, 727 | grad_clip_radius=None): 728 | 729 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 730 | 'returning information about CBF-related safety.') 731 | 732 | self.env = env 733 | self.pi = policy 734 | self.v = value_function 735 | self.entropy_coef = entropy_coef 736 | self.clip_range = clip_range 737 | self.n_epochs = n_epochs 738 | self.batch_size = batch_size 739 | 740 | self.__cuda_enabled = enable_cuda 741 | self.enable_cuda(self.__cuda_enabled, warn=False) 742 | # NOTE: self.device is defined when self.enable_cuda is called! 743 | 744 | self.pi_optim = policy_optimizer(self.pi.parameters(), 745 | lr=policy_lr, 746 | weight_decay=weight_decay) 747 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 748 | self.grad_clip_radius = grad_clip_radius 749 | 750 | self.rollout_buffer = RolloutBuffer( 751 | buffer_size, 752 | env.observation_space, 753 | env.action_space, 754 | device=self.device, 755 | gamma=gamma 756 | ) 757 | 758 | @property 759 | def cuda_enabled(self): 760 | return self.__cuda_enabled 761 | 762 | def enable_cuda(self, enable_cuda=True, warn=True): 763 | """Enable or disable cuda and update models.""" 764 | 765 | if warn: 766 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 767 | "initializing optimizers can give errors when using " 768 | "optimizers other than SGD or Adam!") 769 | 770 | self.__cuda_enabled = enable_cuda 771 | self.device = torch.device( 772 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 773 | else 'cpu') 774 | self.pi.to(self.device) 775 | self.v.to(self.device) 776 | 777 | def load_models(self, filename, enable_cuda=True, continue_training=True): 778 | """ 779 | Load policy and value functions. Copy them to target functions. 780 | 781 | This method is for evaluation only. Use load_checkpoint to continue 782 | training. 
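        The file is expected to contain at least the 'pi_state_dict' and
        'v_state_dict' entries written by save_checkpoint; both networks are
        put into eval mode after loading. (The continue_training argument is
        currently ignored by this method.)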
783 | """ 784 | 785 | models = torch.load(filename) 786 | 787 | self.pi.load_state_dict(models['pi_state_dict']) 788 | self.v.load_state_dict(models['v_state_dict']) 789 | 790 | self.pi.eval() 791 | self.v.eval() 792 | 793 | self.enable_cuda(enable_cuda, warn=False) 794 | 795 | def save_checkpoint(self, filename): 796 | """Save state_dicts of models and optimizers.""" 797 | 798 | torch.save({ 799 | 'using_cuda': self.__cuda_enabled, 800 | 'pi_state_dict': self.pi.state_dict(), 801 | 'pi_optimizer_state_dict': self.pi_optim.state_dict(), 802 | 'v_state_dict': self.v.state_dict(), 803 | 'v_optimizer_state_dict': self.v_optim.state_dict(), 804 | }, filename) 805 | 806 | def load_checkpoint(self, filename, continue_training=True): 807 | """Load state_dicts for models and optimizers.""" 808 | 809 | checkpoint = torch.load(filename) 810 | 811 | self.pi.load_state_dict(checkpoint['pi_state_dict']) 812 | self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict']) 813 | self.v.load_state_dict(models['v_state_dict']) 814 | self.v_optim.load_state_dict(models['v_optimizer_state_dict']) 815 | 816 | if continue_training: 817 | self.pi.train() 818 | self.v.train() 819 | else: 820 | self.pi.eval() 821 | self.v.eval() 822 | 823 | self.enable_cuda(checkpoint['using_cuda'], warn=False) 824 | 825 | def collect_rollout(self, env, rollout_length): 826 | """ 827 | Perform a rollout and fill the rollout buffer. 828 | """ 829 | 830 | self._last_obs = env.reset() 831 | self._last_episode_start = np.zeros(1) 832 | n_steps = 0 833 | self.rollout_buffer.reset() 834 | 835 | num_unsafe_steps = 0 836 | x_t=[] 837 | y_t=[] 838 | while n_steps < rollout_length: 839 | 840 | #vipul--- I removed these lines for now! 841 | #as they are pendulum specific 842 | # if env._state[0] < env.theta_safety_bounds[0] - 1e-8 or \ 843 | # env._state[0] > env.theta_safety_bounds[1] + 1e-8: 844 | # num_unsafe_steps += 1 845 | 846 | action_dim=get_action_dim(env.action_space) 847 | #action_dim added --vipul 848 | 849 | with torch.no_grad(): 850 | # Convert to pytorch tensor or to TensorDict 851 | obs_tensor = obs_as_tensor(self._last_obs, self.device).float() 852 | action, log_prob = self.pi.sample(obs_tensor, action_dim) 853 | value = self.v(obs_tensor) 854 | action = action.cpu().numpy() 855 | 856 | # Rescale and perform action 857 | clipped_action = action 858 | # Clip the actions to avoid out of bound error 859 | if isinstance(self.env.action_space, gym.spaces.Box): 860 | clipped_action = np.clip(action, self.env.action_space.low, 861 | self.env.action_space.high) 862 | elif isinstance(self.env.action_space, gym.spaces.Discrete): 863 | clipped_action = int(clipped_action) 864 | 865 | new_obs, reward, done, info = env.step(clipped_action) 866 | 867 | #adding the animation code here --vipul 868 | x_t.append(new_obs[0]) 869 | y_t.append(new_obs[1]) 870 | # if abs(new_obs[0]-env.obstacle[0])<0.1 and abs(new_obs[1]-env.obstacle[1])<0.1: 871 | # print("crash") 872 | # break 873 | 874 | n_steps += 1 875 | 876 | if isinstance(self.env.action_space, gym.spaces.Discrete): 877 | # Reshape in case of discrete action 878 | action = action.reshape(-1, 1) 879 | 880 | self.rollout_buffer.add(self._last_obs, action, reward, 881 | self._last_episode_start, value, log_prob) 882 | self._last_obs = new_obs.flatten() 883 | self._last_episode_start = done 884 | 885 | if n_steps == rollout_length: 886 | env.reset() 887 | 888 | #code for plotting the quadrotor trajectory in an episode 889 | # plt.xlim(-200,200) 890 | # plt.ylim(-200,200) 891 | # 
plt.plot(x_t, y_t, color = 'red') 892 | # plt.pause(1) 893 | plt.xlim(-1,14) 894 | plt.ylim(-1,14) 895 | plt.xlabel('X axis') 896 | plt.ylabel('Y-axis') 897 | plt.plot(x_t,y_t) 898 | # plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 899 | # plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 900 | 901 | 902 | # def f(x, y, xa, yb, a, b): 903 | # return (x - xa)**4/a**4 + (y - yb)**4/b**4 904 | 905 | # # Define the point around which to plot 906 | # xa, yb = env.obstacle[0], env.obstacle[1] 907 | 908 | # # Define the range of x and y values to plot 909 | # x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 910 | # y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 911 | 912 | # # Create a grid of x and y values 913 | # X, Y = np.meshgrid(x_vals, y_vals) 914 | 915 | # # Evaluate the function at each point in the grid 916 | # Z = f(X, Y, xa, yb, env.a_d, env.b_d) 917 | 918 | # # Plot the function as a contour plot 919 | # plt.contour(X, Y, Z, levels=[env.safety_dist]) 920 | #quadrotor.plot_step(new_obs, u_hat_acc, state_hist, plot_handle) 921 | 922 | self.rollout_buffer.compute_returns_and_advantage(last_value=value, 923 | done=done) 924 | 925 | safety_rate = 100 * (1 - num_unsafe_steps / rollout_length) 926 | 927 | #return np.sum(self.rollout_buffer.rewards) / np.sum( 928 | #self.rollout_buffer.episode_starts), safety_rate 929 | return np.sum(self.rollout_buffer.rewards), safety_rate 930 | 931 | def train(self): 932 | """ 933 | Train on the current rollout buffer. 934 | """ 935 | #action_dim = get_action_dim(self.action_space) 936 | for epoch in range(self.n_epochs): 937 | 938 | # Do a complete pass on the rollout buffer 939 | for rollout_data in self.rollout_buffer.get(self.batch_size): 940 | 941 | actions = rollout_data.actions 942 | obs = rollout_data.observations 943 | values = self.v(obs).flatten() 944 | log_probs = self.pi.log_probs(obs, actions, actions.shape[1]) 945 | 946 | entropies = self.pi.entropy(obs) 947 | if log_probs.device!=actions.device: 948 | log_probs=log_probs.to('cuda:0') 949 | entropies=entropies.to('cuda:0') 950 | advantages = rollout_data.advantages 951 | 952 | #08.14.2023 Vipul 953 | #I commented out the below line 954 | #08.26.2023 955 | #I uncommented the below line 956 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 957 | 958 | # ratio between old and new policy, should be one at the first iteration 959 | ratio = torch.exp(log_probs - rollout_data.old_log_prob) 960 | 961 | # if ratio[0]>1.001: 962 | # print(ratio) 963 | # import pdb; pdb.set_trace() 964 | # clipped surrogate loss 965 | policy_loss_1 = advantages * ratio 966 | policy_loss_2 = advantages * torch.clamp(ratio, 967 | 1 - self.clip_range, 968 | 1 + self.clip_range) 969 | policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \ 970 | self.entropy_coef * entropies.mean() 971 | #08/08/2023 -- Changed the entropy coefficient to 0.01--vipul 972 | self.pi_optim.zero_grad() 973 | policy_loss.backward() 974 | # Clip grad norm 975 | if self.grad_clip_radius is not None: 976 | torch.nn.utils.clip_grad_norm_(self.pi.parameters(), 977 | self.grad_clip_radius) 978 | self.pi_optim.step() 979 | 980 | value_loss = F.mse_loss(rollout_data.returns, values) 981 | 982 | self.v_optim.zero_grad() 983 | value_loss.backward() 984 | # Clip grad norm 985 | if self.grad_clip_radius is not None: 986 | torch.nn.utils.clip_grad_norm_(self.v.parameters(), 987 | self.grad_clip_radius) 988 | self.v_optim.step() 989 | 990 | #import pdb; pdb.set_trace() 991 | 992 | 
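    # A minimal outer training loop for this class might look as follows
    # (names and hyperparameter values are illustrative only; the actual
    # experiments are driven from the experiment scripts in this repository):
    #
    #     agent = PPOBase(env, policy, value_function,
    #                     policy_lr=3e-4, value_lr=3e-4,
    #                     buffer_size=2048)
    #     for episode in range(n_episodes):
    #         total_reward, safety_rate = agent.collect_rollout(env, rollout_length=2048)
    #         agent.train()
    #
    # Note that rollout_length should match buffer_size so that the rollout
    # buffer is completely filled before train() is called.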
#this definition is copied from stable-baselines3' website and DOES NOT belong to this code 993 | # def obs_as_tensor( 994 | # obs: Union[np.ndarray, Dict[Union[str, int], np.ndarray]], device: th.device 995 | # ) -> Union[th.Tensor, TensorDict]: 996 | # """ 997 | # Moves the observation to the given device. 998 | 999 | # :param obs: 1000 | # :param device: PyTorch device 1001 | # :return: PyTorch tensor of the observation on a desired device. 1002 | # """ 1003 | # if isinstance(obs, np.ndarray): 1004 | # return th.as_tensor(obs, device=device) 1005 | # elif isinstance(obs, dict): 1006 | # return {key: th.as_tensor(_obs, device=device) for (key, _obs) in obs.items()} 1007 | # else: 1008 | # raise Exception(f"Unrecognized type of observation {type(obs)}") -------------------------------------------------------------------------------- /agents/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributions as td 4 | from torch.nn import functional as F 5 | import gym 6 | from gym import spaces 7 | import numpy as np 8 | from typing import NamedTuple 9 | import warnings 10 | from matplotlib import pyplot as plt 11 | from matplotlib.animation import FuncAnimation 12 | import datetime 13 | import os 14 | import math 15 | import csv 16 | 17 | from stable_baselines3.common.utils import obs_as_tensor 18 | from stable_baselines3.common.preprocessing import ( 19 | get_obs_shape, get_action_dim 20 | ) 21 | 22 | 23 | #from agents.TruncatedNormal import TruncatedNormal as tn 24 | 25 | from wesutils import two_layer_net 26 | 27 | class RolloutBufferSamples(NamedTuple): 28 | observations: torch.Tensor 29 | actions: torch.Tensor 30 | old_values: torch.Tensor 31 | old_log_prob: torch.Tensor 32 | advantages: torch.Tensor 33 | returns: torch.Tensor 34 | 35 | 36 | class RolloutBuffer: 37 | 38 | def __init__(self, 39 | buffer_size, 40 | observation_space, 41 | action_space, 42 | gamma=0.90, 43 | device='cpu'): 44 | 45 | self.buffer_size = buffer_size 46 | self.observation_space = observation_space 47 | self.action_space = action_space 48 | self.gamma = gamma 49 | self.device = device 50 | self.obs_shape = get_obs_shape(self.observation_space) 51 | self.action_dim = get_action_dim(self.action_space) 52 | 53 | self.reset() 54 | 55 | def reset(self): 56 | 57 | self.observations = np.zeros( 58 | (self.buffer_size,) + self.obs_shape, dtype=np.float32 59 | ) 60 | self.actions = np.zeros( 61 | (self.buffer_size, self.action_dim), dtype=np.float32 62 | ) 63 | self.rewards = np.zeros( 64 | (self.buffer_size,), dtype=np.float32 65 | ) 66 | self.episode_starts = np.zeros( 67 | (self.buffer_size,), dtype=np.float32 68 | ) 69 | self.values = np.zeros( 70 | (self.buffer_size,), dtype=np.float32 71 | ) 72 | self.log_probs = np.zeros( 73 | (self.buffer_size,), dtype=np.float32 74 | ) 75 | self.advantages = np.zeros( 76 | (self.buffer_size,), dtype=np.float32 77 | ) 78 | 79 | self.full = False 80 | self.pos = 0 81 | 82 | def compute_returns_and_advantage(self, last_value, done): 83 | 84 | last_value = last_value.clone().cpu().numpy().flatten() 85 | 86 | discounted_reward = 0 87 | for step in reversed(range(self.buffer_size)): 88 | if step == self.buffer_size - 1: 89 | next_non_terminal = 1.0 - done 90 | next_value = last_value 91 | else: 92 | next_non_terminal = 1.0 - self.episode_starts[step + 1] 93 | next_value = self.values[step + 1] 94 | discounted_reward = self.rewards[step] + \ 95 | self.gamma * discounted_reward * 
next_non_terminal 96 | self.advantages[step] = discounted_reward - self.values[step] 97 | self.returns = self.advantages + self.values 98 | 99 | def add(self, obs, action, reward, episode_start, value, log_prob): 100 | 101 | if len(log_prob.shape) == 0: 102 | log_prob = log_prob.reshape(-1, 1) 103 | 104 | if isinstance(self.observation_space, spaces.Discrete): 105 | obs = obs.reshape((1,) + self.obs_shape) 106 | 107 | self.observations[self.pos] = np.array(obs).copy() 108 | self.actions[self.pos] = np.array(action).copy() 109 | self.rewards[self.pos] = np.array(reward).copy() 110 | self.episode_starts[self.pos] = np.array(episode_start).copy() 111 | self.values[self.pos] = value.clone().cpu().numpy().flatten() 112 | self.log_probs[self.pos] = log_prob.clone().cpu().numpy() 113 | self.pos += 1 114 | if self.pos == self.buffer_size: 115 | self.full = True 116 | 117 | def get(self, batch_size=None): 118 | assert self.full, "" 119 | indices = np.random.permutation(self.buffer_size) 120 | 121 | # Return everything, don't create minibatches 122 | if batch_size is None: 123 | batch_size = self.buffer_size 124 | 125 | start_idx = 0 126 | while start_idx < self.buffer_size: 127 | yield self._get_samples(indices[start_idx : start_idx + batch_size]) 128 | start_idx += batch_size 129 | 130 | def _get_samples(self, batch_inds): 131 | data = ( 132 | self.observations[batch_inds], 133 | self.actions[batch_inds], 134 | self.values[batch_inds].flatten(), 135 | self.log_probs[batch_inds].flatten(), 136 | self.advantages[batch_inds].flatten(), 137 | self.returns[batch_inds].flatten(), 138 | ) 139 | return RolloutBufferSamples(*tuple(map(self.to_torch, data))) 140 | 141 | def to_torch(self, array, copy=True): 142 | if copy: 143 | return torch.tensor(array).to(self.device) 144 | return torch.as_tensor(array).to(self.device) 145 | 146 | 147 | class PolicyNetwork(nn.Module): 148 | """Base class for stochastic policy networks.""" 149 | 150 | def __init__(self): 151 | super().__init__() 152 | 153 | def forward(self, state): 154 | """Take state as input, then output the parameters of the policy.""" 155 | 156 | raise NotImplemented("forward not implemented.") 157 | 158 | def sample(self, state): 159 | """ 160 | Sample an action based on the model parameters given the current state. 161 | """ 162 | 163 | raise NotImplemented("sample not implemented.") 164 | 165 | def log_probs(self, obs, actions): 166 | """ 167 | Return log probabilities for each state-action pair. 168 | """ 169 | 170 | raise NotImplemented("log_probs not implemented.") 171 | 172 | def entropy(self, obs): 173 | """ 174 | Return entropy of the policy for each state. 175 | """ 176 | 177 | raise NotImplemented("entropy not implemented.") 178 | 179 | 180 | class GaussianPolicyBase(PolicyNetwork): 181 | """ 182 | Base class for Gaussian policy. 183 | 184 | Desired network needs to be implemented. 
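    As in the pendulum variant of this class, subclasses provide
    forward(obs) -> (mean, log_std), and a diagonal Gaussian with covariance
    exp(log_std) * I is used for sampling, log-probabilities, and entropy.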
185 | """ 186 | 187 | def __init__(self, action_dim): 188 | 189 | super().__init__() 190 | 191 | self.action_dim = action_dim 192 | 193 | def _get_covs(self, log_stds): 194 | batch_size = log_stds.shape[0] 195 | stds = log_stds.exp().reshape(batch_size, 1, 1) 196 | covs = stds * torch.eye(self.action_dim).repeat(batch_size, 1, 1) 197 | return covs 198 | 199 | def sample(self, obs, no_log_prob=False): 200 | mean, log_std = self.forward(obs) 201 | cov = log_std.exp() * torch.eye(self.action_dim) 202 | dist = td.MultivariateNormal(mean, cov) 203 | action = dist.rsample() 204 | return action if no_log_prob else (action, dist.log_prob(action)) 205 | 206 | def log_probs(self, obs, actions): 207 | means, log_stds = self.forward(obs) 208 | covs = self._get_covs(log_stds) 209 | dists = td.MultivariateNormal(means, covs) 210 | return dists.log_prob(actions) 211 | 212 | def entropy(self, obs): 213 | means, log_stds = self.forward(obs) 214 | covs = self._get_covs(log_stds) 215 | dists = td.MultivariateNormal(means, covs) 216 | return dists.entropy() 217 | 218 | 219 | class GaussianPolicy(GaussianPolicyBase): 220 | """ 221 | Gaussian policy using a two-layer, two-headed MLP with ReLU activation. 222 | """ 223 | 224 | def __init__(self, obs_dim, action_dim, 225 | min_action_val=-20.0 * np.array([1, 1]), 226 | max_action_val=20.0 * np.array([1, 1]), 227 | hidden_layer1_size=64, 228 | hidden_layer2_size=64): 229 | 230 | super().__init__(action_dim) 231 | 232 | self.base_net = nn.Sequential( 233 | nn.Linear(obs_dim, hidden_layer1_size), 234 | nn.ReLU(), 235 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 236 | nn.ReLU(), 237 | ) 238 | 239 | self.mean_net = nn.Sequential( 240 | nn.Linear(hidden_layer2_size, action_dim), 241 | nn.Hardtanh(min_action_val[0], max_action_val[0]), 242 | nn.Hardtanh(min_action_val[1], max_action_val[1]) 243 | ) 244 | 245 | self.log_std_net = nn.Sequential( 246 | nn.Linear(hidden_layer2_size, 1), 247 | ) 248 | 249 | def forward(self, obs): 250 | x = self.base_net(obs) 251 | mean = self.mean_net(x) 252 | log_std = self.log_std_net(x) 253 | return mean, log_std 254 | 255 | 256 | # class TruncatedNormalPolicyBase(PolicyNetwork): 257 | # """ 258 | # Base class for TruncatedNormal policy. Action dimension must be 1. 259 | # Uses a function (e.g., a CBF) to convert observations into bounds for 260 | # the TruncatedNormal distribution. 261 | 262 | # Desired network needs to be implemented. 263 | # """ 264 | 265 | # def __init__(self, constraint_fn): 266 | 267 | # super().__init__() 268 | 269 | # self.constraint_fn = self._vectorize_f(constraint_fn) 270 | 271 | # def _get_dist(self, obs): 272 | # mean, log_std = self.forward(obs) 273 | # std = log_std.exp() 274 | # lb, ub = self.constraint_fn(obs) 275 | # dist = tn(mean, std, lb, ub) 276 | 277 | # return dist 278 | 279 | # def sample(self, obs, no_log_prob=False): 280 | # dist = self._get_dist(obs) 281 | # lb = dist.scale * dist.a + dist.loc 282 | # ub = dist.scale * dist.b + dist.loc 283 | # action = dist.rsample() 284 | 285 | # for _ in range(100): 286 | # if (action > lb) and (action < ub): 287 | # break 288 | # if (action < lb) or (action > ub): 289 | # print("OOPS! 
Resampling...") 290 | # action = lb + (ub - lb) * torch.rand(size=(1, 1), requires_grad=True) 291 | 292 | # try: 293 | # log_prob = dist.log_prob(action) 294 | # except ValueError: 295 | # import pdb; pdb.set_trace() 296 | 297 | # return action if no_log_prob else (action, log_prob) 298 | 299 | # def log_probs(self, obs, actions): 300 | # dists = self._get_dist(obs) 301 | # return dists.log_prob(actions).flatten() 302 | 303 | # def entropy(self, obs): 304 | # dists = self._get_dist(obs) 305 | # return dists.entropy 306 | 307 | # def _vectorize_f(self, f): 308 | # """ 309 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 310 | # scalars into a vectorized function accepting batches of 311 | # torch tensorized arrays and output pairs of torch tensors. 312 | # """ 313 | 314 | # def vectorized_cbf(obs): 315 | 316 | # obs = obs.cpu().detach().numpy() 317 | # lbs, ubs = [], [] 318 | 319 | # if len(obs.shape) == 1: 320 | # batch_size = 1 321 | # lb, ub = f(obs) 322 | # lbs.append(lb) 323 | # ubs.append(ub) 324 | 325 | # else: 326 | # batch_size = obs.shape[0] 327 | # for i in range(batch_size): 328 | # lb, ub = f(obs[i]) 329 | # lbs.append(lb) 330 | # ubs.append(ub) 331 | 332 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) 333 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) 334 | 335 | # return lbs, ubs 336 | 337 | # return vectorized_cbf 338 | 339 | 340 | # class TruncatedNormalPolicy(TruncatedNormalPolicyBase): 341 | # """ 342 | # TruncatedNormal policy using a two-layer, two-headed MLP with ReLU 343 | # activation. Action dimension must be 1. 344 | # """ 345 | 346 | # def __init__(self, obs_dim, constraint_fn, 347 | # hidden_layer1_size=64, 348 | # hidden_layer2_size=64, 349 | # mean_min=-np.inf, 350 | # mean_max=np.inf, 351 | # log_std_lb=-10, 352 | # log_std_ub=3): 353 | 354 | # super().__init__(constraint_fn) 355 | 356 | # self.base_net = nn.Sequential( 357 | # nn.Linear(obs_dim, hidden_layer1_size), 358 | # nn.ReLU(), 359 | # nn.Linear(hidden_layer1_size, hidden_layer2_size), 360 | # nn.ReLU(), 361 | # ) 362 | 363 | # self.mean_net = nn.Sequential( 364 | # nn.Linear(hidden_layer2_size, 1), 365 | # # nn.Hardtanh(min_val=mean_min, max_val=mean_max) 366 | # ) 367 | 368 | # self.log_std_net = nn.Sequential( 369 | # nn.Linear(hidden_layer2_size, 1), 370 | # # nn.Hardtanh(min_val=log_std_lb, max_val=log_std_ub) 371 | # ) 372 | 373 | # def init_weights(m): 374 | # if isinstance(m, nn.Linear): 375 | # torch.nn.init.normal_(m.weight, std=1.0) 376 | 377 | # self.base_net.apply(init_weights) 378 | # self.mean_net.apply(init_weights) 379 | # self.log_std_net.apply(init_weights) 380 | 381 | # def forward(self, obs): 382 | # x = self.base_net(obs) 383 | # mean = self.mean_net(x) 384 | # log_std = self.log_std_net(x) 385 | # return mean, log_std 386 | 387 | 388 | class BetaPolicyBase(PolicyNetwork): 389 | """ 390 | Base class for Beta policy. 391 | 392 | Desired network needs to be implemented. 
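    constraint_fn maps a single 1D observation to per-dimension
    (lower_bounds, upper_bounds) action limits (in the experiments in this
    repository, an interval derived from a control barrier function); Beta
    samples on [0, 1] are rescaled into those limits as
    action = lb + (ub - lb) * x, and batched bounds are returned with shape
    (batch_size, action_dim).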
393 | """ 394 | 395 | def __init__(self, constraint_fn, action_dim, enable_cuda=False): 396 | 397 | super().__init__() 398 | 399 | self.device = torch.device( 400 | 'cuda' if torch.cuda.is_available() and enable_cuda \ 401 | else 'cpu') 402 | self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 403 | self.action_dim = action_dim 404 | 405 | def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 406 | """ 407 | Converts a function f defined on 1D numpy arrays and outputting pairs of 408 | scalars into a vectorized function accepting batches of 409 | torch tensorized arrays and output pairs of torch tensors. 410 | """ 411 | 412 | def vectorized_f(obs, action_dim): #--vipul :added action_dim 413 | 414 | obs = obs.cpu().detach().numpy() 415 | 416 | if len(obs.shape) == 1: # check to see if obs is a batch or single obs 417 | batch_size = 1 418 | lbs, ubs = f(obs) 419 | 420 | else: 421 | batch_size = obs.shape[0] 422 | lbs = np.zeros([batch_size, self.action_dim]) 423 | ubs = np.zeros([batch_size, self.action_dim]) 424 | for i in range(batch_size): 425 | lbs[i], ubs[i] = f(obs[i]) 426 | 427 | lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 428 | ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 429 | lbs = lbs.to(self.device) 430 | ubs = ubs.to(self.device) 431 | 432 | return lbs, ubs 433 | 434 | return vectorized_f 435 | 436 | def sample(self, obs, action_dim, no_log_prob=False): 437 | """ 438 | Sample from independent Beta distributions across each action_dim. 439 | """ 440 | 441 | assert len(obs.shape) == 1, 'obs must be a flat array' 442 | 443 | alphas, betas = self.forward(obs) 444 | alphas, betas = torch.flatten(alphas), torch.flatten(betas) 445 | dists = [ 446 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 447 | ] 448 | action_along_dims = [dist.rsample() for dist in dists] 449 | action = torch.tensor(action_along_dims, requires_grad=True) 450 | log_prob = torch.sum(torch.tensor([ 451 | dist.log_prob(a) for dist, a in zip(dists, action_along_dims) 452 | ], requires_grad=True)) 453 | lb, ub = self.constraint_fn(obs, action_dim) 454 | action = lb + (ub - lb) * action 455 | return action if no_log_prob else (action, log_prob) 456 | 457 | def log_probs(self, obs, actions, action_dim): 458 | alphas_arr, betas_arr = self.forward(obs) 459 | dists = [] 460 | #import pdb; pdb.set_trace() 461 | 462 | #08.28.2023 -vipul making last attempts 463 | alphas_arr_1 = alphas_arr[:,0] 464 | alphas_arr_2 = alphas_arr[:,1] 465 | betas_arr_1 = betas_arr[:,0] 466 | betas_arr_2 = betas_arr[:,1] 467 | try: 468 | dists_1 = td.Beta(alphas_arr_1, betas_arr_1) 469 | except: 470 | import pdb; pdb.set_trace() 471 | 472 | try: 473 | dists_2 = td.Beta(alphas_arr_2, betas_arr_2) 474 | except: 475 | import pdb; pdb.set_trace() 476 | #08.23.2023 -vipul done making last attempts 477 | 478 | for i in range(alphas_arr.shape[0]): 479 | alphas = alphas_arr[i] 480 | betas = betas_arr[i] 481 | dists.append([ 482 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 483 | ]) 484 | 485 | lbs, ubs = self.constraint_fn(obs, action_dim) 486 | if lbs.device!=actions.device: 487 | lbs = lbs.to('cuda:0') 488 | ubs = ubs.to('cuda:0') 489 | actions = (actions - lbs) / (ubs - lbs) 490 | actions = actions.clip(0, 1) 491 | 492 | log_probs = [] 493 | for action, action_dists in zip(actions, dists): 494 | log_probs.append( 495 | torch.sum(torch.tensor([ 496 | dim_dist.log_prob(dim_action) \ 497 | for dim_dist, dim_action in zip(action_dists, action) 498 | ], 
requires_grad=True)) 499 | ) 500 | log_probs = torch.tensor(log_probs, requires_grad=True) 501 | 502 | #08.28.2023 -vipul making last attempts 503 | return_new = dists_1.log_prob(actions[:,0]).flatten() + dists_2.log_prob(actions[:,1]).flatten() 504 | #08.23.2023 -vipul done making last attempts 505 | 506 | #return log_probs #--original return 08.28.2023 507 | 508 | return return_new #--vipul's return 08.28.2023 509 | 510 | def entropy(self, obs): 511 | """ 512 | Returns sum of entropies along each independent action dimension. 513 | """ 514 | alphas_arr, betas_arr = self.forward(obs) 515 | dists = [] 516 | for i in range(alphas_arr.shape[0]): 517 | alphas = alphas_arr[i] 518 | betas = betas_arr[i] 519 | dists.append([ 520 | td.Beta(alpha, beta) for alpha, beta in zip(alphas, betas) 521 | ]) 522 | entropies = torch.tensor( 523 | [torch.sum(torch.tensor([dist.entropy() for dist in dist_list])) \ 524 | for dist_list in dists] 525 | ) 526 | return entropies 527 | 528 | # # TODO: make this consistent with sample & log_probs defined above 529 | # alphas, betas = self.forward(obs) 530 | # dists = td.Beta(alphas, betas) 531 | # return dists.entropy() 532 | 533 | 534 | ### NOTE: this is the BetaPolicyBase giving us the Dirichlet error 535 | # class BetaPolicyBase(PolicyNetwork): 536 | # """ 537 | # Base class for Beta policy. 538 | # 539 | # Desired network needs to be implemented. 540 | # """ 541 | # 542 | # def __init__(self, constraint_fn, action_dim): 543 | # 544 | # super().__init__() 545 | # 546 | # self.constraint_fn = self._vectorize_f(constraint_fn, action_dim) 547 | # self.action_dim = action_dim 548 | # #assert self.action_dim == 1, 'Action dimension must be 1' #-Line commented out--vipul 549 | # 550 | # def _vectorize_f(self, f, action_dim): #--vipul :added action_dim 551 | # """ 552 | # Converts a function f defined on 1D numpy arrays and outputting pairs of 553 | # scalars into a vectorized function accepting batches of 554 | # torch tensorized arrays and output pairs of torch tensors. 555 | # """ 556 | # 557 | # def vectorized_f(obs, action_dim): #--vipul :added action_dim 558 | # 559 | # obs = obs.cpu().detach().numpy() 560 | # 561 | # if len(obs.shape) == 1: # check to see if obs is a batch or single obs 562 | # batch_size = 1 563 | # lbs, ubs = f(obs) 564 | # 565 | # else: 566 | # batch_size = obs.shape[0] 567 | # lbs = np.zeros([batch_size, self.action_dim]) 568 | # ubs = np.zeros([batch_size, self.action_dim]) 569 | # for i in range(batch_size): 570 | # lbs[i], ubs[i] = f(obs[i]) 571 | # 572 | # lbs = torch.FloatTensor(lbs).reshape(batch_size, self.action_dim) 573 | # ubs = torch.FloatTensor(ubs).reshape(batch_size, self.action_dim) 574 | # 575 | # return lbs, ubs 576 | # 577 | # return vectorized_f 578 | # 579 | # def sample(self, obs, action_dim, no_log_prob=False): #sample from interval is taken with a beta distribution -vipul 580 | # #sample definition changed to take in argument action_dim --vipul 581 | # alpha, beta = self.forward(obs) 582 | # dist = td.Beta(alpha, beta) 583 | # action = dist.rsample() 584 | # #log_prob = dist.log_prob(action) #original code 585 | # log_prob_vec = dist.log_prob(action) #vipul 586 | # log_prob = sum(log_prob_vec) 587 | # lb, ub = self.constraint_fn(obs, action_dim) 588 | # action = lb + (ub - lb) * action #need to be changed? --vipul 589 | # #code change nedded! 
-vipul 590 | # return action if no_log_prob else (action, log_prob) 591 | # 592 | # def log_probs(self, obs, actions, action_dim): #function modified 593 | # alphas, betas = self.forward(obs) 594 | # dists = td.Beta(alphas, betas) 595 | # lbs, ubs = self.constraint_fn(obs, action_dim) 596 | # actions = (actions - lbs) / (ubs - lbs) 597 | # #need to be changed? --vipul 598 | # #yes, change needed! 599 | # return sum(dists.log_prob(actions).flatten()) 600 | # 601 | # def entropy(self, obs): 602 | # alphas, betas = self.forward(obs) 603 | # dists = td.Beta(alphas, betas) 604 | # return dists.entropy() 605 | ### 606 | 607 | 608 | class BetaPolicy(BetaPolicyBase): 609 | """ 610 | Beta policy using a two-layer, two-headed MLP with ReLU activation. 611 | """ 612 | 613 | def __init__(self, obs_dim, constraint_fn, action_dim, #vipul : action_dim=1 replaced by just action_dim 614 | hidden_layer1_size=64, 615 | hidden_layer2_size=64): 616 | 617 | super().__init__(constraint_fn, action_dim=action_dim) 618 | 619 | self.base_net = nn.Sequential( 620 | nn.Linear(obs_dim, hidden_layer1_size), 621 | nn.Tanh(), 622 | nn.Linear(hidden_layer1_size, hidden_layer2_size), 623 | nn.Tanh(), 624 | ) 625 | 626 | self.alpha_net = nn.Sequential( 627 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 628 | nn.Softplus(), 629 | ) 630 | 631 | self.beta_net = nn.Sequential( 632 | nn.Linear(hidden_layer2_size, action_dim), #vipul : 1 replaced with action_dim 633 | nn.Softplus(), 634 | ) 635 | 636 | def forward(self, obs): 637 | 638 | x = self.base_net(obs) 639 | # if math.isnan(x[0].item()): 640 | # import pdb; pdb.set_trace() 641 | alpha = 1.0 + self.alpha_net(x) #is there any change needed? --vipul 642 | beta = 1.0 + self.beta_net(x) #is there any change needed? --vipul 643 | 644 | return alpha, beta 645 | 646 | 647 | class CategoricalPolicy(PolicyNetwork): 648 | """ 649 | Base class for categorical policy. 650 | 651 | Desired network needs to be implemented. 652 | """ 653 | 654 | def __init__(self, num_actions): 655 | 656 | super().__init__() 657 | 658 | self.num_actions = num_actions 659 | 660 | def sample(self, obs, no_log_prob=False): 661 | logits = self.forward(obs) 662 | dist = td.Categorical(logits=logits) 663 | action = dist.sample(sample_shape=torch.tensor([1])) 664 | return action if no_log_prob else (action, dist.log_prob(action)) 665 | 666 | def log_probs(self, obs, actions): 667 | dists = td.Categorical(logits=self.forward(obs)) 668 | return dists.log_prob(actions.flatten()) 669 | 670 | def entropy(self, obs): 671 | dists = td.Categorical(logits=self.forward(obs)) 672 | return dists.entropy() 673 | 674 | 675 | class CategoricalPolicyTwoLayer(CategoricalPolicy): 676 | """ 677 | Categorical policy using a fully connected two-layer network. 
678 | """ 679 | 680 | def __init__(self, state_dim, num_actions, 681 | hidden_layer1_size=64, 682 | hidden_layer2_size=64, 683 | init_std=0.001): 684 | 685 | super().__init__(num_actions) 686 | 687 | self.init_std = init_std 688 | 689 | self.linear1 = nn.Linear(state_dim, hidden_layer1_size) 690 | self.linear2 = nn.Linear(hidden_layer1_size, hidden_layer2_size) 691 | self.linear3 = nn.Linear(hidden_layer2_size, num_actions) 692 | nn.init.normal_(self.linear1.weight, std=init_std) 693 | nn.init.normal_(self.linear2.weight, std=init_std) 694 | nn.init.normal_(self.linear3.weight, std=init_std) 695 | 696 | def forward(self, state): 697 | x = F.relu(self.linear1(state)) 698 | x = F.relu(self.linear2(x)) 699 | output = self.linear3(x) 700 | return output 701 | 702 | 703 | class PPOBase: 704 | def __init__(self, 705 | env, 706 | policy, 707 | value_function, 708 | policy_lr, 709 | value_lr, 710 | entropy_coef=0.0, 711 | clip_range=0.2, 712 | n_epochs=10, 713 | batch_size=64, 714 | weight_decay=0.0, 715 | gamma=0.99, 716 | buffer_size=2048, 717 | enable_cuda=True, 718 | policy_optimizer=torch.optim.Adam, 719 | value_optimizer=torch.optim.Adam, 720 | grad_clip_radius=None): 721 | 722 | warnings.warn('This PPO implementation currently contains hacks for ' + \ 723 | 'returning information about CBF-related safety.') 724 | 725 | self.env = env 726 | self.pi = policy 727 | self.v = value_function 728 | self.entropy_coef = entropy_coef 729 | self.clip_range = clip_range 730 | self.n_epochs = n_epochs 731 | self.batch_size = batch_size 732 | 733 | self.__cuda_enabled = enable_cuda 734 | self.enable_cuda(self.__cuda_enabled, warn=False) 735 | # NOTE: self.device is defined when self.enable_cuda is called! 736 | 737 | self.pi_optim = policy_optimizer(self.pi.parameters(), 738 | lr=policy_lr, 739 | weight_decay=weight_decay) 740 | self.v_optim = value_optimizer(self.v.parameters(), lr=value_lr) 741 | self.grad_clip_radius = grad_clip_radius 742 | 743 | self.rollout_buffer = RolloutBuffer( 744 | buffer_size, 745 | env.observation_space, 746 | env.action_space, 747 | device=self.device, 748 | gamma=gamma 749 | ) 750 | 751 | @property 752 | def cuda_enabled(self): 753 | return self.__cuda_enabled 754 | 755 | def enable_cuda(self, enable_cuda=True, warn=True): 756 | """Enable or disable cuda and update models.""" 757 | 758 | if warn: 759 | warnings.warn("Converting models between 'cpu' and 'cuda' after " 760 | "initializing optimizers can give errors when using " 761 | "optimizers other than SGD or Adam!") 762 | 763 | self.__cuda_enabled = enable_cuda 764 | self.device = torch.device( 765 | 'cuda' if torch.cuda.is_available() and self.__cuda_enabled \ 766 | else 'cpu') 767 | self.pi.to(self.device) 768 | self.v.to(self.device) 769 | 770 | def load_models(self, filename, enable_cuda=True, continue_training=True): 771 | """ 772 | Load policy and value functions. Copy them to target functions. 773 | 774 | This method is for evaluation only. Use load_checkpoint to continue 775 | training. 
776 |         """
777 | 
778 |         models = torch.load(filename)
779 | 
780 |         self.pi.load_state_dict(models['pi_state_dict'])
781 |         self.v.load_state_dict(models['v_state_dict'])
782 | 
783 |         self.pi.eval()
784 |         self.v.eval()
785 | 
786 |         self.enable_cuda(enable_cuda, warn=False)
787 | 
788 |     def save_checkpoint(self, filename):
789 |         """Save state_dicts of models and optimizers."""
790 | 
791 |         torch.save({
792 |             'using_cuda': self.__cuda_enabled,
793 |             'pi_state_dict': self.pi.state_dict(),
794 |             'pi_optimizer_state_dict': self.pi_optim.state_dict(),
795 |             'v_state_dict': self.v.state_dict(),
796 |             'v_optimizer_state_dict': self.v_optim.state_dict(),
797 |         }, filename)
798 | 
799 |     def load_checkpoint(self, filename, continue_training=True):
800 |         """Load state_dicts for models and optimizers."""
801 | 
802 |         checkpoint = torch.load(filename)
803 | 
804 |         self.pi.load_state_dict(checkpoint['pi_state_dict'])
805 |         self.pi_optim.load_state_dict(checkpoint['pi_optimizer_state_dict'])
806 |         self.v.load_state_dict(checkpoint['v_state_dict'])
807 |         self.v_optim.load_state_dict(checkpoint['v_optimizer_state_dict'])
808 | 
809 |         if continue_training:
810 |             self.pi.train()
811 |             self.v.train()
812 |         else:
813 |             self.pi.eval()
814 |             self.v.eval()
815 | 
816 |         self.enable_cuda(checkpoint['using_cuda'], warn=False)
817 | 
818 |     def collect_rollout(self, env, rollout_length):
819 |         """
820 |         Perform a rollout and fill the rollout buffer.
821 |         """
822 | 
823 |         self._last_obs = env.reset()
824 |         self._last_episode_start = np.zeros(1)
825 |         n_steps = 0
826 |         self.rollout_buffer.reset()
827 | 
828 |         num_unsafe_steps = 0
829 |         x_t = []
830 |         y_t = []
831 | 
832 |         local_flag_done = False
833 |         while n_steps < rollout_length:
834 | 
835 |             # NOTE: the following unsafe-step counter is pendulum-specific
836 |             # and is disabled for this environment:
837 |             # if env._state[0] < env.theta_safety_bounds[0] - 1e-8 or \
838 |             #         env._state[0] > env.theta_safety_bounds[1] + 1e-8:
839 |             #     num_unsafe_steps += 1
840 | 
841 |             action_dim = get_action_dim(env.action_space)
842 |             # the policy's sample method expects the action dimension
843 | 
844 |             with torch.no_grad():
845 |                 # Convert to pytorch tensor or to TensorDict
846 |                 obs_tensor = obs_as_tensor(self._last_obs, self.device).float()
847 |                 action, log_prob = self.pi.sample(obs_tensor, action_dim)
848 |                 value = self.v(obs_tensor)
849 |             action = action.cpu().numpy()
850 | 
851 |             # Rescale and perform action
852 |             clipped_action = action
853 |             # Clip the actions to avoid out of bound error
854 |             if isinstance(self.env.action_space, gym.spaces.Box):
855 |                 clipped_action = np.clip(action, self.env.action_space.low,
856 |                                          self.env.action_space.high)
857 |             elif isinstance(self.env.action_space, gym.spaces.Discrete):
858 |                 clipped_action = int(clipped_action)
859 | 
860 |             new_obs, reward, done, info = env.step(clipped_action)
861 | 
862 |             # record the (x, y) trajectory for plotting and flag near-collisions
863 |             x_t.append(new_obs[0])
864 |             y_t.append(new_obs[1])
865 |             if abs(new_obs[0]-env.obstacle[0]) < 0.1 and abs(new_obs[1]-env.obstacle[1]) < 0.1:
866 |                 print("crash")
867 |                 #break
868 | 
869 |             n_steps += 1
870 | 
871 |             if isinstance(self.env.action_space, gym.spaces.Discrete):
872 |                 # Reshape in case of discrete action
873 |                 action = action.reshape(-1, 1)
874 | 
875 |             self.rollout_buffer.add(self._last_obs, action, reward,
876 |                                     self._last_episode_start, value, log_prob)
877 |             self._last_obs = new_obs.flatten()
878 |             self._last_episode_start = done
879 | 
880 |             if n_steps == rollout_length:
881 |                 env.reset()
882 |             # if done == 0 and local_flag_done == False:
883 |             #     x_t.append(new_obs[0])
884 |             # 
y_t.append(new_obs[1]) 885 | 886 | # elif done == 1 and n_steps>1: 887 | # local_flag_done = True 888 | 889 | 890 | 891 | # with open('my_array.csv', 'w', newline='') as file: 892 | # writer = csv.writer(file) 893 | # writer.writerow(x_t) 894 | # writer.writerow(y_t) 895 | #code for plotting the quadrotor trajectory in an episode 896 | # plt.xlim(-200,200) 897 | # plt.ylim(-200,200) 898 | # plt.plot(x_t, y_t, color = 'red') 899 | # plt.pause(1) 900 | plt.xlim(np.double(env.min_x),np.double(env.max_x)) 901 | plt.ylim(np.double(env.min_y),np.double(env.max_y)) 902 | plt.xlabel('X axis') 903 | plt.ylabel('Y-axis') 904 | plt.plot(x_t,y_t) 905 | plt.plot(env.goal[0],env.goal[1],marker='o',color='red') 906 | plt.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 907 | 908 | 909 | def f(x, y, xa, yb, a, b): 910 | return (x - xa)**4/a**4 + (y - yb)**4/b**4 911 | 912 | # Define the point around which to plot 913 | xa, yb = env.obstacle[0], env.obstacle[1] 914 | 915 | # Define the range of x and y values to plot 916 | x_vals = np.linspace(xa - env.a_d, xa + env.a_d, 100) 917 | y_vals = np.linspace(yb - env.b_d, yb + env.b_d, 100) 918 | 919 | # Create a grid of x and y values 920 | X, Y = np.meshgrid(x_vals, y_vals) 921 | 922 | # Evaluate the function at each point in the grid 923 | Z = f(X, Y, xa, yb, env.a_d, env.b_d) 924 | 925 | # Plot the function as a contour plot 926 | 927 | #ADDED - 09.1.23 928 | #plt.savefig(f"{env.date}_run={env.run}_device={env.device_run}_cbf={env.env_cbf}_lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}_roll={rollout_length}.png") 929 | ##Create a folder in the current directory 930 | folder_name_main = f"{{{env.date}}}" 931 | os.makedirs(folder_name_main, exist_ok=True) 932 | ##Change the current working directory to the newly created folder 933 | os.chdir(folder_name_main) 934 | 935 | folder_name = f"{{run={env.run}_dt={env.dt}_device={env.device_run}_cbf={env.env_cbf}_roll={rollout_length}}}" 936 | os.makedirs(folder_name, exist_ok=True) 937 | ##Change the current working directory to the newly created folder 938 | os.chdir(folder_name) 939 | 940 | #folder_name_0 = f"{{run={env.run}}}" 941 | 942 | 943 | folder_name_1 = f"{{lr={env.lr}_entr={env.entropy}_umin={env.umin[0]}_umax={env.umax[0]}_lyr=batch={env.layer_size}}}" 944 | os.makedirs(folder_name_1, exist_ok=True) 945 | os.chdir(folder_name_1) 946 | #ADDED - 09.1.23 947 | with open(f"episode={env.episodes}.csv", 'w', newline='') as file: 948 | writer = csv.writer(file) 949 | writer.writerow(x_t) 950 | writer.writerow(y_t) 951 | if (env.episodes)%1 == 0: 952 | #plt.text(10, 12, f"{env.episodes}", fontsize=10, color='blue', ha='right') 953 | plt.savefig(f"ep={env.episodes}.png") 954 | plt.contour(X, Y, Z, levels=[env.safety_dist]) 955 | 956 | # Return to the original working directory (optional) 957 | os.chdir('..') 958 | os.chdir('..') 959 | os.chdir('..') 960 | 961 | #quadrotor.plot_step(new_obs, u_hat_acc, state_hist, plot_handle) 962 | 963 | # #creating animation -08.31.23 964 | # fig, ax = plt.subplots() 965 | # line, = ax.plot([], [], lw=2) 966 | # ax.contour(X, Y, Z, levels=[env.safety_dist]) 967 | # ax.plot(env.goal[0],env.goal[1],marker='o',color='red') 968 | # ax.plot(env.obstacle[0],env.obstacle[1],marker='*',color='black') 969 | # def init(): 970 | # ax.set_xlim(-1, 14) 971 | # ax.set_ylim(-1, 14) 972 | # return line, 973 | # def update(frame): 974 | # line.set_data(x_t[frame], y_t[frame]) 975 | # return line, 976 | # ani = FuncAnimation(fig, 
update, frames=len(x_t), init_func=init, blit=True)
977 |         # ani.save('1st_sep_trajectory_animation.gif', writer='pillow')
978 | 
979 |         self.rollout_buffer.compute_returns_and_advantage(last_value=value,
980 |                                                            done=done)
981 | 
982 |         safety_rate = 100 * (1 - num_unsafe_steps / rollout_length)
983 | 
984 |         # return np.sum(self.rollout_buffer.rewards) / np.sum(
985 |         #     self.rollout_buffer.episode_starts), safety_rate
986 |         return np.sum(self.rollout_buffer.rewards), safety_rate
987 | 
988 |     def train(self):
989 |         """
990 |         Train on the current rollout buffer.
991 |         """
992 |         #action_dim = get_action_dim(self.action_space)
993 |         for epoch in range(self.n_epochs):
994 | 
995 |             # Do a complete pass on the rollout buffer
996 |             for rollout_data in self.rollout_buffer.get(self.batch_size):
997 | 
998 |                 actions = rollout_data.actions
999 |                 obs = rollout_data.observations
1000 |                 values = self.v(obs).flatten()
1001 |                 # log_probs expects the action dimension as its final
1002 |                 # argument (cf. the Beta policy classes above)
1003 |                 log_probs = self.pi.log_probs(obs, actions, actions.shape[1])
1004 | 
1005 | 
1006 | 
1007 |                 entropies = self.pi.entropy(obs)
1008 |                 if log_probs.device != actions.device:
1009 |                     log_probs = log_probs.to(actions.device)
1010 |                     entropies = entropies.to(actions.device)
1011 |                 advantages = rollout_data.advantages
1012 |                 advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
1013 | 
1014 |                 # ratio between old and new policy, should be one at the first iteration
1015 |                 ratio = torch.exp(log_probs - rollout_data.old_log_prob)
1016 | 
1017 |                 # if ratio[0] > 1.001:
1018 |                 #     print(ratio)
1019 |                 #     import pdb; pdb.set_trace()
1020 | 
1021 |                 # clipped surrogate objective (a standalone sketch of this loss appears after the end of this file)
1022 |                 policy_loss_1 = advantages * ratio
1023 |                 policy_loss_2 = advantages * torch.clamp(ratio,
1024 |                                                          1 - self.clip_range,
1025 |                                                          1 + self.clip_range)
1026 |                 policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean() - \
1027 |                     self.entropy_coef * entropies.mean()
1028 | 
1029 |                 self.pi_optim.zero_grad()
1030 |                 policy_loss.backward()
1031 |                 # Clip grad norm
1032 |                 if self.grad_clip_radius is not None:
1033 |                     torch.nn.utils.clip_grad_norm_(self.pi.parameters(),
1034 |                                                    self.grad_clip_radius)
1035 |                 self.pi_optim.step()
1036 | 
1037 |                 value_loss = F.mse_loss(rollout_data.returns, values)
1038 | 
1039 |                 self.v_optim.zero_grad()
1040 |                 value_loss.backward()
1041 |                 # Clip grad norm
1042 |                 if self.grad_clip_radius is not None:
1043 |                     torch.nn.utils.clip_grad_norm_(self.v.parameters(),
1044 |                                                    self.grad_clip_radius)
1045 |                 self.v_optim.step()
1046 | 
1047 |         # import pdb; pdb.set_trace()
1048 | 
1049 | # NOTE: the following definition is copied from the stable-baselines3 source for reference and does not belong to this code
1050 | # def obs_as_tensor(
1051 | #     obs: Union[np.ndarray, Dict[Union[str, int], np.ndarray]], device: th.device
1052 | # ) -> Union[th.Tensor, TensorDict]:
1053 | #     """
1054 | #     Moves the observation to the given device.
1055 | 
1056 | #     :param obs:
1057 | #     :param device: PyTorch device
1058 | #     :return: PyTorch tensor of the observation on a desired device.
1059 | #     """
1060 | #     if isinstance(obs, np.ndarray):
1061 | #         return th.as_tensor(obs, device=device)
1062 | #     elif isinstance(obs, dict):
1063 | #         return {key: th.as_tensor(_obs, device=device) for (key, _obs) in obs.items()}
1064 | #     else:
1065 | #         raise Exception(f"Unrecognized type of observation {type(obs)}")
1066 | 
--------------------------------------------------------------------------------
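Supplementary sketch (not part of the repository): the Beta policy's log_probs and entropy methods above sum per-dimension terms either by hard-coding two action dimensions (dists_1 and dists_2) or by looping over Python lists of td.Beta objects. Assuming batched alpha/beta tensors of shape (batch_size, action_dim), as produced by BetaPolicy.forward, torch.distributions.Independent yields the same per-batch sums for any action dimension; all names and shapes below are illustrative.

# Illustrative sketch: per-batch sums of per-dimension Beta log-probabilities
# and entropies via torch.distributions.Independent.
import torch
import torch.distributions as td

batch_size, action_dim = 4, 2
alphas = 1.0 + torch.rand(batch_size, action_dim)   # placeholder network outputs
betas = 1.0 + torch.rand(batch_size, action_dim)

# Independent treats the last dimension as an event dimension, so log_prob
# and entropy return one value per batch element (summed over action dims).
dist = td.Independent(td.Beta(alphas, betas), 1)

actions = dist.sample()             # shape (batch_size, action_dim), values in (0, 1)
log_probs = dist.log_prob(actions)  # shape (batch_size,)
entropies = dist.entropy()          # shape (batch_size,), sum of per-dimension entropies

print(log_probs.shape, entropies.shape)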
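Supplementary sketch (not part of the repository): the commented-out BetaPolicyBase above repeatedly asks whether rescaling a Beta sample to [lb, ub] via action = lb + (ub - lb) * x calls for a matching change to the log-probability. Under the usual change of variables it does: each dimension contributes an extra -log(ub - lb). The snippet below checks this numerically against torch.distributions.TransformedDistribution; the alpha, beta, lb, and ub values are arbitrary placeholders.

# Illustrative sketch: log-probability of an affinely rescaled Beta sample,
# checked against TransformedDistribution.
import torch
import torch.distributions as td
from torch.distributions.transforms import AffineTransform

alpha = torch.tensor([1.5, 2.0])
beta = torch.tensor([2.5, 1.2])
lb = torch.tensor([-10.0, -10.0])   # placeholder lower bounds
ub = torch.tensor([10.0, 10.0])     # placeholder upper bounds

base = td.Beta(alpha, beta)
x = base.rsample()                  # sample on (0, 1) per dimension
action = lb + (ub - lb) * x         # rescaled action, as in BetaPolicyBase.sample

# Change of variables: log p(action) = log p_Beta(x) - sum(log(ub - lb)).
manual_log_prob = base.log_prob(x).sum() - torch.log(ub - lb).sum()

# The same quantity via torch's TransformedDistribution.
rescaled = td.TransformedDistribution(base, [AffineTransform(loc=lb, scale=ub - lb)])
auto_log_prob = rescaled.log_prob(action).sum()

print(torch.allclose(manual_log_prob, auto_log_prob))  # True up to floating-point error

Because the bounds depend only on the observation, this additive correction cancels inside the PPO probability ratio computed in train, but it does affect absolute log-densities and entropies compared across states.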
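Supplementary sketch (not part of the repository): the clipped surrogate minimized in PPOBase.train, written as a standalone function so the ratio/clipping logic can be read in isolation. The function name and its arguments are placeholders; it assumes the same conventions as the training loop (normalized advantages, entropy bonus weighted by entropy_coef).

# Illustrative sketch: the clipped PPO policy loss as computed in PPOBase.train.
import torch


def clipped_ppo_loss(log_probs, old_log_probs, advantages, entropies,
                     clip_range=0.2, entropy_coef=0.0):
    """Return the (to-be-minimized) clipped PPO policy loss."""
    # Normalize advantages, as done in PPOBase.train.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # Probability ratio between the new and old policy; equals 1 on the
    # first pass over a freshly collected rollout buffer.
    ratio = torch.exp(log_probs - old_log_probs)

    # Pessimistic (elementwise minimum) of the unclipped and clipped
    # objectives, negated to obtain a loss, minus the entropy bonus.
    unclipped = advantages * ratio
    clipped = advantages * torch.clamp(ratio, 1 - clip_range, 1 + clip_range)
    return -torch.min(unclipped, clipped).mean() - entropy_coef * entropies.mean()


if __name__ == '__main__':
    # Tiny smoke test with random tensors.
    n = 8
    loss = clipped_ppo_loss(torch.randn(n), torch.randn(n),
                            torch.randn(n), torch.rand(n))
    print(loss.item())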