├── .gitignore
├── README.md
├── algorithms
│   ├── cmbpo.py
│   ├── rl_algorithm.py
│   └── utils.py
├── buffers
│   ├── cpobuffer.py
│   ├── modelbuffer.py
│   └── utils.py
├── cmbpo.yml
├── configs
│   ├── baseconfig
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── main.py
│   │   └── utils.py
│   ├── cmbpo_antsafe.py
│   ├── cmbpo_hcs.py
│   ├── cmbpo_hs.py
│   ├── cpo_hcs.py
│   └── trpo_hcs.py
├── envs
│   ├── __init__.py
│   ├── mujoco_safety_gym
│   │   ├── __init__.py
│   │   └── envs
│   │       ├── __init__.py
│   │       ├── ant.py
│   │       ├── ant_viz.py
│   │       ├── assets
│   │       │   ├── ant.xml
│   │       │   ├── ant_viz.xml
│   │       │   ├── fetch
│   │       │   │   ├── pick_and_place.xml
│   │       │   │   ├── push.xml
│   │       │   │   ├── reach.xml
│   │       │   │   ├── robot.xml
│   │       │   │   ├── shared.xml
│   │       │   │   └── slide.xml
│   │       │   ├── half_cheetah.xml
│   │       │   ├── hopper.xml
│   │       │   ├── humanoid.xml
│   │       │   └── textures
│   │       │       ├── block.png
│   │       │       └── block_hidden.png
│   │       ├── fetch
│   │       │   ├── pick_and_place.py
│   │       │   ├── push.py
│   │       │   ├── reach.py
│   │       │   └── slide.py
│   │       ├── fetch_env.py
│   │       ├── half_cheetah.py
│   │       ├── hopper.py
│   │       ├── humanoid.py
│   │       ├── mujoco_env.py
│   │       └── robot_env.py
│   ├── utils.py
│   └── wrappers
│       ├── __init__.py
│       └── normalize_action.py
├── models
│   ├── base_model.py
│   ├── fake_env.py
│   ├── pens
│   │   ├── __init__.py
│   │   ├── fc.py
│   │   ├── logger.py
│   │   ├── pe.py
│   │   ├── pe_factory.py
│   │   └── utils.py
│   └── statics.py
├── network
│   └── ac_network.py
├── policies
│   ├── base_policy.py
│   ├── cpo_policy.py
│   └── utils.py
├── requirements.txt
├── samplers
│   ├── __init__.py
│   ├── base_sampler.py
│   ├── cpo_sampler.py
│   ├── model_sampler.py
│   ├── simple_sampler.py
│   └── utils.py
├── scripts
│   ├── console_scripts.py
│   └── run.py
├── setup.py
└── utilities
    ├── instrument.py
    ├── logging.py
    ├── logx.py
    ├── mpi_tf.py
    ├── mpi_tools.py
    ├── serialization_utils.py
    ├── trust_region.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | *.stl
3 | 
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 | 
9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 | /environment/src/
31 | /src/
32 | /softlearning/environments/rllab/
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | # soft learning specific things 111 | *.swp 112 | .idea 113 | *.mp4 114 | data/ 115 | vis/ 116 | tmp/ 117 | vendor/* 118 | .pkl 119 | 120 | 121 | .mujoco/ 122 | .vscode/ 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Constrained Model-Based Policy Optimization 2 | 3 |
7 | 
8 | This repository contains code for Constrained Model-Based Policy Optimization (CMBPO), a model-based version of Constrained Policy Optimization (Achiam et al.). Installation, usage, and code examples for reproducing the experiments described in [Safe Continuous Control with Constrained Model-Based Policy Optimization](https://arxiv.org/abs/2104.06922?context=cs) are provided below.
9 | 
10 | # Prerequisites
11 | 
12 | 1. The simulation experiments using [mujoco-py](https://github.com/openai/mujoco-py) require a working install of [MuJoCo 2.0](https://www.roboti.us/license.html) and a valid license.
13 | 2. We use conda environments for the installation (tested on conda 4.6 - 4.10); please refer to [Anaconda](https://docs.anaconda.com/anaconda/install/) for instructions.
14 | 
15 | # Installation
16 | 
17 | 1. Clone this repository
18 | ```
19 | git clone https://github.com/anyboby/Constrained-Model-Based-Policy-Optimization.git
20 | ```
21 | 2. Create a conda environment using the cmbpo yml-file
22 | ```sh
23 | cd Constrained-Model-Based-Policy-Optimization/
24 | conda env create -f cmbpo.yml
25 | conda activate cmbpo
26 | pip install -e .
27 | ```
28 | This should create a conda environment named 'cmbpo' with the necessary packages and modules. The number of required modules is small, so in case of installation trouble it is worth taking a look at the [cmbpo.yml](cmbpo.yml) and [requirements.txt](requirements.txt) files.
29 | 
30 | # Usage
31 | To start an experiment with cmbpo, run
32 | ```sh
33 | cmbpo run_local configs.baseconfig --config=configs.cmbpo_hcs --gpus=1 --trial-gpus=1
34 | ```
35 | 
36 | `--config` specifies the configuration file for the experiment (here: CMBPO for HalfCheetahSafe)\
37 | `--gpus` specifies the number of GPUs to use
38 | 
39 | A list of all available flags is provided in [baseconfig/utils](configs/baseconfig/utils.py). As of writing, only local execution is supported. For further options, refer to the ray documentation.
40 | 
41 | The `cmbpo` command uses the [console scripts](scripts/console_scripts.py) as an entry point for running experiments. A simple workflow for running experiments with ray-tune is illustrated in [run.py](scripts/run.py), which can be executed with
42 | ```sh
43 | python scripts/run.py configs.cmbpo_hcs
44 | ```
45 | 
46 | ## Algorithms
47 | Constrained Model-Based Policy Optimization combines Constrained Policy Optimization with model-based data augmentation and reconciles constraint satisfaction with the model errors this introduces.
48 | 
49 | This repository can therefore also be used to run experiments with model-free versions of Constrained Policy Optimization and Trust-Region Policy Optimization by configuring the `use_model` and `constrain_cost` flags accordingly in the experiment configurations (see [CPO - HalfCheetahSafe](configs/cpo_hcs.py) and [TRPO - HalfCheetahSafe](configs/trpo_hcs.py)):
50 | ```py
51 | 'use_model': False, # set to True for model-based
52 | 'constrain_cost': False, # set to True for cost-constrained optimization
53 | ```
54 | 
55 | ## Adding new environments and running custom experiments
56 | Different environments can be tested by creating a config file in the [configs](configs/) directory. OpenAI Gym environments can be loaded directly with the corresponding parameters, for example:
57 | ```py
58 | 'universe': 'gym',
59 | 'task': 'HalfCheetahSafe-v2',
60 | ```
61 | Environments from other sources require an entry in the `ENVS_FUNCTIONS` dict in the [environment utils](envs/utils.py) that specifies how to create an instance of the environment. For example, the Gym environments are specified with the following entries:
62 | ```py
63 | def get_gym_env():
64 |     import gym
65 | 
66 |     return gym.make
67 | 
68 | ENVS_FUNCTIONS = {
69 |     'gym':get_gym_env()
70 | }
71 | ```
72 | 
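An environment from another source can be registered analogously. The following is a minimal sketch that assumes a hypothetical `my_envs` package and a `'my_universe'` key (both purely illustrative); the entry only has to provide a callable that builds the environment for a given task, analogous to `gym.make` above:
```py
def get_custom_env():
    from my_envs import MyCustomEnv  # hypothetical package, for illustration only

    # return a callable that maps a task name to an environment instance
    return lambda task, **kwargs: MyCustomEnv(task, **kwargs)

ENVS_FUNCTIONS = {
    'gym': get_gym_env(),
    'my_universe': get_custom_env(),  # selected via 'universe': 'my_universe' in a config
}
```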
73 | ## Model-Learning with custom environments
74 | When using a model with custom environments, the model requires a few interfaces to function with the provided code. A learned (or handcrafted) model should inherit from the [base model](models/base_model.py) and specify whether rewards, costs, and termination functions are predicted alongside the dynamics.
75 | 
76 | By default our algorithm learns to predict rewards but assumes handcrafted cost- and termination-functions `c(s,a,s')` and `t(s,a,s')`. When adding a new environment, these functions should be defined (if not provided by the model) in the [statics](models/statics.py) file. For example, a default termination function that continues episodes for all states looks like this:
77 | ```py
78 | def no_done(obs, act, next_obs):
79 |     assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
80 | 
81 |     done = np.zeros(shape=obs.shape[:-1], dtype=np.bool) # always false
82 |     done = done[...,None]
83 |     return done
84 | ```
85 | The static functions should then be linked to the environment's task name, such that the [Fake Environment](models/fake_env.py) correctly discovers them:
86 | ```py
87 | TERMS_BY_TASK = {
88 |     'default':no_done,
89 |     'HalfCheetah-v2':no_done,
90 | }
91 | ```
92 | 
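Handcrafted cost functions follow the same pattern as the termination functions above. Below is a minimal sketch for a hypothetical velocity limit; the threshold, the observation index, and the `COSTS_BY_TASK` registry name are assumptions for illustration only; the actual interfaces and registries are defined in [statics](models/statics.py):
```py
import numpy as np

def velocity_cost(obs, act, next_obs):
    # illustrative only: cost of 1 whenever a (hypothetical) velocity
    # entry at index 0 of the next observation exceeds a threshold
    assert len(obs.shape) == len(next_obs.shape) == len(act.shape)

    cost = (np.abs(next_obs[..., 0]) > 1.0).astype(np.float32)
    cost = cost[..., None]
    return cost

COSTS_BY_TASK = {  # assumed registry name, analogous to TERMS_BY_TASK
    'default': velocity_cost,
}
```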
93 | ## Hyperparameters
94 | Hyperparameters for a new experiment can be defined in the [configs](configs/) folder. Our config files generally follow this structure:
95 | ```py
96 | params = {
97 |     'universe': 'gym',
98 |     'task': 'HalfCheetahSafe-v2',
99 |     'algorithm_params': {...},
100 |     'policy_params':{...},
101 |     'buffer_params': {...},
102 |     'sampler_params': {...},
103 |     'run_params': {...},
104 | }
105 | ```
106 | Parameters specified in a config file overwrite the [base config](configs/baseconfig/base.py) file. For new algorithms or a new suite of environments, it might be practical to directly change the base config.
107 | 
108 | In addition to model parameters and policy parameters, the main parameters of concern in CMBPO define the rollout and sampling behavior of the algorithm.
109 | ```py
110 | 'n_initial_exploration_steps': int(10000), ### number of initial exploration steps for model-learning and
111 |                                            #   determining uncertainty calibration measurements
112 | 'sampling_alpha': 2,                       ### temperature for Boltzmann sampling
113 | 'rollout_mode' : 'uncertainty',            ### model rollouts terminate based on per-step uncertainty
114 | 'rollout_schedule': [10, 500, 5, 30],      ### if rollout_mode:'schedule' this schedule is defined as
115 |                                            #   [min_epoch, max_epoch, min_horizon, max_horizon]
116 |                                            ##  if rollout_mode:'uncertainty', 'min_horizon' is used as
117 |                                            #   the initial rollout horizon and adapted throughout
118 |                                            #   training based on per-step uncertainty estimates
119 |                                            #   (KL-Divergence).
120 | 'batch_size_policy': 50000,                ### batch size per policy update
121 | 'initial_real_samples_per_epoch': 1500,    ### initial number of real samples per policy update,
122 |                                            #   adapted throughout training based on average uncertainty
123 |                                            #   estimates (mean KL-Divergence).
124 | 'min_real_samples_per_epoch': 500,         ### absolute minimum number of real samples per policy update
125 | ```
126 | ## Logging
127 | A range of measurements is logged automatically in tensorboard, and the parameter configuration is saved as a JSON file. The location for summaries and checkpoints can be defined by specifying a `'log_dir'` in the configuration files. By default, this location will be set to `'~/ray_cmbpo/{env-task}/defaults/{seed}'` and can be accessed with tensorboard by
128 | ```sh
129 | tensorboard --logdir ~/ray_cmbpo//defaults/
130 | ```
131 | 
132 | # Acknowledgments
133 | Several sections of this repository contain code from other repositories, notably from [Tuomas Haarnoja](https://scholar.google.com/citations?user=VT7peyEAAAAJ&hl=en), [Kristian Hartikainen's softlearning](https://github.com/rail-berkeley/softlearning), [Michael Janner's MBPO](https://github.com/JannerM/mbpo), [Kurtland Chua's handful-of-trials](https://github.com/kchua/handful-of-trials), and CPO by [Joshua Achiam and Alex Ray](https://github.com/openai/safety-starter-agents).
134 | 
--------------------------------------------------------------------------------
/algorithms/rl_algorithm.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from collections import OrderedDict
3 | from itertools import count
4 | import gtimer as gt
5 | import math
6 | import os
7 | import pdb
8 | 
9 | import tensorflow as tf
10 | import numpy as np
11 | 
12 | from utilities.utils import save_video
13 | 
14 | 
15 | class RLAlgorithm(tf.contrib.checkpoint.Checkpointable):
16 |     """Abstract RLAlgorithm.
17 | 
18 |     Implements the _train and _evaluate methods to be used
19 |     by classes inheriting from RLAlgorithm.
20 |     """
21 | 
22 |     def __init__(
23 |             self,
24 |             sampler,
25 |             n_epochs=int(10e7),
26 |             n_initial_exploration_steps=0,
27 |             initial_exploration_policy=None,
28 |             epoch_length=1000,
29 |             eval_n_episodes=10,
30 |             eval_deterministic=True,
31 |             eval_render_mode=None,
32 |             video_save_frequency=0,
33 |             session=None,
34 |     ):
35 |         """
36 |         Args:
37 |             n_epochs (`int`): Number of epochs to run the training for.
38 |             n_initial_exploration_steps: Number of steps in the beginning to
39 |                 take using actions drawn from a separate exploration policy.
40 |             initial_exploration_policy: policy to follow during initial
41 |                 exploration hook
42 |             epoch_length (`int`): Epoch length.
43 |             eval_n_episodes (`int`): Number of rollouts to evaluate.
44 |             eval_deterministic (`bool`): Whether or not to run the policy in
45 |                 deterministic mode when evaluating policy.
46 |             eval_render_mode (`str`): Mode to render evaluation rollouts in.
47 |                 None to disable rendering.
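            video_save_frequency (`int`): If greater than 0, evaluation
                rollouts are rendered to `rgb_array` frames and saved as
                videos at this frequency.
            session (`tf.Session`): TensorFlow session to use; defaults to
                the current `tf.keras` backend session.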
48 | """ 49 | self.sampler = sampler 50 | 51 | self._n_epochs = n_epochs 52 | self._epoch_length = epoch_length 53 | self._n_initial_exploration_steps = n_initial_exploration_steps 54 | self._initial_exploration_policy = initial_exploration_policy 55 | 56 | self._eval_n_episodes = eval_n_episodes 57 | self._eval_deterministic = eval_deterministic 58 | self._video_save_frequency = video_save_frequency 59 | 60 | if self._video_save_frequency > 0: 61 | assert eval_render_mode != 'human', ( 62 | "RlAlgorithm cannot render and save videos at the same time") 63 | self._eval_render_mode = 'rgb_array' 64 | else: 65 | self._eval_render_mode = eval_render_mode 66 | 67 | self._session = session or tf.keras.backend.get_session() 68 | 69 | self._epoch = 0 70 | self._timestep = 0 71 | self._num_train_steps = 0 72 | 73 | def _initial_exploration_hook(self, env, initial_exploration_policy, pool): 74 | if self._n_initial_exploration_steps < 1: return 75 | 76 | if not initial_exploration_policy: 77 | raise ValueError( 78 | "Initial exploration policy must be provided when" 79 | " n_initial_exploration_steps > 0.") 80 | 81 | self.sampler.initialize(env, initial_exploration_policy, pool) 82 | while pool.size < self._n_initial_exploration_steps: 83 | self.sampler.sample() 84 | 85 | def _training_before_hook(self): 86 | """Method called before the actual training loops.""" 87 | pass 88 | 89 | def _training_after_hook(self): 90 | """Method called after the actual training loops.""" 91 | pass 92 | 93 | def _timestep_before_hook(self, *args, **kwargs): 94 | """Hook called at the beginning of each timestep.""" 95 | pass 96 | 97 | def _timestep_after_hook(self, *args, **kwargs): 98 | """Hook called at the end of each timestep.""" 99 | pass 100 | 101 | def _epoch_before_hook(self): 102 | """Hook called at the beginning of each epoch.""" 103 | self._train_steps_this_epoch = 0 104 | 105 | def _epoch_after_hook(self, *args, **kwargs): 106 | """Hook called at the end of each epoch.""" 107 | pass 108 | 109 | def _training_batch(self, batch_size=None): 110 | return self.sampler.random_batch(batch_size) 111 | 112 | def _evaluation_batch(self, *args, **kwargs): 113 | return self._training_batch(*args, **kwargs) 114 | 115 | @property 116 | def _training_started(self): 117 | return self._total_timestep > 0 118 | 119 | @property 120 | def _total_timestep(self): 121 | total_timestep = self._epoch * self._epoch_length + self._timestep 122 | return total_timestep 123 | 124 | def _train(self): 125 | """Return a generator that performs RL training. 
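
        Implementations are expected to yield a diagnostics dict once per
        epoch; `ExperimentRunner._train` advances the generator with
        `next()` and returns the diagnostics to ray.tune.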
126 |         """
127 |         raise NotImplementedError
128 | 
129 |     @abc.abstractmethod
130 |     def get_diagnostics(self):
131 |         raise NotImplementedError
132 | 
133 |     @property
134 |     def ready_to_train(self):
135 |         return self.sampler.batch_ready()
136 | 
137 |     def _do_sampling(self, timestep):
138 |         return self.sampler.sample()
139 | 
140 |     @property
141 |     def tf_saveables(self):
142 |         return {}
143 | 
144 |     def __getstate__(self):
145 |         state = {
146 |             '_epoch_length': self._epoch_length,
147 |             '_epoch': (
148 |                 self._epoch + int(self._timestep >= self._epoch_length)),
149 |             '_timestep': self._timestep % self._epoch_length,
150 |         }
151 | 
152 |         return state
153 | 
154 |     def __setstate__(self, state):
155 |         self.__dict__.update(state)
156 | 
--------------------------------------------------------------------------------
/algorithms/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from dotmap import DotMap
3 | from collections import OrderedDict
4 | 
5 | def create_CMBPO_algorithm(variant, *args, **kwargs):
6 |     from algorithms.cmbpo import CMBPO
7 |     algorithm = CMBPO(*args, **kwargs)
8 | 
9 |     return algorithm
10 | 
11 | 
12 | ALGORITHM_CLASSES = {
13 |     'CMBPO': create_CMBPO_algorithm,
14 | }
15 | 
16 | 
17 | def get_algorithm_from_params(variant,
18 |                               *args,
19 |                               **kwargs):
20 |     algorithm_params = variant['algorithm_params']
21 |     algorithm_type = algorithm_params['type']
22 |     algorithm_kwargs = deepcopy(algorithm_params['kwargs'])
23 |     # @anyboby, workaround for local_example_debug mode: for some reason we get a DotMap
24 |     # instead of an OrderedDict as algorithm_kwargs, which doesn't work with double-asterisk unpacking!
25 |     if isinstance(algorithm_kwargs, DotMap):
26 |         algorithm_kwargs = algorithm_kwargs.toDict()
27 | 
28 |     algorithm = ALGORITHM_CLASSES[algorithm_type](
29 |         variant, *args, **algorithm_kwargs, **kwargs)
30 | 
31 |     return algorithm
32 | 
--------------------------------------------------------------------------------
/buffers/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | 
3 | def get_cpobuffer(env, *args, **kwargs):
4 |     from buffers.cpobuffer import CPOBuffer
5 | 
6 |     buffer = CPOBuffer(
7 |         *args,
8 |         observation_space=env.observation_space,
9 |         action_space=env.action_space,
10 |         **kwargs)
11 | 
12 |     return buffer
13 | 
14 | BUFFER_FUNCTIONS = {
15 |     'CPOBuffer': get_cpobuffer,
16 | }
17 | 
18 | def get_buffer_from_params(params, env, *args, **kwargs):
19 |     buffer_params = params['buffer_params']
20 |     buffer_type = buffer_params['type']
21 |     buffer_kwargs = deepcopy(buffer_params['kwargs'])
22 | 
23 |     buffer = BUFFER_FUNCTIONS[buffer_type](
24 |         env,
25 |         *args,
26 |         **buffer_kwargs,
27 |         **kwargs)
28 | 
29 |     return buffer
30 | 
--------------------------------------------------------------------------------
/cmbpo.yml:
--------------------------------------------------------------------------------
1 | name: cmbpo
2 | channels:
3 |   - anaconda
4 |   - defaults
5 | dependencies:
6 |   - click=7.0
7 |   - matplotlib=3.3.4
8 |   - mpi4py=3.0.3
9 |   - pip=21.0.1
10 |   - python=3.6.12
11 |   - requests=2.20.1
12 |   - tensorflow-gpu=1.14.0
13 |   - pip:
14 |     - conda-env-export==0.3.2
15 |     - dotmap==1.3.8
16 |     - gtimer==1.0.0b5
17 |     - gym==0.18.0
18 |     - joblib==0.14.1
19 |     - mkl-fft==1.2.0
20 |     - mkl-random==1.1.0
21 |     - mkl-service==2.3.0
22 |     - mujoco-py==2.0.2.13
23 |     - olefile==0.46
24 |     - pyOpenSSL==19.1.0
25 |     - PySocks==1.7.1
26 |     - PyYAML==5.4.1
27 |     - ray==0.6.4
28 | - sip==4.19.24 29 | - tensorflow==1.14.0 30 | - tornado==6.0.4 31 | prefix: /home/mo/anaconda3/envs/cmbpo_test2 32 | -------------------------------------------------------------------------------- /configs/baseconfig/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides functions that are utilized by the command line interface. 2 | 3 | In particular, the examples are exposed to the command line interface 4 | (defined in `scripts.console_scripts`) through the 5 | `get_trainable_class`, `get_variant_spec`, and `get_parser` functions. 6 | """ 7 | 8 | 9 | def get_trainable_class(*args, **kwargs): 10 | from .main import ExperimentRunner 11 | return ExperimentRunner 12 | 13 | def get_params_from_file(filepath, params_name='params'): 14 | import importlib 15 | from dotmap import DotMap 16 | module = importlib.import_module(filepath) 17 | params = getattr(module, params_name) 18 | params = DotMap(params) 19 | return params 20 | 21 | def get_variant_spec(command_line_args, *args, **kwargs): 22 | from .base import get_variant_spec 23 | import importlib 24 | params = get_params_from_file(command_line_args.config) 25 | variant_spec = get_variant_spec(command_line_args, *args, params, **kwargs) 26 | return variant_spec 27 | 28 | def get_parser(): 29 | from .utils import get_parser 30 | parser = get_parser() 31 | return parser 32 | -------------------------------------------------------------------------------- /configs/baseconfig/base.py: -------------------------------------------------------------------------------- 1 | from ray import tune 2 | import numpy as np 3 | import pdb 4 | 5 | from utilities.utils import deep_update 6 | 7 | M = 256 #256 8 | 9 | NUM_COUPLING_LAYERS = 2 10 | 11 | DEFAULT_MAX_PATH_LENGTH = 1000 12 | 13 | CPO_POLICY_PARAMS_BASE = { 14 | 'type': 'CPOPolicy', 15 | 'kwargs': { 16 | 'a_hidden_layer_sizes': (M, M), # policy network hidden layers 17 | 'constrain_cost': True, # constrain_cost=False will perform TRPO updates 18 | 'vf_lr': 3e-4, # learn rate for value learning 19 | 'vf_hidden_layer_sizes':(M,M), # nn hidden layers for vf 20 | 'vf_epochs': 8, # number of training epochs for values 21 | 'vf_batch_size': 2048, # minibatches for value training 22 | 'vf_ensemble_size': 3, # vf ensemble size 23 | 'vf_elites': 2, # vf elites 24 | 'vf_activation': 'swish', # activation function 25 | 'vf_loss': 'MSE', # choose from 'NLL', 'MSPE' (inc. var); 'MSE' ; 'Huber' 26 | 'vf_decay': 1e-6, # decay for nn regularization 27 | 'vf_clipping': False, # clip losses for a trust-region like vf update 28 | 'vf_kl_cliprange': 0.0, # only applicable if vf_clippping=True 29 | 'ent_reg': 0, # 5e-3 # exploration bonus for maintaining pol. 
entropy 30 | 'target_kl': 0.01, # trust region diameter 31 | 'cost_lim': 10, # cost limit for whole task length 32 | 'cost_lam': .5, # gae lambda 33 | 'cost_gamma': 0.97, # discounts 34 | 'lam': .95, # gae lambda 35 | 'gamma': 0.99, # discounts 36 | 'epoch_length': tune.sample_from(lambda spec: ( 37 | spec.get('config', spec) 38 | ['algorithm_params']['kwargs']['epoch_length'] 39 | )), 40 | 'max_path_length': tune.sample_from(lambda spec: ( 41 | spec.get('config', spec) 42 | ['sampler_params']['kwargs']['max_path_length'] 43 | )), 44 | 'log_dir': tune.sample_from(lambda spec: ( 45 | spec.get('config', spec) 46 | ['log_dir'] 47 | )), 48 | } 49 | } 50 | 51 | POLICY_PARAMS_BASE = { 52 | 'CPOPolicy' : CPO_POLICY_PARAMS_BASE, 53 | } 54 | 55 | POLICY_PARAMS_BASE.update({ 56 | 'cpopolicy': POLICY_PARAMS_BASE['CPOPolicy'] 57 | }) 58 | 59 | ALGORITHM_PARAMS = { 60 | 'CMBPO': { 61 | 'type': 'CMBPO', 62 | 'kwargs': { 63 | 'task': tune.sample_from(lambda spec: ( 64 | spec.get('config', spec) 65 | ['environment_params']['task'] 66 | )), 67 | 'n_env_interacts': int(10e6), 68 | 'epoch_length': 50000, 69 | 'eval_render_mode': 'human', 70 | 'eval_n_episodes': 1, 71 | 'eval_every_n_steps': 5e3, 72 | 'eval_deterministic': False, 73 | 'n_initial_exploration_steps': int(10000), # number of initial exploration steps for model-learning and 74 | # determining uncertainty calibration measurements 75 | #### it is crucial to choose a model that doesn't overfit when trained too often on seen data 76 | ## for model architecture finding: 1. play around with the start samples to find an architecture, that doesn't really overfit 77 | # 2. m_train_freq in can somewhat limit overfitting, but is only treating the symptom 78 | # 3. try finding a balance between the size of new samples per number of 79 | # updates of the model network (with m_train_freq) 80 | 'use_model': True, 81 | 'm_hidden_dims':(512,512), # hidden layer size of model bnn 82 | 'm_loss_type': 'MSPE', 83 | 'm_use_scaler_in': True, 84 | 'm_use_scaler_out': True, 85 | 'm_lr': 1e-3, 86 | 'm_train_freq': 4000, # model is only trained every (self._timestep % self._model_train_freq==0) steps (terminates when stops improving) 87 | 'rollout_batch_size': 1.0e3, # rollout_batch_size is the size of randomly chosen states to start from when rolling out model 88 | 'm_networks': 7, # size of model network ensemble 89 | 'm_elites': 5, # best networks to select from num_networks 90 | 'max_model_t': None, # a timeout for model training (e.g. for speeding up wallclock time) 91 | 'sampling_alpha': 2, # temperature for boltzman-sampling 92 | 'rollout_mode' : 'uncertainty', # 93 | 'rollout_schedule': [10, 500, 5, 30], # if rollout_mode:'schedule' this schedule is defined as 94 | #[min_epoch, max_epoch, min_horizon, max_horizon] 95 | # if rollout_mode:'uncertainty', 'min_horizon' is used as 96 | # the initial rollout horizon and adapted throughout 97 | # training based on per-step uncertainty estimates 98 | # (KL-Divergence). 
99 | 'maxroll': 35, # maximum rollout horizon 100 | 'batch_size_policy': 50000, # batch size per policy update 101 | 'initial_real_samples_per_epoch': 15000, # number of real samples contained in first batch 102 | 'min_real_samples_per_epoch': 500, # absolute minimum of samples 103 | } 104 | }, 105 | } 106 | 107 | BUFFER_PARAMS_PER_ALGO = { 108 | 'CMBPO': { 109 | 'type': 'CPOBuffer', 110 | 'preprocess_type': 'default', 111 | 'kwargs': { 112 | 'size': tune.sample_from(lambda spec: ( 113 | spec.get('config', spec) 114 | ['algorithm_params']['kwargs']['epoch_length'] 115 | )), 116 | 'archive_size': tune.sample_from(lambda spec: ( 117 | { 118 | 'SimpleReplayPool': int(1e6), 119 | 'CPOBuffer':int(3e5), 120 | }.get( 121 | spec.get('config', spec) 122 | ['buffer_params']['type'], 123 | int(1e6)) 124 | )), 125 | } 126 | }, 127 | } 128 | 129 | SAMPLER_PARAMS_PER_ALGO = { 130 | 'default': { 131 | 'type':'CPOSampler', 132 | 'kwargs':{ 133 | 'max_path_length': DEFAULT_MAX_PATH_LENGTH, 134 | 'render_mode': None, 135 | }, 136 | }, 137 | 'CMBPO': { 138 | 'type':'CPOSampler', 139 | 'kwargs':{ 140 | 'max_path_length': DEFAULT_MAX_PATH_LENGTH, 141 | 'render_mode': None, 142 | }, 143 | } 144 | } 145 | 146 | RUN_PARAMS = { 147 | 'seed': tune.sample_from( 148 | lambda spec: np.random.randint(0, 10000)), 149 | 'checkpoint_at_end': True, 150 | 'checkpoint_frequency': 50, 151 | 'checkpoint_buffer': False, 152 | } 153 | 154 | ENV_PARAMS = { 155 | 'normalize_actions':False, 156 | 'kwargs':{} 157 | } 158 | 159 | def get_variant_spec(args, params): 160 | assert hasattr(params, 'universe') and \ 161 | hasattr(params, 'task') and \ 162 | hasattr(params, 'algorithm') and \ 163 | hasattr(params, 'policy') 164 | 165 | universe, task = params.universe, params.task 166 | ENV_PARAMS.update({ 167 | 'universe': universe, 168 | 'task': task, 169 | }) 170 | 171 | algorithm, policy = params.algorithm_params.type, params.policy_params.type 172 | base_spec = { 173 | 'log_dir': f'~/ray_{algorithm.lower()}', 174 | 'exp_name': 'defaults', 175 | 'environment_params': ENV_PARAMS, 176 | 'policy_params': POLICY_PARAMS_BASE[policy], 177 | 'algorithm_params': ALGORITHM_PARAMS[algorithm], 178 | 'buffer_params': BUFFER_PARAMS_PER_ALGO[algorithm], 179 | 'sampler_params': SAMPLER_PARAMS_PER_ALGO[algorithm], 180 | 'run_params': RUN_PARAMS, 181 | } 182 | 183 | variant_spec = deep_update( 184 | base_spec, 185 | params 186 | ) 187 | return variant_spec 188 | -------------------------------------------------------------------------------- /configs/baseconfig/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import glob 4 | import pickle 5 | import sys 6 | import pdb 7 | 8 | import tensorflow as tf 9 | from ray import tune 10 | 11 | from envs.utils import get_env_from_params 12 | from algorithms.utils import get_algorithm_from_params 13 | from policies.utils import get_policy_from_params 14 | from buffers.utils import get_buffer_from_params 15 | from samplers.utils import get_sampler_from_params 16 | 17 | from utilities.utils import set_seed, initialize_tf_variables 18 | from utilities.instrument import run_example_local, run_example_debug 19 | 20 | class ExperimentRunner(tune.Trainable): 21 | def _setup(self, variant): 22 | set_seed(variant['run_params']['seed']) 23 | 24 | self._variant = variant 25 | gpu_options = tf.GPUOptions(allow_growth=True) 26 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 27 | tf.keras.backend.set_session(session) 28 | 
self._session = tf.keras.backend.get_session() 29 | 30 | self.train_generator = None 31 | self._built = False 32 | 33 | def _stop(self): 34 | tf.reset_default_graph() 35 | tf.keras.backend.clear_session() 36 | 37 | def _build(self): 38 | """ 39 | called by tune to build algorithm 40 | """ 41 | variant = copy.deepcopy(self._variant) 42 | 43 | env_params = variant['environment_params'] 44 | env = self.env = ( 45 | get_env_from_params(env_params)) 46 | 47 | buffer = self.buffer = ( 48 | get_buffer_from_params(variant, env)) 49 | sampler = self.sampler = get_sampler_from_params(variant) 50 | policy = self.policy = get_policy_from_params( 51 | variant, env, self._session) 52 | 53 | #### build algorithm 54 | self.algorithm = get_algorithm_from_params( 55 | variant=self._variant, 56 | env=env, 57 | policy=policy, 58 | buffer=buffer, 59 | sampler=sampler, 60 | session=self._session) 61 | 62 | initialize_tf_variables(self._session, only_uninitialized=True) 63 | 64 | # add graph since ray doesn't seem to automatically add that 65 | graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph) 66 | graph_writer.flush() 67 | graph_writer.close() 68 | 69 | #### finalize graph 70 | tf.get_default_graph().finalize() 71 | self._built = True 72 | 73 | 74 | def _train(self): 75 | if not self._built: 76 | self._build() 77 | 78 | if self.train_generator is None: 79 | self.train_generator = self.algorithm.train() 80 | 81 | diagnostics = next(self.train_generator) 82 | 83 | return diagnostics 84 | 85 | def _pickle_path(self, checkpoint_dir): 86 | return os.path.join(checkpoint_dir, 'checkpoint.pkl') 87 | 88 | def _replay_pool_pickle_path(self, checkpoint_dir): 89 | return os.path.join(checkpoint_dir, 'replay_pool.pkl') 90 | 91 | def _tf_checkpoint_prefix(self, checkpoint_dir): 92 | return os.path.join(checkpoint_dir, 'checkpoint') 93 | 94 | def _get_tf_checkpoint(self): 95 | tf_checkpoint = tf.train.Checkpoint(**self.algorithm.tf_saveables) 96 | 97 | return tf_checkpoint 98 | 99 | def _save_replay_pool(self, checkpoint_dir): 100 | replay_pool_pickle_path = self._replay_pool_pickle_path( 101 | checkpoint_dir) 102 | self.buffer.save_latest_experience(replay_pool_pickle_path) 103 | 104 | def _restore_replay_pool(self, current_checkpoint_dir): 105 | experiment_root = os.path.dirname(current_checkpoint_dir) 106 | 107 | experience_paths = [ 108 | self._replay_pool_pickle_path(checkpoint_dir) 109 | for checkpoint_dir in sorted(glob.iglob( 110 | os.path.join(experiment_root, 'checkpoint_*'))) 111 | ] 112 | for experience_path in experience_paths: 113 | self.buffer.load_experience(experience_path) 114 | 115 | def _save(self, checkpoint_dir): 116 | """Implements the saving logic. 117 | @anyboby: implementation very cmbpo specific saving methods, not optimal! 118 | but general interfaces seem hard to implement due to all the different 119 | frameworks (Keras, tf, pickling etc.) 120 | """ 121 | 122 | ## only saves model atm 123 | self.policy_path = self.policy.save(checkpoint_dir) ### @anyboby: this saves all tf objects 124 | self.algorithm.save(checkpoint_dir) 125 | 126 | if self._variant['run_params'].get('checkpoint_replay_pool', False): 127 | self._save_replay_pool(checkpoint_dir) 128 | 129 | return os.path.join(checkpoint_dir, '') 130 | 131 | def _restore(self, checkpoint_dir): 132 | raise NotImplementedError 133 | 134 | def main(argv=None): 135 | """Run ExperimentRunner locally on ray. 
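
    The same entry point is exposed through the `cmbpo` console script
    (see scripts/console_scripts.py and the README), e.g.:

        cmbpo run_local configs.baseconfig --config=configs.cmbpo_hcs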
136 | """ 137 | run_example_local(__package__, argv) 138 | 139 | if __name__ == '__main__': 140 | main(argv=sys.argv[1:]) -------------------------------------------------------------------------------- /configs/baseconfig/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import argparse 3 | from distutils.util import strtobool 4 | import json 5 | 6 | from ray.tune import sample_from 7 | 8 | def add_ray_init_args(parser): 9 | 10 | def init_help_string(help_string): 11 | return help_string + " Passed to `ray.init`." 12 | 13 | parser.add_argument( 14 | '--cpus', 15 | type=int, 16 | default=None, 17 | help=init_help_string("Cpus to allocate to ray process.")) 18 | parser.add_argument( 19 | '--gpus', 20 | type=int, 21 | default=None, 22 | help=init_help_string("Gpus to allocate to ray process.")) 23 | parser.add_argument( 24 | '--resources', 25 | type=json.loads, 26 | default=None, 27 | help=init_help_string("Resources to allocate to ray process.")) 28 | parser.add_argument( 29 | '--include-webui', 30 | type=str, 31 | default=False, 32 | help=init_help_string("Boolean flag indicating whether to start the" 33 | "web UI, which is a Jupyter notebook.")) 34 | parser.add_argument( 35 | '--temp-dir', 36 | type=str, 37 | default=None, 38 | help=init_help_string("If provided, it will specify the root temporary" 39 | " directory for the Ray process.")) 40 | 41 | return parser 42 | 43 | 44 | def add_ray_tune_args(parser): 45 | 46 | def tune_help_string(help_string): 47 | return help_string + " Passed to `tune.run_experiments`." 48 | 49 | parser.add_argument( 50 | '--resources-per-trial', 51 | type=json.loads, 52 | default={}, 53 | help=tune_help_string("Resources to allocate for each trial.")) 54 | parser.add_argument( 55 | '--trial-gpus', 56 | type=float, 57 | default=None, 58 | help=("Resources to allocate for each trial. Passed" 59 | " to `tune.run_experiments`.")) 60 | parser.add_argument( 61 | '--trial-extra-cpus', 62 | type=int, 63 | default=None, 64 | help=("Extra CPUs to reserve in case the trials need to" 65 | " launch additional Ray actors that use CPUs.")) 66 | parser.add_argument( 67 | '--trial-extra-gpus', 68 | type=float, 69 | default=None, 70 | help=("Extra GPUs to reserve in case the trials need to" 71 | " launch additional Ray actors that use GPUs.")) 72 | parser.add_argument( 73 | '--num-samples', 74 | default=1, 75 | type=int, 76 | help=tune_help_string("Number of times to repeat each trial.")) 77 | parser.add_argument( 78 | '--upload-dir', 79 | type=str, 80 | default='', 81 | help=tune_help_string("Optional URI to sync training results to (e.g." 82 | " s3:// or gs://).")) 83 | parser.add_argument( 84 | '--trial-name-template', 85 | type=str, 86 | default='seed:{trial.config[run_params][seed]}', 87 | help=tune_help_string( 88 | "Optional string template for trial name. For example:" 89 | " '{trial.trial_id}-seed={trial.config[run_params][seed]}'")) 90 | parser.add_argument( 91 | '--trial-cpus', 92 | type=int, 93 | default=multiprocessing.cpu_count(), 94 | help=tune_help_string("Resources to allocate for each trial.")) 95 | parser.add_argument( 96 | '--checkpoint-frequency', 97 | type=int, 98 | default=None, 99 | help=tune_help_string( 100 | "How many training iterations between checkpoints." 101 | " A value of 0 (default) disables checkpointing. 
If set," 102 | " takes precedence over variant['run_params']" 103 | "['checkpoint_frequency'].")) 104 | parser.add_argument( 105 | '--checkpoint-at-end', 106 | type=lambda x: bool(strtobool(x)), 107 | default=None, 108 | help=tune_help_string( 109 | "Whether to checkpoint at the end of the experiment. If set," 110 | " takes precedence over variant['run_params']" 111 | "['checkpoint_at_end'].")) 112 | parser.add_argument( 113 | '--max-failures', 114 | default=3, 115 | type=int, 116 | help=tune_help_string( 117 | "Try to recover a trial from its last checkpoint at least this " 118 | "many times. Only applies if checkpointing is enabled.")) 119 | parser.add_argument( 120 | '--restore', 121 | type=str, 122 | default=None, 123 | help=tune_help_string( 124 | "Path to checkpoint. Only makes sense to set if running 1 trial." 125 | " Defaults to None.")) 126 | parser.add_argument( 127 | '--with-server', 128 | type=str, 129 | default=False, 130 | help=tune_help_string("Starts a background Tune server. Needed for" 131 | " using the Client API.")) 132 | 133 | return parser 134 | 135 | 136 | def get_parser(): 137 | parser = argparse.ArgumentParser() 138 | 139 | parser.add_argument( 140 | '--config', 141 | type=str) 142 | 143 | parser.add_argument( 144 | '--checkpoint-replay-pool', 145 | type=lambda x: bool(strtobool(x)), 146 | default=None, 147 | help=("Whether a checkpoint should also saved the replay" 148 | " pool. If set, takes precedence over" 149 | " variant['run_params']['checkpoint_replay_pool']." 150 | " Note that the replay pool is saved (and " 151 | " constructed) piece by piece so that each" 152 | " experience is saved only once.")) 153 | 154 | parser.add_argument( 155 | '--policy', 156 | type=str, 157 | choices=('cpopolicy'), 158 | default='cpopolicy') 159 | 160 | parser.add_argument( 161 | '--mode', type=str, default='local') 162 | parser.add_argument( 163 | '--confirm-remote', 164 | type=lambda x: bool(strtobool(x)), 165 | nargs='?', 166 | const=True, 167 | default=True, 168 | help="Whether or not to query yes/no on remote run.") 169 | 170 | parser.add_argument( 171 | '--video-save-frequency', 172 | type=int, 173 | default=None, 174 | help="Save frequency for videos.") 175 | 176 | parser = add_ray_init_args(parser) 177 | parser = add_ray_tune_args(parser) 178 | 179 | return parser 180 | 181 | def variant_equals(*keys): 182 | def get_from_spec(spec): 183 | # TODO(hartikainen): This may break in some cases. ray.tune seems to 184 | # add a 'config' key at the top of the spec, whereas `generate_variants` 185 | # does not. 186 | node = spec.get('config', spec) 187 | for key in keys: 188 | node = node[key] 189 | 190 | return node 191 | 192 | return sample_from(get_from_spec) 193 | -------------------------------------------------------------------------------- /configs/cmbpo_antsafe.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'AntSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(10000), 16 | #### it is crucial to choose a model that doesn't overfit when trained too often on seen data 17 | ## for model architecture finding: 1. play around with the start samples to find an architecture, that doesn't really overfit 18 | # 2. 
m_train_freq in can somewhat limit overfitting, but is only treating the symptom 19 | # 3. try finding a balance between the size of new samples per number of 20 | # updates of the model network (with m_train_freq) 21 | 'use_model': True, 22 | 'm_hidden_dims':(512,512), # hidden layer size of model bnn 23 | 'm_loss_type': 'MSPE', 24 | 'm_use_scaler_in': True, 25 | 'm_use_scaler_out': True, 26 | 'm_lr': 1e-3, 27 | 'm_train_freq': 4000, # model is only trained every (self._timestep % self._model_train_freq==0) steps (terminates when stops improving) 28 | 'rollout_batch_size': 1.0e3, # rollout_batch_size is the size of randomly chosen states to start from when rolling out model 29 | 'm_networks': 7, # size of model network ensemble 30 | 'm_elites': 5, # best networks to select from num_networks 31 | 'max_model_t': None, # a timeout for model training (e.g. for speeding up wallclock time) 32 | 'sampling_alpha': 2, 33 | 'rollout_mode' : 'uncertainty', #### choose from 'schedule', or 'uncertainty' 34 | 'rollout_schedule': [10, 500, 5, 30], #[15, 100, 1, 15], # min_epoch, max_epoch, min_length, max_length = self._rollout_schedule 35 | # increases rollout length from min_length to max_length over 36 | # range of (min_epoch, max_epoch) 37 | ### Only applies if rollout_mode=='schedule' 38 | 'maxroll': 35, # maximum rollout horizon 39 | 'batch_size_policy': 50000, # batch size before policy is updates 40 | 'initial_real_samples_per_epoch': 20000, # number of real samples contained in first batch 41 | 'min_real_samples_per_epoch': 500, # absolute minimum of samples 42 | } 43 | }, 44 | 'policy_params':{ 45 | 'type':'cpopolicy', 46 | 'kwargs':{ 47 | 'constrain_cost': True, # constrain_cost=False will perform TRPO updates 48 | 'a_hidden_layer_sizes': (128, 128), # policy network hidden layers 49 | 'vf_lr': 3e-4, # learn rate for value learning 50 | 'vf_hidden_layer_sizes':(128,128), # nn hidden layers for vf 51 | 'vf_epochs': 8, # number of training epochs for values 52 | 'vf_batch_size': 2048, # minibatches for value training 53 | 'vf_ensemble_size': 3, # vf ensemble size 54 | 'vf_elites': 2, # vf elites 55 | 'vf_activation': 'swish', # activation function 56 | 'vf_loss': 'MSE', # choose from 'NLL', 'MSPE' (inc. var); 'MSE' ; 'Huber' 57 | 'vf_decay': 1e-6, # decay for nn regularization 58 | 'vf_clipping': False, # clip losses for a trust-region like vf update 59 | 'vf_kl_cliprange': 0.0, # only applicable if vf_clippping=True 60 | 'ent_reg': 0, # 5e-3 # exploration bonus for maintaining pol. 
entropy 61 | 'target_kl': 0.01, # trust region diameter 62 | 'cost_lim': 10, 63 | 'cost_lam': .5, # gae lambda 64 | 'cost_gamma': 0.97, # discounts 65 | 'lam': .95, # gae lambda 66 | 'gamma': 0.99, # discounts 67 | } 68 | }, 69 | 'buffer_params': {}, 70 | 'sampler_params': { 71 | 'kwargs':{ 72 | 'render_mode':None, #'human' 73 | } 74 | }, 75 | 'run_params': {}, 76 | } -------------------------------------------------------------------------------- /configs/cmbpo_hcs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HalfCheetahSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(10000), 16 | 'use_model': True, 17 | 'm_hidden_dims':(512,512), 18 | 'm_loss_type': 'MSPE', 19 | 'm_use_scaler_in': True, 20 | 'm_use_scaler_out': True, 21 | 'm_lr': 1e-3, 22 | 'm_train_freq': 4000, 23 | 'rollout_batch_size': 1.0e3, 24 | 'm_networks': 7, 25 | 'm_elites': 5, 26 | 'max_model_t': None, 27 | 'sampling_alpha': 2, 28 | 'rollout_mode' : 'uncertainty', 29 | 'rollout_schedule': [10, 500, 5, 30], 30 | 'maxroll': 35, 31 | 'batch_size_policy': 50000, 32 | 'initial_real_samples_per_epoch': 15000, 33 | 'min_real_samples_per_epoch': 500, 34 | } 35 | }, 36 | 'policy_params':{ 37 | 'type':'cpopolicy', 38 | 'kwargs':{ 39 | 'constrain_cost': False, 40 | 'a_hidden_layer_sizes': (128, 128), 41 | 'vf_lr': 3e-4, 42 | 'vf_hidden_layer_sizes':(128,128), 43 | 'vf_epochs': 8, 44 | 'vf_batch_size': 2048, 45 | 'vf_ensemble_size': 3, 46 | 'vf_elites': 2, 47 | 'vf_activation': 'swish', 48 | 'vf_loss': 'MSE', 49 | 'vf_decay': 1e-6, 50 | 'vf_clipping': False, 51 | 'vf_kl_cliprange': 0.0, 52 | 'ent_reg': 0, # 5e-3 53 | 'target_kl': 0.01, 54 | 'lam': .95, 55 | 'gamma': 0.99, 56 | } 57 | }, 58 | 'buffer_params': {}, 59 | 'sampler_params': { 60 | 'kwargs':{ 61 | 'render_mode':'human', 62 | } 63 | }, 64 | 'run_params': {}, 65 | } -------------------------------------------------------------------------------- /configs/cmbpo_hs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HumanoidSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(10000), 16 | 'use_model': True, 17 | 'm_hidden_dims':(512,512), 18 | 'm_loss_type': 'MSPE', 19 | 'm_use_scaler_in': True, 20 | 'm_use_scaler_out': True, 21 | 'm_lr': 1e-3, 22 | 'm_train_freq': 4000, 23 | 'rollout_batch_size': 1.0e3, 24 | 'm_networks': 7, 25 | 'm_elites': 5, 26 | 'max_model_t': None, 27 | 'sampling_alpha': 2, 28 | 'rollout_mode' : 'uncertainty', 29 | 'rollout_schedule': [10, 500, 5, 30], 30 | 'maxroll': 35, 31 | 'batch_size_policy': 50000, 32 | 'initial_real_samples_per_epoch': 15000, 33 | 'min_real_samples_per_epoch': 500, 34 | } 35 | }, 36 | 'policy_params':{ 37 | 'type':'cpopolicy', 38 | 'kwargs':{ 39 | 'constrain_cost': False, 40 | 'a_hidden_layer_sizes': (128, 128), 41 | 'vf_lr': 3e-4, 42 | 'vf_hidden_layer_sizes':(128,128), 43 | 'vf_epochs': 8, 44 | 'vf_batch_size': 2048, 45 | 'vf_ensemble_size': 3, 46 | 'vf_elites': 2, 47 | 'vf_activation': 'swish', 48 | 
'vf_loss': 'MSE', 49 | 'vf_decay': 1e-6, 50 | 'vf_clipping': False, 51 | 'vf_kl_cliprange': 0.0, 52 | 'ent_reg': 0, # 5e-3 53 | 'target_kl': 0.01, 54 | 'lam': .95, 55 | 'gamma': 0.99, 56 | } 57 | }, 58 | 'buffer_params': {}, 59 | 'sampler_params': { 60 | 'kwargs':{ 61 | 'render_mode':'human', 62 | } 63 | }, 64 | 'run_params': {}, 65 | } -------------------------------------------------------------------------------- /configs/cpo_hcs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HalfCheetahSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(0), 16 | 'use_model': False, 17 | 'batch_size_policy': 35000, 18 | } 19 | }, 20 | 'policy_params':{ 21 | 'type':'cpopolicy', 22 | 'kwargs':{ 23 | 'constrain_cost': True, 24 | 'a_hidden_layer_sizes': (128, 128), 25 | 'vf_lr': 3e-4, 26 | 'vf_hidden_layer_sizes':(128,128), 27 | 'vf_epochs': 8, 28 | 'vf_batch_size': 2048, 29 | 'vf_ensemble_size': 3, 30 | 'vf_elites': 2, 31 | 'vf_activation': 'swish', 32 | 'vf_loss': 'MSE', 33 | 'vf_decay': 1e-6, 34 | 'vf_clipping': False, 35 | 'vf_kl_cliprange': 0.0, 36 | 'ent_reg': 0, # 5e-3 37 | 'target_kl': 0.01, 38 | 'lam': .95, 39 | 'gamma': 0.99, 40 | } 41 | }, 42 | 'buffer_params': {}, 43 | 'sampler_params': { 44 | 'kwargs':{ 45 | 'render_mode':None, 46 | } 47 | }, 48 | 'run_params': {}, 49 | } -------------------------------------------------------------------------------- /configs/trpo_hcs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HalfCheetahSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_render_mode': 'human', 15 | 'eval_n_episodes': 1, 16 | 'eval_every_n_steps': 5e3, 17 | 'eval_deterministic': False, 18 | 'n_initial_exploration_steps': int(0), 19 | 'use_model': False, 20 | 'batch_size_policy': 25000, 21 | } 22 | }, 23 | 'policy_params':{ 24 | 'type':'cpopolicy', 25 | 'kwargs':{ 26 | 'constrain_cost': False, 27 | 'a_hidden_layer_sizes': (128, 128), 28 | 'vf_lr': 3e-4, 29 | 'vf_hidden_layer_sizes':(128,128), 30 | 'vf_epochs': 8, 31 | 'vf_batch_size': 2048, 32 | 'vf_ensemble_size': 3, 33 | 'vf_elites': 2, 34 | 'vf_activation': 'swish', 35 | 'vf_loss': 'MSE', 36 | 'vf_decay': 1e-6, 37 | 'vf_clipping': False, 38 | 'vf_kl_cliprange': 0.0, 39 | 'ent_reg': 0, 40 | 'target_kl': 0.01, 41 | 'cost_lim': 10, 42 | 'cost_lam': .5, 43 | 'cost_gamma': 0.97, 44 | 'lam': .95, 45 | 'gamma': 0.99, 46 | } 47 | }, 48 | 'buffer_params': {}, 49 | 'sampler_params': { 50 | 'kwargs':{ 51 | 'render_mode':None, 52 | } 53 | }, 54 | 'run_params': {}, 55 | } -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/__init__.py -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/__init__.py: 
-------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | import gym 3 | 4 | import os 5 | import sys 6 | dirpath = os.path.dirname(os.path.dirname(__file__)) 7 | sys.path.append(dirpath) 8 | 9 | env_specs = gym.envs.registry.env_specs 10 | 11 | if 'HumanoidSafe-v2' not in env_specs: 12 | register( 13 | id='HumanoidSafe-v2', 14 | entry_point='mujoco_safety_gym.envs:HumanoidEnv', 15 | max_episode_steps=1000, 16 | ) 17 | if 'AntSafe-v2' not in env_specs: 18 | register( 19 | id='AntSafe-v2', 20 | entry_point='mujoco_safety_gym.envs:AntEnv', 21 | max_episode_steps=1000, 22 | ) 23 | if 'AntSafeVisualize-v2' not in env_specs: 24 | register( 25 | id='AntSafeVisualize-v2', 26 | entry_point='mujoco_safety_gym.envs:AntEnvVisualize', 27 | max_episode_steps=1000, 28 | ) 29 | if 'HopperSafe-v2' not in env_specs: 30 | register( 31 | id='HopperSafe-v2', 32 | entry_point='mujoco_safety_gym.envs:HopperEnv', 33 | max_episode_steps=1000, 34 | ) 35 | if 'HalfCheetahSafe-v2' not in env_specs: 36 | register( 37 | id='HalfCheetahSafe-v2', 38 | entry_point='mujoco_safety_gym.envs:HalfCheetahEnv', 39 | max_episode_steps=1000, 40 | ) 41 | if 'FetchPushSafety-v0' not in env_specs: 42 | register( 43 | id='FetchPushSafety-v0', 44 | entry_point='mujoco_safety_gym.envs:FetchPushEnv', 45 | max_episode_steps=1000, 46 | ) 47 | if 'FetchReachSafety-v0' not in env_specs: 48 | register( 49 | id='FetchReachSafety-v0', 50 | entry_point='mujoco_safety_gym.envs:FetchReachEnv', 51 | max_episode_steps=1000, 52 | ) 53 | if 'FetchSlideSafety-v0' not in env_specs: 54 | register( 55 | id='FetchSlideSafety-v0', 56 | entry_point='mujoco_safety_gym.envs:FetchSlideEnv', 57 | max_episode_steps=1000, 58 | ) -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from mujoco_safety_gym.envs.humanoid import HumanoidEnv 2 | from mujoco_safety_gym.envs.ant import AntEnv 3 | from mujoco_safety_gym.envs.ant_viz import AntEnvVisualize 4 | from mujoco_safety_gym.envs.half_cheetah import HalfCheetahEnv 5 | from mujoco_safety_gym.envs.hopper import HopperEnv 6 | from mujoco_safety_gym.envs.fetch.push import FetchPushEnv 7 | from mujoco_safety_gym.envs.fetch.reach import FetchReachEnv 8 | from mujoco_safety_gym.envs.fetch.slide import FetchSlideEnv 9 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.get_body_com("torso")[0] 13 | self.do_simulation(a, self.frame_skip) 14 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 15 | xposafter = self.get_body_com("torso")[0] 16 | forward_reward = (xposafter - xposbefore)/self.dt 17 | ctrl_cost = .5 * np.square(a).sum() 18 | contact_cost = 0.5 * 1e-3 * np.sum( 19 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 20 | survive_reward = 1.0 21 | 22 | ### safety stuff 23 | yposafter = 
self.get_body_com("torso")[1] 24 | ywall = np.array([-5,5]) 25 | if xposafter<20: 26 | y_walldist = yposafter - xposafter*np.tan(30/360*2*np.pi)+ywall 27 | elif xposafter>20 and xposafter<60: 28 | y_walldist = yposafter + (xposafter-40)*np.tan(30/360*2*np.pi) - ywall 29 | elif xposafter>60 and xposafter<100: 30 | y_walldist = yposafter - (xposafter-80)*np.tan(30/360*2*np.pi) + ywall 31 | else: 32 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall 33 | 34 | obj_cost = (abs(y_walldist)<1.8).any()*1.0 35 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 36 | 37 | body_quat = self.data.get_body_xquat('torso') 38 | z_rot = 1-2*(body_quat[1]**2+body_quat[2]**2) ### normally xx-rotation, not sure what axes mujoco uses 39 | state = self.state_vector() 40 | notdone = np.isfinite(state).all() \ 41 | and state[2] >= 0.2 and state[2] <= 1.0 \ 42 | and z_rot >= -0.7 43 | done = not notdone 44 | done_cost = done*1.0 45 | cost = np.clip(obj_cost+done_cost, 0, 1) 46 | ob = self._get_obs() 47 | return ob, reward, done, dict( 48 | reward_forward=forward_reward, 49 | reward_ctrl=-ctrl_cost, 50 | reward_contact=-contact_cost, 51 | reward_survive=survive_reward, 52 | cost_obj = obj_cost, 53 | cost_done = done_cost, 54 | cost = cost, 55 | ) 56 | 57 | def _get_obs(self): 58 | x = self.sim.data.qpos.flat[0] 59 | y = self.sim.data.qpos.flat[1] 60 | if x<20: 61 | y_off = y - x*np.tan(30/360*2*np.pi) 62 | elif x>20 and x<60: 63 | y_off = y + (x-40)*np.tan(30/360*2*np.pi) 64 | elif x>60 and x<100: 65 | y_off = y - (x-80)*np.tan(30/360*2*np.pi) 66 | else: 67 | y_off = y - 20*np.tan(30/360*2*np.pi) 68 | 69 | return np.concatenate([ 70 | self.sim.data.qpos.flat[2:-42], 71 | self.sim.data.qvel.flat[:-36], 72 | [x/5], 73 | [y_off], 74 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 75 | ]) 76 | 77 | def reset_model(self): 78 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 79 | qpos[-42:] = self.init_qpos[-42:] 80 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 81 | qvel[-36:] = self.init_qvel[-36:] 82 | self.set_state(qpos, qvel) 83 | return self._get_obs() 84 | 85 | 86 | def viewer_setup(self): 87 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/ant_viz.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | class AntEnvVisualize(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'ant_viz.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.get_body_com("torso")[0] 13 | self.do_simulation(a, self.frame_skip) 14 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 15 | xposafter = self.get_body_com("torso")[0] 16 | forward_reward = (xposafter - xposbefore)/self.dt 17 | ctrl_cost = .5 * np.square(a).sum() 18 | contact_cost = 0.5 * 1e-3 * np.sum( 19 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 20 | survive_reward = 1.0 21 | 22 | ### safety stuff 23 | yposafter = self.get_body_com("torso")[1] 24 | ywall = np.array([-5,5]) 25 | if xposafter<20: 26 | y_walldist = yposafter - xposafter*np.tan(30/360*2*np.pi)+ywall 27 | elif xposafter>20 and xposafter<60: 28 | y_walldist = 
yposafter + (xposafter-40)*np.tan(30/360*2*np.pi) - ywall 29 | elif xposafter>60 and xposafter<100: 30 | y_walldist = yposafter - (xposafter-80)*np.tan(30/360*2*np.pi) + ywall 31 | else: 32 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall 33 | 34 | obj_cost = (abs(y_walldist)<1.8).any()*1.0 35 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 36 | 37 | body_quat = self.data.get_body_xquat('torso') 38 | z_rot = 1-2*(body_quat[1]**2+body_quat[2]**2) ### normally xx-rotation, not sure what axes mujoco uses 39 | state = self.state_vector() 40 | notdone = np.isfinite(state).all() \ 41 | and state[2] >= 0.2 and state[2] <= 1.0 \ 42 | and z_rot >= -0.7 43 | done = not notdone 44 | done_cost = done*1.0 45 | cost = np.clip(obj_cost+done_cost, 0, 1) 46 | ob = self._get_obs() 47 | return ob, reward, done, dict( 48 | reward_forward=forward_reward, 49 | reward_ctrl=-ctrl_cost, 50 | reward_contact=-contact_cost, 51 | reward_survive=survive_reward, 52 | cost_obj = obj_cost, 53 | cost_done = done_cost, 54 | cost = cost, 55 | ) 56 | 57 | def _get_obs(self): 58 | x = self.sim.data.qpos.flat[0] 59 | y = self.sim.data.qpos.flat[1] 60 | 61 | x2 = self.sim.data.qpos.flat[15] 62 | y2 = self.sim.data.qpos.flat[16] 63 | 64 | x3 = self.sim.data.qpos.flat[30] 65 | y3 = self.sim.data.qpos.flat[31] 66 | 67 | if x<20: 68 | y_off = y - x*np.tan(30/360*2*np.pi) 69 | elif x>20 and x<60: 70 | y_off = y + (x-40)*np.tan(30/360*2*np.pi) 71 | elif x>60 and x<100: 72 | y_off = y - (x-80)*np.tan(30/360*2*np.pi) 73 | else: 74 | y_off = y - 20*np.tan(30/360*2*np.pi) 75 | 76 | qpos1 = self.sim.data.qpos.flat[2:15] 77 | qvel1 = self.sim.data.qvel.flat[:14] 78 | 79 | if x2<20: 80 | y_off2 = y2- x2*np.tan(30/360*2*np.pi) 81 | elif x2>20 and x<60: 82 | y_off2 = y2 + (x2-40)*np.tan(30/360*2*np.pi) 83 | elif x2>60 and x<100: 84 | y_off2 = y2 - (x2-80)*np.tan(30/360*2*np.pi) 85 | else: 86 | y_off2 = y2 - 20*np.tan(30/360*2*np.pi) 87 | 88 | qpos2 = self.sim.data.qpos.flat[17:30] 89 | qvel2 = self.sim.data.qvel.flat[14:28] 90 | 91 | if x3<20: 92 | y_off3 = y3 - x3*np.tan(30/360*2*np.pi) 93 | elif x3>20 and x<60: 94 | y_off3 = y3 + (x3-40)*np.tan(30/360*2*np.pi) 95 | elif x3>60 and x<100: 96 | y_off3 = y3 - (x3-80)*np.tan(30/360*2*np.pi) 97 | else: 98 | y_off3 = y3 - 20*np.tan(30/360*2*np.pi) 99 | 100 | qpos3 = self.sim.data.qpos.flat[32:45] 101 | qvel3 = self.sim.data.qvel.flat[28:42] 102 | 103 | return np.concatenate([ 104 | qpos1, 105 | qvel1, 106 | [x/5], 107 | [y_off], 108 | qpos2, 109 | qvel2, 110 | [x2/5], 111 | [y_off2], 112 | qpos3, 113 | qvel3, 114 | [x3/5], 115 | [y_off3], 116 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 117 | ]) 118 | 119 | def reset_model(self): 120 | # qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 121 | # qpos[-42:] = self.init_qpos[-42:] 122 | # qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 123 | # qvel[-36:] = self.init_qvel[-36:] 124 | qpos = self.init_qpos 125 | qvel = self.init_qvel 126 | self.set_state(qpos, qvel) 127 | return self._get_obs() 128 | 129 | 130 | def viewer_setup(self): 131 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/assets/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 129 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/assets/fetch/pick_and_place.xml: 
-------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 36 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/push.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 43 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/reach.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 38 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/robot.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 121 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/shared.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 67 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/slide.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 33 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/half_cheetah.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 111 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/hopper.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 65 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/textures/block.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/mujoco_safety_gym/envs/assets/textures/block.png -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/assets/textures/block_hidden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/mujoco_safety_gym/envs/assets/textures/block_hidden.png -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/pick_and_place.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gym import utils 3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew 4 | 5 | 6 | # Ensure we get the path separator correct on windows 7 | MODEL_XML_PATH = os.path.join('fetch', 'pick_and_place.xml') 8 | 9 | 10 | class FetchPickAndPlaceEnv(FetchEnvNew, utils.EzPickle): 11 | def __init__(self, reward_type='sparse'): 12 | initial_qpos = { 13 | 'robot0:slide0': 0.405, 14 | 'robot0:slide1': 0.48, 15 | 'robot0:slide2': 0.0, 16 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], 17 | } 18 | FetchEnvNew.__init__( 19 | self, MODEL_XML_PATH, has_object=True, block_gripper=False, n_substeps=20, 20 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0, 21 | obj_range=0.15, target_range=0.15, distance_threshold=0.05, additional_objects=False, 22 | number_of_objects = 0, initial_qpos=initial_qpos, reward_type=reward_type) 23 | utils.EzPickle.__init__(self) 24 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/push.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gym import utils 3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew 4 | 5 | 6 | # Ensure we get the path separator correct on windows 7 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml') 8 | 9 | 10 | class FetchPushEnv(FetchEnvNew, utils.EzPickle): 11 | def __init__(self, reward_type='sparse', additional_objects=False, number_of_objects=5): 12 | initial_qpos = { 13 | 'robot0:slide0': 0.405, 14 | 'robot0:slide1': 0.48, 15 | 'robot0:slide2': 0.0, 16 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.] 
17 | } 18 | FetchEnvNew.__init__( 19 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 20 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, 21 | obj_range=0.10, target_range=0.15, distance_threshold=0.05, additional_objects=additional_objects, 22 | number_of_objects = number_of_objects, initial_qpos=initial_qpos, reward_type=reward_type) 23 | utils.EzPickle.__init__(self) 24 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/reach.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gym import utils 3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew 4 | 5 | 6 | # Ensure we get the path separator correct on windows 7 | MODEL_XML_PATH = os.path.join('fetch', 'reach.xml') 8 | 9 | 10 | class FetchReachEnv(FetchEnvNew, utils.EzPickle): 11 | def __init__(self, reward_type='sparse', additional_objects=False, number_of_objects=5): 12 | initial_qpos = { 13 | 'robot0:slide0': 0.405, 14 | 'robot0:slide1': 0.48, 15 | 'robot0:slide2': 0.0, 16 | } 17 | FetchEnvNew.__init__( 18 | self, MODEL_XML_PATH, has_object=False, block_gripper=True, n_substeps=20, 19 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, 20 | obj_range=0.1, target_range=0.2, distance_threshold=0.05, additional_objects=additional_objects, 21 | number_of_objects = number_of_objects, initial_qpos=initial_qpos, reward_type=reward_type) 22 | utils.EzPickle.__init__(self) -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/slide.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from gym import utils 5 | from mujoco_safety_gym.envs import fetch_env 6 | 7 | 8 | # Ensure we get the path separator correct on windows 9 | MODEL_XML_PATH = os.path.join('fetch', 'slide.xml') 10 | 11 | 12 | class FetchSlideEnv(fetch_env.FetchEnvNew, utils.EzPickle): 13 | def __init__(self, reward_type='sparse'): 14 | initial_qpos = { 15 | 'robot0:slide0': 0.05, 16 | 'robot0:slide1': 0.48, 17 | 'robot0:slide2': 0.0, 18 | 'object0:joint': [1.7, 1.1, 0.41, 1., 0., 0., 0.], 19 | } 20 | fetch_env.FetchEnvNew.__init__( 21 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 22 | gripper_extra_height=-0.02, target_in_the_air=False, target_offset=np.array([0.4, 0.0, 0.0]), 23 | obj_range=0.1, target_range=0.3, distance_threshold=0.05, additional_objects=False, 24 | number_of_objects = 0, initial_qpos=initial_qpos, reward_type=reward_type) 25 | utils.EzPickle.__init__(self) 26 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mujoco_safety_gym.envs import mujoco_env 4 | import mujoco_py as mjp 5 | from gym import error, spaces 6 | 7 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, action): 13 | xposbefore = self.sim.data.qpos[1] 14 | 15 | t = self.data.time 16 | wall_act = .02*np.sin(t/3)**2 - .004 17 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 
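        #### The model has one extra actuator that drives the moving wall obstacle:
        #### the agent's action space drops that last control dim (see _set_action_space
        #### below), and the scripted, time-varying wall force wall_act is appended here
        #### before the simulation step.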
18 | action_p_wall = np.concatenate((np.squeeze(action), [wall_act])) 19 | 20 | self.do_simulation(action_p_wall, self.frame_skip) 21 | xposafter = self.sim.data.qpos[1] 22 | 23 | wallpos = self.data.get_geom_xpos("obj_geom")[0] 24 | wallvel = self.data.get_body_xvelp("obj1")[0] 25 | xdist = wallpos-xposafter 26 | obj_cost = int(np.abs(xdist)<2) 27 | if obj_cost>0: 28 | self.model.geom_rgba[9] = [1.0, 0, 0, 1.0] 29 | else: 30 | self.model.geom_rgba[9] = [1.0, 0.5, 0.5, .8] 31 | ob = self._get_obs() 32 | reward_ctrl = - 0.1 * np.square(action).sum() 33 | reward_run = (xposafter - xposbefore)/self.dt 34 | reward = reward_ctrl + reward_run 35 | done = False 36 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl, cost=obj_cost) 37 | 38 | def _get_obs(self): 39 | wallvel = self.data.get_body_xvelp("obj1")[0] 40 | wall_f = .02*np.sin(self.data.time/3)**2 - .004 41 | xdist = (self.data.get_geom_xpos("obj_geom")[0]-self.sim.data.qpos[1])/10 42 | 43 | return np.concatenate([ 44 | self.sim.data.qpos.flat[2:], 45 | self.sim.data.qvel.flat[1:], 46 | [wallvel], 47 | [wall_f], 48 | np.clip([xdist], -5, 5), 49 | ]) 50 | 51 | def reset_model(self): 52 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 53 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 54 | self.set_state(qpos, qvel) 55 | return self._get_obs() 56 | 57 | def viewer_setup(self): 58 | self.viewer.cam.distance = self.model.stat.extent * 0.5 59 | 60 | 61 | def _set_action_space(self): 62 | bounds = self.model.actuator_ctrlrange.copy().astype(np.float32) 63 | low, high = bounds.T 64 | low, high = low[:-1], high[:-1] 65 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) 66 | return self.action_space 67 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4) 9 | utils.EzPickle.__init__(self) 10 | self.last_mocx = 5 #### vel readings are super noisy for mocap weld 11 | 12 | def step(self, a): 13 | posbefore = self.sim.data.qpos[3] 14 | t = self.data.time 15 | pos = (t + np.sin(t)) + 3 16 | self.data.set_mocap_pos('mocap1', [pos, 0, 0.5]) 17 | 18 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 19 | self.do_simulation(a, self.frame_skip) 20 | posafter, height, ang = self.sim.data.qpos[3:6] 21 | alive_bonus = 1.0 22 | 23 | mocapx = self.sim.data.qpos[0] 24 | xdist = mocapx-posafter 25 | cost = int(np.abs(xdist)<1) 26 | 27 | reward = (posafter - posbefore) / self.dt 28 | reward += alive_bonus 29 | reward -= 1e-3 * np.square(a).sum() 30 | s = self.state_vector() 31 | done = not (np.isfinite(s).all() and (np.abs(s[5:]) < 100).all() and 32 | (height > .7) and (abs(ang) < .2)) 33 | ob = self._get_obs() 34 | return ob, reward, done, {'cost':cost} 35 | 36 | def _get_obs(self): 37 | x = self.sim.data.qpos[3] 38 | mocapx = self.sim.data.qpos[0] 39 | mocvel = 1 + np.cos(self.data.time) 40 | mocacc = -np.sin(self.data.time) 41 | return np.concatenate([ 42 | self.sim.data.qpos.flat[4:], 43 | np.clip(self.sim.data.qvel[3:].flat, -10, 10), 44 | [mocvel], 45 
| [mocacc], 46 | [mocapx-x], 47 | ]) 48 | 49 | def reset_model(self): 50 | qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq) 51 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 52 | self.set_state(qpos, qvel) 53 | return self._get_obs() 54 | 55 | def last_mocap_x(self): 56 | 57 | return self.last_mocx 58 | 59 | def viewer_setup(self): 60 | self.viewer.cam.trackbodyid = 2 61 | self.viewer.cam.distance = self.model.stat.extent * 0.75 62 | self.viewer.cam.lookat[2] = 1.15 63 | self.viewer.cam.elevation = -20 64 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | def mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 10 | 11 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 12 | def __init__(self): 13 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 14 | utils.EzPickle.__init__(self) 15 | 16 | def _get_obs(self): 17 | data = self.sim.data 18 | x = data.qpos.flat[0] 19 | y = data.qpos.flat[1] 20 | if x<20: 21 | y_off = y - x*np.tan(30/360*2*np.pi) 22 | elif x>20 and x<60: 23 | y_off = y + (x-40)*np.tan(30/360*2*np.pi) 24 | elif x>60 and x<100: 25 | y_off = y - (x-80)*np.tan(30/360*2*np.pi) 26 | else: 27 | y_off = y - 20*np.tan(30/360*2*np.pi) 28 | 29 | return np.concatenate([data.qpos.flat[2:-42], 30 | data.qvel.flat[:-36], 31 | [x/5], 32 | [y_off]]) 33 | 34 | # return np.concatenate([data.qpos.flat[2:], 35 | # data.qvel.flat, 36 | # data.cinert.flat, 37 | # data.cvel.flat, 38 | # data.qfrc_actuator.flat, 39 | # data.cfrc_ext.flat]) 40 | 41 | def step(self, a): 42 | pos_before = mass_center(self.model, self.sim) 43 | self.do_simulation(a, self.frame_skip) 44 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 45 | pos_after = mass_center(self.model, self.sim) 46 | alive_bonus = 5.0 47 | data = self.sim.data 48 | lin_vel_cost = 1.25 * (pos_after - pos_before) / self.dt 49 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 50 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 51 | quad_impact_cost = min(quad_impact_cost, 10) 52 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 53 | 54 | yposafter = self.get_body_com("torso")[1] 55 | ywall = np.array([-5,5]) 56 | if pos_after<20: 57 | y_walldist = yposafter - pos_after*np.tan(30/360*2*np.pi)+ywall 58 | elif pos_after>20 and pos_after<60: 59 | y_walldist = yposafter + (pos_after-40)*np.tan(30/360*2*np.pi) - ywall 60 | elif pos_after>60 and pos_after<100: 61 | y_walldist = yposafter - (pos_after-80)*np.tan(30/360*2*np.pi) + ywall 62 | else: 63 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall 64 | obj_cost = (abs(y_walldist)<2).any()*1.0 65 | 66 | 67 | qpos = self.sim.data.qpos 68 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 69 | 70 | done_cost = done*1.0 71 | cost = np.clip(obj_cost+done_cost, 0, 1) 72 | 73 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, 74 | reward_quadctrl=-quad_ctrl_cost, 75 | reward_alive=alive_bonus, 76 | reward_impact=-quad_impact_cost, 77 | cost_obj = obj_cost, 78 | cost_done = 
done_cost, 79 | cost = cost, 80 | ) 81 | 82 | def reset_model(self): 83 | c = 0.01 84 | # self.set_state( 85 | # self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 86 | # self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 87 | # ) 88 | # return self._get_obs() 89 | qpos = self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq) 90 | qpos[-42:] = self.init_qpos[-42:] 91 | qvel = self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 92 | qvel[-36:] = self.init_qvel[-36:] 93 | self.set_state(qpos, qvel) 94 | return self._get_obs() 95 | 96 | 97 | def viewer_setup(self): 98 | self.viewer.cam.trackbodyid = 1 99 | self.viewer.cam.distance = self.model.stat.extent * 1.0 100 | self.viewer.cam.lookat[2] = 2.0 101 | self.viewer.cam.elevation = -20 102 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/mujoco_env.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import os 3 | 4 | 5 | from gym import error, spaces 6 | from gym.utils import seeding 7 | import numpy as np 8 | from os import path 9 | import gym 10 | 11 | try: 12 | import mujoco_py 13 | except ImportError as e: 14 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 15 | 16 | DEFAULT_SIZE = 500 17 | 18 | 19 | def convert_observation_to_space(observation): 20 | if isinstance(observation, dict): 21 | space = spaces.Dict(OrderedDict([ 22 | (key, convert_observation_to_space(value)) 23 | for key, value in observation.items() 24 | ])) 25 | elif isinstance(observation, np.ndarray): 26 | low = np.full(observation.shape, -float('inf'), dtype=np.float32) 27 | high = np.full(observation.shape, float('inf'), dtype=np.float32) 28 | space = spaces.Box(low, high, dtype=observation.dtype) 29 | else: 30 | raise NotImplementedError(type(observation), observation) 31 | 32 | return space 33 | 34 | 35 | class MujocoEnv(gym.Env): 36 | """Superclass for all MuJoCo environments. 
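    Loads the model XML, owns the MjSim instance and provides common stepping,
    rendering and state utilities. Concrete environments implement the hooks listed
    under 'methods to override' below (reset_model, viewer_setup).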
37 | """ 38 | 39 | def __init__(self, model_path, frame_skip): 40 | if model_path.startswith("/"): 41 | fullpath = model_path 42 | else: 43 | fullpath = os.path.join(os.path.dirname(__file__), "./assets", model_path) 44 | if not path.exists(fullpath): 45 | raise IOError("File %s does not exist" % fullpath) 46 | self.frame_skip = frame_skip 47 | self.model = mujoco_py.load_model_from_path(fullpath) 48 | self.sim = mujoco_py.MjSim(self.model) 49 | self.data = self.sim.data 50 | self.viewer = None 51 | self._viewers = {} 52 | 53 | self.metadata = { 54 | 'render.modes': ['human', 'rgb_array', 'depth_array'], 55 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 56 | } 57 | 58 | self.init_qpos = self.sim.data.qpos.ravel().copy() 59 | self.init_qvel = self.sim.data.qvel.ravel().copy() 60 | 61 | self._set_action_space() 62 | 63 | action = self.action_space.sample() 64 | observation, _reward, done, _info = self.step(action) 65 | # assert not done 66 | 67 | self._set_observation_space(observation) 68 | 69 | self.seed() 70 | 71 | def _set_action_space(self): 72 | bounds = self.model.actuator_ctrlrange.copy().astype(np.float32) 73 | low, high = bounds.T 74 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) 75 | return self.action_space 76 | 77 | def _set_observation_space(self, observation): 78 | self.observation_space = convert_observation_to_space(observation) 79 | return self.observation_space 80 | 81 | def seed(self, seed=None): 82 | self.np_random, seed = seeding.np_random(seed) 83 | return [seed] 84 | 85 | # methods to override: 86 | # ---------------------------- 87 | 88 | def reset_model(self): 89 | """ 90 | Reset the robot degrees of freedom (qpos and qvel). 91 | Implement this in each subclass. 92 | """ 93 | raise NotImplementedError 94 | 95 | def viewer_setup(self): 96 | """ 97 | This method is called when the viewer is initialized. 98 | Optionally implement this method, if you need to tinker with camera position 99 | and so forth. 
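        For example, the environments in this package set camera attributes such as
        self.viewer.cam.distance, trackbodyid and elevation here.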
100 | """ 101 | pass 102 | 103 | # ----------------------------- 104 | 105 | def reset(self): 106 | self.sim.reset() 107 | ob = self.reset_model() 108 | return ob 109 | 110 | def set_state(self, qpos, qvel): 111 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 112 | old_state = self.sim.get_state() 113 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 114 | old_state.act, old_state.udd_state) 115 | self.sim.set_state(new_state) 116 | self.sim.forward() 117 | 118 | @property 119 | def dt(self): 120 | return self.model.opt.timestep * self.frame_skip 121 | 122 | def do_simulation(self, ctrl, n_frames): 123 | self.sim.data.ctrl[:] = ctrl 124 | for _ in range(n_frames): 125 | self.sim.step() 126 | 127 | def render(self, 128 | mode='human', 129 | width=DEFAULT_SIZE, 130 | height=DEFAULT_SIZE, 131 | camera_id=None, 132 | camera_name=None): 133 | if mode == 'rgb_array': 134 | if camera_id is not None and camera_name is not None: 135 | raise ValueError("Both `camera_id` and `camera_name` cannot be" 136 | " specified at the same time.") 137 | 138 | no_camera_specified = camera_name is None and camera_id is None 139 | if no_camera_specified: 140 | camera_name = 'track' 141 | 142 | if camera_id is None and camera_name in self.model._camera_name2id: 143 | camera_id = self.model.camera_name2id(camera_name) 144 | 145 | self._get_viewer(mode).render(width, height, camera_id=camera_id) 146 | # window size used for old mujoco-py: 147 | data = self._get_viewer(mode).read_pixels(width, height, depth=False) 148 | # original image is upside-down, so flip it 149 | return data[::-1, :, :] 150 | elif mode == 'depth_array': 151 | self._get_viewer(mode).render(width, height) 152 | # window size used for old mujoco-py: 153 | # Extract depth part of the read_pixels() tuple 154 | data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1] 155 | # original image is upside-down, so flip it 156 | return data[::-1, :] 157 | elif mode == 'human': 158 | self._get_viewer(mode).render() 159 | 160 | def close(self): 161 | if self.viewer is not None: 162 | # self.viewer.finish() 163 | self.viewer = None 164 | self._viewers = {} 165 | 166 | def _get_viewer(self, mode): 167 | self.viewer = self._viewers.get(mode) 168 | if self.viewer is None: 169 | if mode == 'human': 170 | self.viewer = mujoco_py.MjViewer(self.sim) 171 | elif mode == 'rgb_array' or mode == 'depth_array': 172 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1) 173 | 174 | self.viewer_setup() 175 | self._viewers[mode] = self.viewer 176 | return self.viewer 177 | 178 | def get_body_com(self, body_name): 179 | return self.data.get_body_xpos(body_name) 180 | 181 | def state_vector(self): 182 | return np.concatenate([ 183 | self.sim.data.qpos.flat, 184 | self.sim.data.qvel.flat 185 | ]) 186 | 187 | def place_random_objects(self): 188 | for i in range(9): 189 | random_color_array = np.append(np.random.uniform(0, 1, size=3), 1) 190 | random_pos_array = np.append(np.random.uniform(-10., 10., size=2), 0.5) 191 | site_id = self.sim.model.geom_name2id('obj' + str(i)) 192 | self.sim.model.geom_rgba[site_id] = random_color_array 193 | self.sim.model.geom_pos[site_id] = random_pos_array 194 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/robot_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import numpy as np 4 | 5 | import gym 6 | from gym import error, spaces 7 | from 
gym.utils import seeding 8 | 9 | try: 10 | import mujoco_py 11 | except ImportError as e: 12 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 13 | 14 | DEFAULT_SIZE = 500 15 | 16 | class RobotEnv(gym.GoalEnv): 17 | def __init__(self, model_path, initial_qpos, n_actions, n_substeps): 18 | if model_path.startswith('/'): 19 | fullpath = model_path 20 | else: 21 | fullpath = os.path.join(os.path.dirname(__file__), 'assets', model_path) 22 | if not os.path.exists(fullpath): 23 | raise IOError('File {} does not exist'.format(fullpath)) 24 | 25 | model = mujoco_py.load_model_from_path(fullpath) 26 | self.sim = mujoco_py.MjSim(model, nsubsteps=n_substeps) 27 | self.viewer = None 28 | self._viewers = {} 29 | 30 | self.metadata = { 31 | 'render.modes': ['human', 'rgb_array'], 32 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 33 | } 34 | 35 | self.seed() 36 | self._env_setup(initial_qpos=initial_qpos) 37 | self.initial_state = copy.deepcopy(self.sim.get_state()) 38 | 39 | self.goal = self._sample_goal() 40 | obs = self._get_obs() 41 | self.action_space = spaces.Box(-1., 1., shape=(n_actions,), dtype='float32') 42 | self.observation_space = spaces.Dict(dict( 43 | desired_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'), 44 | achieved_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'), 45 | observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32'), 46 | )) 47 | 48 | @property 49 | def dt(self): 50 | return self.sim.model.opt.timestep * self.sim.nsubsteps 51 | 52 | # Env methods 53 | # ---------------------------- 54 | 55 | def seed(self, seed=None): 56 | self.np_random, seed = seeding.np_random(seed) 57 | return [seed] 58 | 59 | def step(self, action): 60 | # if (action.shape < (4,) and np.ndim(action.shape) == 1): 61 | # action = np.append(action, np.zeros(4 - action.shape[0])) 62 | action = np.clip(action, self.action_space.low, self.action_space.high) 63 | self._set_action(action) 64 | self.sim.step() 65 | self._step_callback() 66 | obs = self._get_obs() 67 | 68 | done = False 69 | info = { 70 | 'is_success': self._is_success(obs['achieved_goal'], self.goal), 71 | 'cost': self._compute_costs(obs), 72 | } 73 | reward = self.compute_reward(obs['achieved_goal'], self.goal, info) 74 | return obs, reward, done, info 75 | 76 | def reset(self, **kwargs): 77 | # Attempt to reset the simulator. Since we randomize initial conditions, it 78 | # is possible to get into a state with numerical issues (e.g. due to penetration or 79 | # Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). 80 | # In this case, we just keep randomizing until we eventually achieve a valid initial 81 | # configuration. 
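        # Note: the goal is sampled once per reset() call, while _reset_sim below may be
        # retried several times until it reports a valid simulator state.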
82 | super(RobotEnv, self).reset() 83 | did_reset_sim = False 84 | self.goal = self._sample_goal().copy() 85 | while not did_reset_sim: 86 | did_reset_sim = self._reset_sim(**kwargs) 87 | obs = self._get_obs() 88 | return obs 89 | 90 | def close(self): 91 | if self.viewer is not None: 92 | # self.viewer.finish() 93 | self.viewer = None 94 | self._viewers = {} 95 | 96 | def render(self, mode='human', width=DEFAULT_SIZE, height=DEFAULT_SIZE): 97 | self._render_callback() 98 | if mode == 'rgb_array': 99 | self._get_viewer(mode).render(width, height) 100 | # window size used for old mujoco-py: 101 | data = self._get_viewer(mode, True).read_pixels(width, height, depth=False) 102 | # original image is upside-down, so flip it 103 | return data[::-1, :, :] 104 | elif mode == 'human': 105 | self._get_viewer(mode).render() 106 | 107 | def _get_viewer(self, mode, cam_fixed=False): 108 | self.viewer = self._viewers.get(mode) 109 | if self.viewer is None: 110 | if mode == 'human': 111 | self.viewer = mujoco_py.MjViewer(self.sim) 112 | elif mode == 'rgb_array': 113 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, 0) 114 | cam_fixed = True 115 | self._viewer_setup(cam_fixed) 116 | self._viewers[mode] = self.viewer 117 | return self.viewer 118 | 119 | # Extension methods 120 | # ---------------------------- 121 | 122 | def _reset_sim(self, **kwargs): 123 | """Resets a simulation and indicates whether or not it was successful. 124 | If a reset was unsuccessful (e.g. if a randomized state caused an error in the 125 | simulation), this method should indicate such a failure by returning False. 126 | In such a case, this method will be called again to attempt a the reset again. 127 | """ 128 | self.sim.set_state(self.initial_state) 129 | self.sim.forward() 130 | return True 131 | 132 | def _get_obs(self): 133 | """Returns the observation. 134 | """ 135 | raise NotImplementedError() 136 | 137 | def _set_action(self, action): 138 | """Applies the given action to the simulation. 139 | """ 140 | raise NotImplementedError() 141 | 142 | def _is_success(self, achieved_goal, desired_goal): 143 | """Indicates whether or not the achieved goal successfully achieved the desired goal. 144 | """ 145 | raise NotImplementedError() 146 | 147 | def _sample_goal(self): 148 | """Samples a new goal and returns it. 149 | """ 150 | raise NotImplementedError() 151 | 152 | def _env_setup(self, initial_qpos): 153 | """Initial configuration of the environment. Can be used to configure initial state 154 | and extract information from the simulation. 155 | """ 156 | pass 157 | 158 | def _viewer_setup(self, cam_fixed=False): 159 | """Initial configuration of the viewer. Can be used to set the camera position, 160 | for example. 161 | """ 162 | pass 163 | 164 | def _render_callback(self): 165 | """A custom callback that is called before rendering. Can be used 166 | to implement custom visualizations. 167 | """ 168 | pass 169 | 170 | def _step_callback(self): 171 | """A custom callback that is called after stepping the simulation. Can be used 172 | to enforce additional constraints on the simulation state. 
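        (e.g. re-clamping selected joint positions after every simulation step)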
173 | """ 174 | pass 175 | 176 | def _compute_costs(self, obs): 177 | """Calculate the costs for the given observation 178 | """ 179 | pass 180 | -------------------------------------------------------------------------------- /envs/utils.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import envs.mujoco_safety_gym 3 | from wrappers import NormalizeActionWrapper 4 | 5 | def get_gym_env(): 6 | import gym 7 | import envs.mujoco_safety_gym 8 | 9 | return gym.make 10 | 11 | def get_safety_gym(): 12 | import safety_gym 13 | 14 | return gym.make 15 | 16 | ENVS_FUNCTIONS = { 17 | 'gym':get_gym_env() 18 | } 19 | 20 | def get_environment(universe, task, environment_kwargs): 21 | env = ENVS_FUNCTIONS[universe](task, **environment_kwargs) 22 | return env 23 | 24 | def get_env_from_params(env_params): 25 | universe = env_params['universe'] 26 | task = env_params['task'] 27 | environment_kwargs = env_params.get('kwargs', {}).copy() 28 | 29 | env = get_environment(universe, task, environment_kwargs) 30 | 31 | #### @anyboby maybe write something nicer for wrappers 32 | if env_params.get('normalize_actions', False): 33 | env = NormalizeActionWrapper(env) 34 | 35 | return env 36 | -------------------------------------------------------------------------------- /envs/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalize_action import NormalizeActionWrapper -------------------------------------------------------------------------------- /envs/wrappers/normalize_action.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | 5 | 6 | __all__ = ['NormalizeActionWrapper'] 7 | 8 | class NormalizeActionWrapper(gym.ActionWrapper): 9 | """Rescale the action space of the environment.""" 10 | 11 | def action(self, action): 12 | if not isinstance(self.env.action_space, spaces.Box): 13 | return action 14 | 15 | # rescale the action 16 | low, high = self.env.action_space.low, self.env.action_space.high 17 | scaled_action = low + (action + 1.0) * (high - low) / 2.0 18 | scaled_action = np.clip(scaled_action, low, high) 19 | 20 | return scaled_action 21 | 22 | def reverse_action(self, action): 23 | raise NotImplementedError 24 | 25 | normalize = NormalizeActionWrapper 26 | -------------------------------------------------------------------------------- /models/base_model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | class BaseModel(abc.ABC): 4 | 5 | @abc.abstractmethod 6 | def predict(self, x): 7 | """ Make predictions, should return (mean, var) if model is probabilistic or mean else""" 8 | raise NotImplementedError 9 | 10 | @abc.abstractmethod 11 | def train(self, x, y, ): 12 | """ Make predictions, should return (mean, var) if model is probabilistic or mean else""" 13 | raise NotImplementedError 14 | 15 | @abc.abstractproperty 16 | def is_probabilistic(self): 17 | """ indicates whether model predictions are probabilistic or deterministic """ 18 | raise NotImplementedError 19 | 20 | @abc.abstractproperty 21 | def is_ensemble(self): 22 | """ indicates whether model is an ensemble """ 23 | raise NotImplementedError 24 | 25 | @abc.abstractproperty 26 | def in_dim(self): 27 | """ dimension of inputs """ 28 | raise NotImplementedError 29 | 30 | @abc.abstractproperty 31 | def out_dim(self): 32 | """ dimension of outputs """ 33 | raise 
NotImplementedError 34 | 35 | class EnsembleModel(BaseModel): 36 | @abc.abstractmethod 37 | def predict_ensemble(self, x): 38 | """ Make predictions of whole ensemble, output shape should be (ensemble, batch_size, y_shape)""" 39 | raise NotImplementedError 40 | 41 | @abc.abstractmethod 42 | def elite_inds(self,): 43 | """ Returns indices of the elite models""" 44 | raise NotImplementedError -------------------------------------------------------------------------------- /models/fake_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import pdb 4 | 5 | from models.pens.pe_factory import build_PE, format_samples_for_dyn, format_samples_for_cost 6 | from models.pens.utils import average_dkl, median_dkl 7 | from models.statics import (REWS_BY_TASK, COST_BY_TASK, TERMS_BY_TASK) 8 | 9 | from itertools import count 10 | import warnings 11 | import time 12 | 13 | EPS = 1e-8 14 | 15 | class FakeEnv: 16 | 17 | def __init__(self, 18 | true_environment, 19 | task, 20 | model, 21 | predicts_delta, 22 | predicts_rew, 23 | predicts_cost, 24 | ): 25 | """ 26 | Creates a fake environment that emulates common RL env methodology: 27 | Args: 28 | true_environment(`env`): true environment, used for shapes 29 | task(`str`): name of the task, used to locate static fallback functions for r, c, or term 30 | model(`BaseModel`): dynamics model, should inherit from BaseModel and implement the corresponding 31 | methods, 32 | inputs dim should be (obs_dim + act_dim,) 33 | predicts_delta(`bool`): Does the model predict state-changes or absolute next-states? 34 | predicts_rew(`bool`): Does the model predict rewards? 35 | If yes: rewards should be included in outputs after dynamics, 36 | i.e.: dim(outputs) = (..., (next_obs, r)) 37 | predicts_cost(`bool`): Does the model predict costs? 38 | If yes: costs should be included in outputs after dynamics and rewards (if applicable), 39 | i.e.: dim(outputs) = (..., (next_obs, r, c)) 40 | """ 41 | self.env = true_environment 42 | self.obs_dim = np.prod(self.observation_space.shape) 43 | self.act_dim = np.prod(self.action_space.shape) 44 | self._task = task 45 | 46 | self._model = model 47 | self._uses_ensemble = self._model.is_ensemble 48 | self._is_probabilistic = self._model.is_probabilistic 49 | 50 | self._predicts_delta = predicts_delta 51 | self._predicts_rew = predicts_rew 52 | self._predicts_cost = predicts_cost 53 | 54 | #### create fake env from model 55 | self.input_dim = self._model.in_dim 56 | self.output_dim = self._model.out_dim 57 | 58 | @property 59 | def observation_space(self): 60 | return self.env.observation_space 61 | 62 | @property 63 | def action_space(self): 64 | return self.env.action_space 65 | 66 | def step(self, obs, act, deterministic=True): 67 | assert len(obs.shape) == len(act.shape) 68 | assert obs.shape[-1]==self.obs_dim and act.shape[-1]==self.act_dim 69 | 70 | ### check dimensionality of obs 71 | obs_depth = len(obs.shape) 72 | if obs_depth == 1: 73 | obs = obs[None] 74 | act = act[None] 75 | return_single=True 76 | else: 77 | return_single = False 78 | 79 | 80 | ### create model inputs 81 | inputs = np.concatenate((obs, act), axis=-1) 82 | 83 | ### if 3D-inputs, we shuffle so different models predict at every step 84 | if obs_depth==3: 85 | inputs, shuffle_indxs = self.forward_shuffle(inputs) 86 | 87 | ### predict 88 | if self._uses_ensemble: 89 | pred = self._model.predict_ensemble(inputs) #### dyn_vars gives ep. 
vars for 90 | else: 91 | pred = self._model.predict(inputs) 92 | 93 | ### split predictions if probabilistic 94 | if self._is_probabilistic: 95 | pred_mean, pred_var = pred 96 | else: 97 | pred_mean, pred_var = pred, np.zeros_like(pred) 98 | 99 | ### shuffle back 100 | if obs_depth==3: 101 | pred_mean, pred_var = self.inverse_shuffle(pred_mean, shuffle_indxs), self.inverse_shuffle(pred_var, shuffle_indxs) 102 | 103 | #### probabilistic transitions if var is predicted and deterministic is passed 104 | pred_std = np.sqrt(pred_var) 105 | if not deterministic: 106 | next_obs = pred_mean[...,:self.obs_dim] + pred_std[...,:self.obs_dim] 107 | else: 108 | next_obs = pred_mean[...,:self.obs_dim] 109 | 110 | #### extract uncertainty measures 111 | if self._uses_ensemble: 112 | ens_ep_var = np.var(next_obs, axis=0) 113 | ens_dkl_path = np.mean(average_dkl(next_obs, pred_std[...,:self.obs_dim]), axis=-1) ##@anyboby gives ugly numbers if var=0 114 | ens_dkl_mean = np.mean(ens_dkl_path) 115 | else: 116 | ens_ep_var = 0 117 | ens_dkl_path = np.zeros(shape=obs.shape[1]) 118 | ens_dkl_mean = 0 119 | 120 | #### choose one model from ensemble randomly, if ensemble and not 3d inputs 121 | if self._uses_ensemble and obs_depth<3: 122 | _, batch_size, _ = next_obs.shape 123 | model_inds = self.random_inds(batch_size) ## only elites 124 | batch_inds = np.arange(0, batch_size) 125 | next_obs = next_obs[model_inds, batch_inds] 126 | else: 127 | next_obs = next_obs 128 | 129 | #### add to obs if delta predictions 130 | if self._predicts_delta: 131 | next_obs += obs 132 | 133 | #### extract rew, cost, or call fallback functions for terms, rews and costs 134 | if TERMS_BY_TASK.get(self._task, None): 135 | terms = TERMS_BY_TASK[self._task](obs, act, next_obs) 136 | else: 137 | terms = TERMS_BY_TASK['default'](obs, act, next_obs) 138 | 139 | if self._predicts_cost: 140 | c = pred_mean[...,-1:] 141 | c = c[model_inds, batch_inds] 142 | pred_mean = pred_mean[...,:-1] 143 | elif COST_BY_TASK.get(self._task, None): 144 | c = COST_BY_TASK[self._task](obs, act, next_obs) 145 | else: 146 | c = np.zeros_like(terms) 147 | 148 | if self._predicts_rew: 149 | r = pred_mean[...,-1:] 150 | r = r[model_inds, batch_inds] 151 | pred_mean = pred_mean[...,:-1] 152 | elif REWS_BY_TASK.get(self._task, None): 153 | r = REWS_BY_TASK[self._task](obs, act, next_obs) 154 | 155 | assert r is not None, \ 156 | "Please provide either static functions or predictions for rewards, costs and terms" 157 | 158 | if return_single: 159 | next_obs = next_obs[0] 160 | r = r[0] 161 | c = c[0] 162 | terms = terms[0] 163 | 164 | info = { 165 | 'ensemble_dkl_mean' : ens_dkl_mean, 166 | 'ensemble_dkl_path' : ens_dkl_path, 167 | 'ensemble_ep_var' : ens_ep_var, 168 | 'rew':r, 169 | 'cost':c, 170 | } 171 | 172 | return next_obs, r, terms, info 173 | 174 | def random_inds(self, size): 175 | if self._model.is_ensemble: 176 | return np.random.choice(self._model.elite_inds, (size)) 177 | else: 178 | return np.random.choice([0], (size)) 179 | 180 | def forward_shuffle(self, ndarray): 181 | """ 182 | shuffles ndarray forward along axis 0 with random elite indices, 183 | Returns shuffled copy of ndarray and indices with which was shuffled 184 | """ 185 | idxs = np.random.permutation(ndarray.shape[0]) 186 | shuffled = ndarray[idxs] 187 | return shuffled, idxs 188 | 189 | def inverse_shuffle(self, ndarray, idxs): 190 | """ 191 | inverses a shuffle of ndarray forward along axis 0, given the used indices. 
192 | Returns unshuffled copy of ndarray 193 | """ 194 | unshuffled = ndarray[idxs] 195 | return unshuffled 196 | 197 | def close(self): 198 | pass 199 | -------------------------------------------------------------------------------- /models/pens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/models/pens/__init__.py -------------------------------------------------------------------------------- /models/pens/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import pdb 4 | 5 | 6 | 7 | def update_dict(dict_a, dict_b, weight_a=.5, weight_b=.5): 8 | """ 9 | creates new updated dict and adds entries according to weights. 10 | for both weights = 1 the entries are added 11 | """ 12 | dict_a_cp = dict(dict_a) 13 | dict_a_cp.update(dict_b) 14 | for k,v in dict_b.items(): 15 | if k in dict_a.keys(): 16 | dict_a_cp[k] = weight_b*dict_b[k] + weight_a*dict_a[k] 17 | return dict_a_cp 18 | 19 | class Progress: 20 | 21 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100): 22 | self.total = total 23 | self.name = name 24 | self.ncol = ncol 25 | self.max_length = max_length 26 | self.indent = indent 27 | self.line_width = line_width 28 | self._speed_update_freq = speed_update_freq 29 | 30 | self._step = 0 31 | self._prev_line = '\033[F' 32 | self._clear_line = ' ' * self.line_width 33 | 34 | self._pbar_size = self.ncol * self.max_length 35 | self._complete_pbar = '#' * self._pbar_size 36 | self._incomplete_pbar = ' ' * self._pbar_size 37 | 38 | self.lines = [''] 39 | self.fraction = '{} / {}'.format(0, self.total) 40 | 41 | self.resume() 42 | 43 | 44 | def update(self, n=1): 45 | self._step += n 46 | if self._step % self._speed_update_freq == 0: 47 | self._time0 = time.time() 48 | self._step0 = self._step 49 | 50 | def resume(self): 51 | self._skip_lines = 1 52 | print('\n', end='') 53 | self._time0 = time.time() 54 | self._step0 = self._step 55 | 56 | def pause(self): 57 | self._clear() 58 | self._skip_lines = 1 59 | 60 | def set_description(self, params=[]): 61 | 62 | ############ 63 | # Position # 64 | ############ 65 | self._clear() 66 | 67 | ########### 68 | # Percent # 69 | ########### 70 | percent, fraction = self._format_percent(self._step, self.total) 71 | self.fraction = fraction 72 | 73 | ######### 74 | # Speed # 75 | ######### 76 | speed = self._format_speed(self._step) 77 | 78 | ########## 79 | # Params # 80 | ########## 81 | num_params = len(params) 82 | nrow = math.ceil(num_params / self.ncol) 83 | params_split = self._chunk(params, self.ncol) 84 | params_string, lines = self._format(params_split) 85 | self.lines = lines 86 | 87 | 88 | description = '{} | {}{}'.format(percent, speed, params_string) 89 | print(description) 90 | self._skip_lines = nrow + 1 91 | 92 | def append_description(self, descr): 93 | self.lines.append(descr) 94 | 95 | def _clear(self): 96 | position = self._prev_line * self._skip_lines 97 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)]) 98 | print(position, end='') 99 | print(empty) 100 | print(position, end='') 101 | 102 | def _format_percent(self, n, total): 103 | if total: 104 | percent = n / float(total) 105 | 106 | complete_entries = int(percent * self._pbar_size) 107 | incomplete_entries = self._pbar_size - complete_entries 108 
| 109 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries] 110 | fraction = '{} / {}'.format(n, total) 111 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100)) 112 | else: 113 | fraction = '{}'.format(n) 114 | string = '{} iterations'.format(n) 115 | return string, fraction 116 | 117 | def _format_speed(self, n): 118 | num_steps = n - self._step0 119 | t = time.time() - self._time0 120 | speed = num_steps / t 121 | string = '{:.1f} Hz'.format(speed) 122 | if num_steps > 0: 123 | self._speed = string 124 | return string 125 | 126 | def _chunk(self, l, n): 127 | return [l[i:i+n] for i in range(0, len(l), n)] 128 | 129 | def _format(self, chunks): 130 | lines = [self._format_chunk(chunk) for chunk in chunks] 131 | lines.insert(0,'') 132 | padding = '\n' + ' '*self.indent 133 | string = padding.join(lines) 134 | return string, lines 135 | 136 | def _format_chunk(self, chunk): 137 | line = ' | '.join([self._format_param(param) for param in chunk]) 138 | return line 139 | 140 | def _format_param(self, param): 141 | k, v = param 142 | return '{} : {}'.format(k, v)[:self.max_length] 143 | 144 | def stamp(self): 145 | if self.lines != ['']: 146 | params = ' | '.join(self.lines) 147 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed) 148 | self._clear() 149 | print(string, end='\n') 150 | self._skip_lines = 1 151 | else: 152 | self._clear() 153 | self._skip_lines = 0 154 | 155 | def close(self): 156 | self.pause() 157 | 158 | class Silent: 159 | 160 | def __init__(self, *args, **kwargs): 161 | pass 162 | 163 | def __getattr__(self, attr): 164 | return lambda *args: None 165 | 166 | 167 | if __name__ == '__main__': 168 | silent = Silent() 169 | silent.update() 170 | silent.stamp() 171 | 172 | num_steps = 1000 173 | progress = Progress(num_steps) 174 | for i in range(num_steps): 175 | progress.update() 176 | params = [ 177 | ['A', '{:06d}'.format(i)], 178 | ['B', '{:06d}'.format(i)], 179 | ['C', '{:06d}'.format(i)], 180 | ['D', '{:06d}'.format(i)], 181 | ['E', '{:06d}'.format(i)], 182 | ['F', '{:06d}'.format(i)], 183 | ['G', '{:06d}'.format(i)], 184 | ['H', '{:06d}'.format(i)], 185 | ] 186 | progress.set_description(params) 187 | time.sleep(0.01) 188 | progress.close() 189 | -------------------------------------------------------------------------------- /models/pens/pe_factory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ma as ma 3 | import tensorflow as tf 4 | 5 | import copy 6 | from .fc import FC 7 | from .pe import PE 8 | 9 | def build_PE(in_dim, 10 | out_dim, 11 | name='BNN', 12 | hidden_dims=(200, 200, 200), 13 | num_networks=7, 14 | num_elites=5, 15 | loss = 'MSPE', 16 | activation = 'swish', 17 | output_activation = None, 18 | decay=1e-4, 19 | lr = 1e-3, 20 | lr_decay = None, 21 | decay_steps=None, 22 | use_scaler_in = False, 23 | use_scaler_out = False, 24 | clip_loss = False, 25 | kl_cliprange = 0.1, 26 | max_logvar = .5, 27 | min_logvar = -6, 28 | session=None): 29 | """ 30 | Constructs a tf probabilistic ensemble model. 31 | Args: 32 | loss: Choose from 'MSPE', 'NLL', 'MSE', 'Huber', or 'CE'. 
33 | choosing MSPE or NLL will construct a model with variance output 34 | """ 35 | print('[PE] dim in / out: {} / {} | Hidden dim: {}'.format(in_dim, out_dim, hidden_dims)) 36 | #print('[ BNN ] Input Layer dim: {} | Output Layer dim: {} '.format(obs_dim_in+act_dim+prior_dim, obs_dim_out+rew_dim)) 37 | params = {'name': name, 38 | 'loss':loss, 39 | 'num_networks': num_networks, 40 | 'num_elites': num_elites, 41 | 'sess': session, 42 | 'use_scaler_in': use_scaler_in, 43 | 'use_scaler_out': use_scaler_out, 44 | 'clip_loss': clip_loss, 45 | 'kl_cliprange':kl_cliprange, 46 | 'max_logvar':max_logvar, 47 | 'min_logvar':min_logvar, 48 | } 49 | model = PE(params) 50 | model.add(FC(hidden_dims[0], input_dim=in_dim, activation=activation, weight_decay=decay/4)) # def dec: 0.000025)) 51 | 52 | for hidden_dim in hidden_dims[1:]: 53 | model.add(FC(hidden_dim, activation=activation, weight_decay=decay/2)) # def dec: 0.00005)) 54 | 55 | model.add(FC(out_dim, activation=output_activation, weight_decay=decay)) # def dec: 0.0001 56 | 57 | opt_params = {"learning_rate":lr} if lr_decay is None else {"learning_rate":lr, 58 | "learning_rate_decay":lr_decay, 59 | "decay_steps":decay_steps} 60 | model.finalize(tf.train.AdamOptimizer, opt_params, lr_decay=lr_decay) 61 | 62 | total_parameters = 0 63 | for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name): 64 | # shape is an array of tf.Dimension 65 | shape = variable.get_shape() 66 | variable_parameters = 1 67 | for dim in shape: 68 | variable_parameters *= dim.value 69 | total_parameters += variable_parameters 70 | print('[ Probabilistic Ensemble ] Total trainable Parameteres: {} '.format(total_parameters)) 71 | 72 | return model 73 | 74 | def format_samples_for_dyn(samples, append_r=True, append_c=False, noise=None): 75 | """ 76 | formats samples to fit training, specifically returns: 77 | inputs, outputs: 78 | 79 | inputs = np.concatenate((observations, act, priors), axis=-1) 80 | outputs = np.concatenate(delta_observations, rewards ,costs), axis=-1) 81 | 82 | where rewards and costs are optional 83 | """ 84 | obs = samples['observations'] 85 | act = samples['actions'] 86 | next_obs = samples['next_observations'] 87 | terms = np.squeeze(samples['terminals'])[..., None] 88 | 89 | delta_obs = next_obs - obs 90 | 91 | #### ----END preprocess samples for model training in safety gym -----#### 92 | inputs = np.concatenate((obs, act), axis=-1) 93 | 94 | outputs = delta_obs 95 | 96 | if append_r: 97 | rew = np.squeeze(samples['rewards'])[..., None] 98 | outputs = np.concatenate((outputs, rew), axis=-1) 99 | 100 | if append_c: 101 | costs = np.squeeze(samples['costs'])[..., None] 102 | outputs = np.concatenate((outputs, costs), axis=-1) 103 | 104 | # add noise 105 | if noise: 106 | inputs = _add_noise(inputs, noise) ### noise helps (sometimes) 107 | 108 | return inputs, outputs 109 | 110 | 111 | ### @anyboby, try to include this in the model rather than separately 112 | def format_samples_for_cost(samples, oversampling=False, one_hot = True, num_classes=2, noise=None): 113 | """ 114 | formats samples to fit training for cost, specifically returns: 115 | (obs, act, next_obs) 116 | 117 | Args: 118 | one_hot: determines whether targets are structured as classification or regression 119 | one_hot: True will output targets with shape [batch_size, num_classes] 120 | one_hot: False wil output targets with shape [batch_size,] and scalar targets 121 | """ 122 | next_obs = samples['next_observations'] 123 | obs = samples['observations'] 124 | cost = 
samples['costs'] 125 | act = samples['actions'] 126 | 127 | if one_hot: 128 | cost_one_hot = np.zeros(shape=(len(cost), num_classes)) 129 | batch_indcs = np.arange(0, len(cost)) 130 | costs = cost.astype(int) 131 | cost_one_hot[(batch_indcs, costs)] = 1 132 | outputs = cost_one_hot 133 | else: 134 | outputs = cost[:, None] 135 | 136 | inputs = np.concatenate((obs, act, next_obs), axis=-1) 137 | ## ________________________________ ## 138 | ## oversample cost classes ## 139 | ## ________________________________ ## 140 | if oversampling: 141 | if len(outputs[np.where(costs>0)[0]])>0: 142 | imbalance_ratio = len(outputs[np.where(costs==0)[0]])//len(outputs[np.where(costs>0)[0]]) 143 | extra_outputs = np.tile(outputs[np.where(costs>0)[0]], (1+imbalance_ratio//3,1)) ## don't need to overdo it 144 | outputs = np.concatenate((outputs, extra_outputs), axis=0) 145 | extra_inputs = np.tile(inputs[np.where(costs>0)[0]], (1+imbalance_ratio//3,1)) 146 | extra_inputs = _add_noise(extra_inputs, 0.0001) 147 | inputs = np.concatenate((inputs, extra_inputs), axis=0) 148 | 149 | ### ______ add noise _____ ### 150 | if noise: 151 | inputs = _add_noise(inputs, noise) ### noise helps 152 | 153 | return inputs, outputs 154 | 155 | def _add_noise(data_inp, noiseToSignal): 156 | data= copy.deepcopy(data_inp) 157 | mean_data = np.mean(data, axis = 0) 158 | std_of_noise = mean_data*noiseToSignal 159 | for j in range(mean_data.shape[0]): 160 | if(std_of_noise[j]>0): 161 | data[:,j] = np.copy(data[:,j]+np.random.normal(0, np.absolute(std_of_noise[j]), (data.shape[0],))) 162 | return data 163 | 164 | def reset_model(model): 165 | model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name) 166 | model.sess.run(tf.initialize_vars(model_vars)) 167 | -------------------------------------------------------------------------------- /models/pens/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | EPS = 1e-10 8 | 9 | def get_required_argument(dotmap, key, message, default=None): 10 | val = dotmap.get(key, default) 11 | if val is default: 12 | raise ValueError(message) 13 | return val 14 | 15 | def gaussian_kl_np(mu0, log_std0, mu1, log_std1): 16 | """interprets each entry in mu_i and log_std_i as independent, 17 | preserves shape 18 | output clipped to {0, 1e10} 19 | """ 20 | var0, var1 = np.exp(2 * log_std0), np.exp(2 * log_std1) 21 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1+EPS) - 1) + log_std1 - log_std0 22 | all_kls = pre_sum 23 | #all_kls = np.mean(all_kls) 24 | all_kls = np.clip(all_kls, 0, 1/EPS) ### for stability 25 | return all_kls 26 | 27 | def gaussian_jsd_np(mu0, log_std0, mu1, log_std1): 28 | pass 29 | 30 | def average_dkl(mu, std): 31 | """ 32 | Calculates the average kullback leiber divergences of multiple univariate gaussian distributions. 33 | 34 | K(P1,…Pk) = 1/(k(k−1)) ∑_[k_(i,j)=1] DKL(Pi||Pj) 35 | 36 | (Andrea Sgarro, Informational divergence and the dissimilarity of probability distributions.) 37 | 38 | expects the distributions along axis 0, and samples along axis 1. 
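    e.g. mu and std of shape (num_models, batch_size, dim) yield an output of shape
    (batch_size, dim).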
39 | Output is reduced by axis 0 40 | 41 | Args: 42 | mu: array-like means 43 | std: array-like stds 44 | """ 45 | ## clip log 46 | log_std = np.log(std) 47 | log_std = np.clip(log_std, -100, 1e8) 48 | assert len(mu.shape)>=2 and len(log_std.shape)>=2 49 | num_models = len(mu) 50 | d_kl = None 51 | for i in range(num_models): 52 | for j in range(num_models): 53 | if d_kl is None: 54 | d_kl = gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j]) 55 | else: d_kl+= gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j]) 56 | d_kl = d_kl/(num_models*(num_models-1)+EPS) 57 | return d_kl 58 | 59 | def median_dkl(mu, std): 60 | """ 61 | Calculates the median kullback leiber divergences of multiple univariate gaussian distributions. 62 | 63 | K(P1,…Pk) = 1/(k(k−1)) ∑_[k_(i,j)=1] DKL(Pi||Pj) 64 | 65 | (Andrea Sgarro, Informational divergence and the dissimilarity of probability distributions.) 66 | 67 | expects the distributions along axis 0, and samples along axis 1. 68 | Output is reduced by axis 0 69 | 70 | Args: 71 | mu: array-like means 72 | std: array-like stds 73 | """ 74 | ## clip log 75 | log_std = np.log(std) 76 | log_std = np.clip(log_std, -100, 1e8) 77 | assert len(mu.shape)>=2 and len(log_std.shape)>=2 78 | num_models = len(mu) 79 | d_kl = np.zeros(shape=(num_models*(num_models-1),) + mu.shape[1:]) 80 | n = 0 81 | for i in range(num_models): 82 | for j in range(num_models): 83 | if i != j: 84 | d_kl[n] = gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j]) 85 | n += 1 86 | d_kl_med = np.median(d_kl, axis=0) 87 | return d_kl_med 88 | 89 | 90 | class TensorStandardScaler: 91 | """Helper class for automatically normalizing inputs into the network. 92 | """ 93 | def __init__(self, x_dim, name='Scaler'): 94 | """Initializes a scaler. 95 | 96 | Arguments: 97 | x_dim (int): The dimensionality of the inputs into the scaler. 98 | 99 | Returns: None. 100 | """ 101 | self.fitted = False 102 | with tf.variable_scope(name): 103 | self.count = tf.get_variable( 104 | name=name+'_count', shape=(), initializer=tf.constant_initializer(0), 105 | trainable=False 106 | ) 107 | 108 | self.mu = tf.get_variable( 109 | name=name+'_mu', shape=[1, x_dim], initializer=tf.constant_initializer(0.0), 110 | trainable=False 111 | ) 112 | self.var = tf.get_variable( 113 | name=name+'_std', shape=[1, x_dim], initializer=tf.constant_initializer(1.0), 114 | trainable=False 115 | ) 116 | 117 | self.cached_count, self.cached_mu, self.cached_var = 0, np.zeros([1, x_dim]), np.ones([1, x_dim]) 118 | 119 | def fit(self, data): 120 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 121 | another for assigning the standard deviation of the data to the internal standard deviation. 122 | This function must be called within a 'with .as_default()' block. 123 | 124 | Arguments: 125 | data (np.ndarray): A numpy array containing the input 126 | 127 | Returns: None. 128 | """ 129 | batch_count = data.shape[0] 130 | batch_mu = np.mean(data, axis=0, keepdims=True) 131 | batch_var = np.var(data, axis=0, keepdims=True) 132 | new_mean, new_var, new_count = self.running_mean_var_from_batch(batch_mu, batch_var, batch_count) 133 | #sigma[sigma < 1e-8] = 1.0 134 | self.mu.load(new_mean) 135 | self.var.load(new_var) 136 | self.count.load(new_count) 137 | self.fitted = True 138 | self.cache() 139 | 140 | def transform(self, data): 141 | """Transforms the input matrix data using the parameters of this scaler. 
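        Computes (data - mu) / max(sqrt(var), 1e-2) using the running statistics
        fitted so far.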
142 | 143 | can be adjusted to scale with a factor, to control sensitivity to ood data: 144 | d = (d-mu)/sigma = d + (d-mu)/sigma - d = d + (d(1-sigma)-mu)/sigma 145 | and the version with scaling factor thus becomes 146 | d = d + sc_factor*(d(1-sigma)-mu)/sigma 147 | 148 | Arguments: 149 | data (np.array): A numpy array containing the points to be transformed. 150 | sc_factor: Factor to what degree the original dataset is transformed 151 | 152 | Returns: (np.array) The transformed dataset. 153 | 154 | 155 | """ 156 | scaled_transform = (data-self.mu)/(tf.maximum(tf.sqrt(self.var), 1e-2)) 157 | return scaled_transform 158 | 159 | def inverse_transform(self, data): 160 | """Undoes the transformation performed by this scaler. 161 | 162 | Arguments: 163 | data (np.array): A numpy array containing the points to be transformed. 164 | 165 | Returns: (np.array) The transformed dataset. 166 | """ 167 | return (tf.maximum(tf.sqrt(self.var), 1e-2)) * data + self.mu 168 | 169 | def inverse_transform_var(self, data): 170 | """Undoes the transformation performed by this scaler for variances. 171 | 172 | Arguments: 173 | data (np.array): A numpy array containing the points to be transformed. 174 | 175 | Returns: (np.array) The transformed dataset. 176 | """ 177 | return tf.square(tf.maximum(tf.sqrt(self.var), 1e-2)) * data 178 | 179 | def inverse_transform_logvar(self, data): 180 | """Undoes the transformation performed by this scaler for variances. 181 | 182 | Arguments: 183 | data (np.array): A numpy array containing the points to be transformed. 184 | 185 | Returns: (np.array) The transformed dataset. 186 | """ 187 | return 2*tf.log(tf.maximum(tf.sqrt(self.var), 1e-2)) + data 188 | 189 | def get_vars(self): 190 | """Returns a list of variables managed by this object. 191 | 192 | Returns: (list) The list of variables. 193 | """ 194 | return [self.mu, self.var] 195 | 196 | def get_mu(self): 197 | return self.mu 198 | 199 | def get_var(self): 200 | return self.var 201 | 202 | def cache(self): 203 | """Caches current values of this scaler. 204 | 205 | Returns: None. 206 | """ 207 | self.cached_mu = self.mu.eval() 208 | self.cached_var = self.var.eval() 209 | self.cached_count = self.count.eval() 210 | 211 | def load_cache(self): 212 | """Loads values from the cache 213 | 214 | Returns: None. 
215 | """ 216 | self.mu.load(self.cached_mu) 217 | self.var.load(self.cached_var) 218 | self.count.load(self.cached_count) 219 | 220 | def running_mean_var_from_batch(self, batch_mean, batch_var, batch_count): 221 | delta = batch_mean - self.cached_mu 222 | tot_count = self.cached_count + batch_count 223 | 224 | new_mean = self.cached_mu + delta * batch_count / tot_count 225 | m_a = self.cached_var * self.cached_count 226 | m_b = batch_var * batch_count 227 | M2 = m_a + m_b + np.square(delta) * self.cached_count * batch_count / tot_count 228 | new_var = M2 / tot_count 229 | new_count = tot_count 230 | 231 | return new_mean, new_var, new_count 232 | -------------------------------------------------------------------------------- /models/statics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def no_done(obs, act, next_obs): 4 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 5 | 6 | done = np.zeros(shape=obs.shape[:-1], dtype=np.bool) #np.array([False]).repeat(len(obs)) 7 | done = done[...,None] 8 | return done 9 | 10 | def hcs_cost_f(obs, act, next_obs): 11 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 12 | 13 | xdist = next_obs[...,-1]*10 14 | obj_cost = np.array((np.abs(xdist)<2.0), dtype=np.float32)[..., None] 15 | return obj_cost 16 | 17 | def antsafe_term_fn(obs, act, next_obs): 18 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 19 | 20 | z = next_obs[..., 0] 21 | body_quat = next_obs[...,1:5] 22 | z_rot = 1-2*(body_quat[...,1]**2+body_quat[...,2]**2) 23 | 24 | notdone = np.isfinite(next_obs).all(axis=-1) \ 25 | * (z >= 0.2) \ 26 | * (z <= 1.0) \ 27 | * z_rot >= -0.7 28 | 29 | done = ~notdone 30 | done = done[...,None] 31 | return done 32 | 33 | def antsafe_c_fn(obs, act, next_obs): 34 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 35 | 36 | z = next_obs[..., 0] 37 | body_quat = next_obs[...,1:5] 38 | z_rot = 1-2*(body_quat[...,1]**2+body_quat[...,2]**2) 39 | y_dist = next_obs[..., -1:] 40 | 41 | obj_cost = np.any(abs(y_dist)>3.2, axis=-1)[...,None]*1.0 42 | 43 | notdone = np.isfinite(next_obs).all(axis=-1) \ 44 | * (z >= 0.2) \ 45 | * (z <= 1.0) \ 46 | * z_rot >= -0.7 47 | 48 | done = ~notdone 49 | done = done[...,None] 50 | 51 | done_cost = done*1.0 52 | cost = np.clip(done_cost+obj_cost, 0, 1) 53 | return cost 54 | 55 | 56 | TERMS_BY_TASK = { 57 | 'default':no_done, 58 | 'HalfCheetah-v2':no_done, 59 | 'HalfCheetahSafe-v2':no_done, 60 | 'AntSafe-v2':antsafe_term_fn, 61 | } 62 | 63 | REWS_BY_TASK = { 64 | 65 | } 66 | 67 | COST_BY_TASK = { 68 | 'HalfCheetahSafe-v2':hcs_cost_f, 69 | 'AntSafe-v2':antsafe_c_fn, 70 | } -------------------------------------------------------------------------------- /network/ac_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Box, Discrete 4 | from utilities.utils import combined_shape, EPS 5 | 6 | """ 7 | Network utils 8 | """ 9 | 10 | def placeholder(dim=None): 11 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 12 | 13 | def placeholders(*args): 14 | return [placeholder(dim) for dim in args] 15 | 16 | def placeholder_from_space(space): 17 | if isinstance(space, Box): 18 | return placeholder(space.shape) 19 | elif isinstance(space, Discrete): 20 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 21 | raise NotImplementedError('bad space {}'.format(space)) 22 | 23 | def 
placeholders_from_spaces(*args): 24 | return [placeholder_from_space(space) for space in args] 25 | 26 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, ensemble_size = 1): 27 | for h in hidden_sizes[:-1]: 28 | if ensemble_size==1: 29 | x = tf.layers.dense(x, units=h, activation=activation) 30 | else: 31 | x = tf.layers.dense(x, units=(ensemble_size,)+(h,), activation=activation) 32 | x = tf.transpose(x) 33 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 34 | 35 | def get_vars(scope=''): 36 | return [x for x in tf.trainable_variables() if '/'+scope+'/' in x.name] 37 | 38 | def count_vars(scope=''): 39 | v = get_vars(scope) 40 | return sum([np.prod(var.shape.as_list()) for var in v]) 41 | 42 | """ 43 | Gaussian distributions 44 | """ 45 | 46 | def gaussian_likelihood(x, mu, log_std): 47 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 48 | return tf.reduce_sum(pre_sum, axis=1) 49 | 50 | def gaussian_kl(mu0, log_std0, mu1, log_std1): 51 | """Returns average kl divergence between two batches of dists""" 52 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) 53 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0 54 | all_kls = tf.reduce_sum(pre_sum, axis=1) 55 | return tf.reduce_mean(all_kls) 56 | 57 | def gaussian_entropy(log_std): 58 | """Returns average entropy over a batch of dists""" 59 | pre_sum = log_std + 0.5 * np.log(2*np.pi*np.e) 60 | all_ents = tf.reduce_sum(pre_sum, axis=-1) 61 | return tf.reduce_mean(all_ents) 62 | 63 | """ 64 | Categorical distributions 65 | """ 66 | 67 | def categorical_kl(logp0, logp1): 68 | """Returns average kl divergence between two batches of dists""" 69 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) 70 | return tf.reduce_mean(all_kls) 71 | 72 | def categorical_entropy(logp): 73 | """Returns average entropy over a batch of dists""" 74 | all_ents = -tf.reduce_sum(logp * tf.exp(logp), axis=1) 75 | return tf.reduce_mean(all_ents) 76 | 77 | 78 | """ 79 | Policies 80 | """ 81 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 82 | act_dim = action_space.n 83 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 84 | logp_all = tf.nn.log_softmax(logits) 85 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 86 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 87 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 88 | 89 | old_logp_all = placeholder(act_dim) 90 | d_kl = categorical_kl(logp_all, old_logp_all) 91 | ent = categorical_entropy(logp_all) 92 | 93 | pi_info = {'logp_all': logp_all} 94 | pi_info_phs = {'logp_all': old_logp_all} 95 | 96 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent 97 | 98 | 99 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 100 | act_dim = a.shape.as_list()[-1] 101 | 102 | 103 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 104 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 105 | ### @anyboby testing: higher starting std, ppo1 uses log_std=0 at the beginning 106 | # log_std = tf.get_variable(name='log_std', shape=act_dim ,initializer=tf.zeros_initializer(), dtype=tf.float32) 107 | std = tf.exp(log_std) 108 | 109 | pi = mu + tf.random_normal(tf.shape(mu)) * std 110 | logp = gaussian_likelihood(a, mu, log_std) 111 | logp_pi = gaussian_likelihood(pi, mu, 
log_std)
112 | 
113 |     old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim)
114 |     d_kl = gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph)
115 |     ent = gaussian_entropy(log_std)
116 | 
117 |     # broadcast log_std to the batch dimension for pi_info,
118 |     # even though log_std itself does not depend on the input
119 |     log_std_info = tf.tensordot(tf.ones(tf.shape(x)[0]), log_std, axes=0)
120 |     pi_info = {'mu': mu, 'log_std': log_std_info}
121 |     pi_info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph}
122 | 
123 |     return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent
124 | 
125 | 
126 | LOG_STD_MAX = 2
127 | LOG_STD_MIN = -20
128 | 
129 | def mlp_squashed_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
130 |     """
131 |     Experimental code for squashed gaussian policies, not yet tested
132 |     """
133 |     act_dim = a.shape.as_list()[-1]
134 |     net = mlp(x, list(hidden_sizes), activation, activation)
135 |     mu = tf.layers.dense(net, act_dim, activation=output_activation)
136 |     log_std = tf.layers.dense(net, act_dim, activation=None)
137 |     log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
138 | 
139 |     std = tf.exp(log_std)
140 |     u = mu + tf.random_normal(tf.shape(mu)) * std
141 |     pi = tf.tanh(u)
142 | 
143 |     old_mu_ph, old_log_std_ph, u_ph = placeholders(act_dim, act_dim, act_dim)
144 |     d_kl = gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) # kl is invariant to squashing transform
145 | 
146 |     def apply_squashing_func(log_prob, raw_action):
147 |         # change-of-variables correction for a = tanh(u):
148 |         # log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2), evaluated at the pre-squash action u
149 |         log_prob -= tf.reduce_sum(2*(np.log(2) - raw_action - tf.nn.softplus(-2*raw_action)), axis=1)
150 |         return log_prob
151 | 
152 |     # Base log probs
153 |     logp = gaussian_likelihood(u_ph, mu, log_std)
154 |     logp_pi = gaussian_likelihood(u, mu, log_std)
155 | 
156 |     # Squashed log probs
157 |     logp = apply_squashing_func(logp, u_ph)
158 |     logp_pi = apply_squashing_func(logp_pi, u)
159 | 
160 |     # Approximate entropy
161 |     ent = -tf.reduce_mean(logp_pi) # Monte-Carlo estimate; the squashed Gaussian has no closed-form entropy
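    # The squashing correction above relies on the identity
    #   log(1 - tanh(u)^2) = 2*(log(2) - u - softplus(-2*u)),
    # which avoids evaluating log(1 - tanh(u)^2) directly and stays finite for large |u|.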
162 | 163 | pi_info = {'mu': mu, 'log_std': log_std, 'raw_action': u} 164 | pi_info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph, 'raw_action': u_ph} 165 | 166 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent 167 | 168 | 169 | 170 | """ 171 | Actor-Critics 172 | """ 173 | def mlp_actor_critic(x, a, hidden_sizes_a=(64,64), hidden_sizes_c=(64,64), critic_ensemble_size=1, activation=tf.tanh, 174 | output_activation=None, policy=None, action_space=None): 175 | 176 | # default policy builder depends on action space 177 | if policy is None and isinstance(action_space, Box): 178 | policy = mlp_gaussian_policy 179 | elif policy is None and isinstance(action_space, Discrete): 180 | policy = mlp_categorical_policy 181 | 182 | with tf.variable_scope('pi'): 183 | policy_outs = policy(x, a, hidden_sizes_a, activation, output_activation, action_space) 184 | pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent = policy_outs 185 | 186 | with tf.variable_scope('vf'): 187 | v = tf.squeeze(mlp(x, list(hidden_sizes_c)+[1], activation, None, ensemble_size=critic_ensemble_size)) 188 | 189 | with tf.variable_scope('vc'): 190 | vc = tf.squeeze(mlp(x, list(hidden_sizes_c)+[1], activation, None, ensemble_size=critic_ensemble_size)) 191 | 192 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc 193 | 194 | def mlp_actor(x, a, hidden_sizes=(64,64), activation=tf.tanh, 195 | output_activation=None, policy=None, action_space=None): 196 | 197 | # default policy builder depends on action space 198 | if policy is None and isinstance(action_space, Box): 199 | policy = mlp_gaussian_policy 200 | elif policy is None and isinstance(action_space, Discrete): 201 | policy = mlp_categorical_policy 202 | 203 | with tf.variable_scope('pi'): 204 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space) 205 | pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent = policy_outs 206 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent 207 | 208 | def mlp_critic (x, hidden_sizes=(64,64), activation=tf.tanh, 209 | output_activation=None, policy=None, action_space=None, name='V'): 210 | with tf.variable_scope(name+'f'): 211 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None)) 212 | 213 | with tf.variable_scope(name+'c'): 214 | vc = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None)) 215 | 216 | return v, vc 217 | -------------------------------------------------------------------------------- /policies/base_policy.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from collections import OrderedDict 3 | 4 | import numpy as np 5 | 6 | class BasePolicy: 7 | def __init__(self): 8 | self._deterministic = False 9 | 10 | def reset(self): 11 | """Reset and clean the policy.""" 12 | raise NotImplementedError 13 | 14 | def actions(self, conditions): 15 | """Compute (symbolic) actions given conditions (observations)""" 16 | raise NotImplementedError 17 | 18 | def log_pis(self, conditions, actions): 19 | """Compute (symbolic) log probs for given observations and actions.""" 20 | raise NotImplementedError 21 | 22 | def actions_np(self, conditions): 23 | """Compute (numeric) actions given conditions (observations)""" 24 | raise NotImplementedError 25 | 26 | def log_pis_np(self, conditions, actions): 27 | """Compute (numeric) log probs for given observations and actions.""" 28 | raise NotImplementedError 29 | 30 | def get_diagnostics(self, conditions): 31 | """Return diagnostic information of 
the policy. 32 | 33 | Arguments: 34 | conditions: Observations to run the diagnostics for. 35 | Returns: 36 | diagnostics: OrderedDict of diagnostic information. 37 | """ 38 | diagnostics = OrderedDict({}) 39 | return diagnostics -------------------------------------------------------------------------------- /policies/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | def get_cpo_policy(env, session, *args, **kwargs): 4 | from policies.cpo_policy import CPOPolicy 5 | policy = CPOPolicy( 6 | obs_space=env.observation_space, 7 | act_space=env.action_space, 8 | session=session, 9 | *args, 10 | **kwargs) 11 | return policy 12 | 13 | POLICY_FUNCTIONS = { 14 | 'cpopolicy': get_cpo_policy, 15 | } 16 | 17 | 18 | def get_policy(policy_type, *args, **kwargs): 19 | return POLICY_FUNCTIONS[policy_type](*args, **kwargs) 20 | 21 | def get_policy_from_params(params, env, *args, **kwargs): 22 | policy_params = params['policy_params'] 23 | policy_type = policy_params['type'] 24 | policy_kwargs = deepcopy(policy_params['kwargs']) 25 | 26 | policy = POLICY_FUNCTIONS[policy_type]( 27 | env, 28 | *args, 29 | **policy_kwargs, 30 | **kwargs) 31 | 32 | return policy 33 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | conda-env-export==0.3.2 2 | dotmap==1.3.8 3 | gtimer==1.0.0b5 4 | gym==0.18.0 5 | joblib==0.14.1 6 | mkl-fft==1.2.0 7 | mkl-random==1.1.0 8 | mkl-service==2.3.0 9 | mujoco-py==2.0.2.13 10 | olefile==0.46 11 | pyOpenSSL==19.1.0 12 | PySocks==1.7.1 13 | PyYAML==5.4.1 14 | ray==0.6.4 15 | sip==4.19.24 16 | tensorflow==1.14.0 17 | tornado==6.0.4 18 | -------------------------------------------------------------------------------- /samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/samplers/__init__.py -------------------------------------------------------------------------------- /samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import deque, OrderedDict 2 | from itertools import islice 3 | 4 | 5 | class BaseSampler(object): 6 | def __init__(self, 7 | max_path_length, 8 | min_pool_size, 9 | batch_size, 10 | store_last_n_paths=10, 11 | preprocess_type='default'): 12 | self._max_path_length = max_path_length 13 | self._min_pool_size = min_pool_size 14 | self._batch_size = batch_size 15 | self._store_last_n_paths = store_last_n_paths 16 | self._last_n_paths = deque(maxlen=store_last_n_paths) 17 | self._obs_process_type = preprocess_type 18 | self.env = None 19 | self.policy = None 20 | self.pool = None 21 | 22 | def initialize(self, env, policy, pool): 23 | self.env = env 24 | self.policy = policy 25 | self.pool = pool 26 | 27 | def set_policy(self, policy): 28 | self.policy = policy 29 | 30 | def clear_last_n_paths(self): 31 | self._last_n_paths.clear() 32 | 33 | def get_last_n_paths(self, n=None): 34 | if n is None: 35 | n = self._store_last_n_paths 36 | 37 | last_n_paths = tuple(islice(self._last_n_paths, None, n)) 38 | 39 | return last_n_paths 40 | 41 | def sample(self): 42 | raise NotImplementedError 43 | 44 | def batch_ready(self): 45 | enough_samples = self.pool.size >= self._min_pool_size 46 | return enough_samples 47 | 48 | def 
random_batch(self, batch_size=None, **kwargs): 49 | batch_size = batch_size or self._batch_size 50 | return self.pool.random_batch(batch_size, **kwargs) 51 | 52 | def terminate(self): 53 | self.env.close() 54 | 55 | def get_diagnostics(self): 56 | diagnostics = OrderedDict({'pool-size': self.pool.size}) 57 | return diagnostics 58 | 59 | def __getstate__(self): 60 | state = { 61 | key: value for key, value in self.__dict__.items() 62 | if key not in ('env', 'policy', 'pool') 63 | } 64 | 65 | return state 66 | 67 | def __setstate__(self, state): 68 | self.__dict__.update(state) 69 | 70 | self.env = None 71 | self.policy = None 72 | self.pool = None 73 | -------------------------------------------------------------------------------- /samplers/simple_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | from samplers.base_sampler import BaseSampler 9 | 10 | class SimpleSampler(BaseSampler): 11 | def __init__(self, **kwargs): 12 | super(SimpleSampler, self).__init__(**kwargs) 13 | 14 | self._path_length = 0 15 | self._path_return = 0 16 | self._current_path = defaultdict(list) 17 | self._last_path_return = 0 18 | self._max_path_return = -np.inf 19 | self._n_episodes = 0 20 | self._current_observation = None 21 | self._total_samples = 0 22 | self._last_action = None 23 | 24 | def _process_observations(self, 25 | observation, 26 | action, 27 | reward, 28 | cost, 29 | terminal, 30 | next_observation, 31 | info): 32 | 33 | processed_observation = { 34 | 'observations': observation, 35 | 'actions': action, 36 | 'rewards': [reward], 37 | 'cost' : [cost], 38 | 'terminals': [terminal], 39 | 'next_observations': next_observation, 40 | 'infos': info, 41 | } 42 | 43 | return processed_observation 44 | 45 | def sample(self): 46 | if self._current_observation is None: 47 | self._current_observation = np.squeeze(self.env.reset()) 48 | self._last_action = np.zeros(shape=self.env.action_space.shape) 49 | 50 | action = self.policy.actions_np( 51 | self.env.convert_to_active_observation( 52 | self._current_observation)[None] 53 | )[0] 54 | 55 | next_observation, reward, terminal, info = self.env.step(action) 56 | next_observation = np.squeeze(next_observation) 57 | reward = np.squeeze(reward) 58 | terminal = np.squeeze(terminal) 59 | cost = info.get('cost', 0) 60 | 61 | self._path_length += 1 62 | self._path_return += reward 63 | self._total_samples += 1 64 | 65 | processed_sample = self._process_observations( 66 | observation=self._current_observation, 67 | action=action, 68 | reward=reward, 69 | cost=cost, 70 | terminal=terminal, 71 | next_observation=next_observation, 72 | info=info, 73 | ) 74 | 75 | for key, value in processed_sample.items(): 76 | self._current_path[key].append(value) 77 | 78 | #### add to pool only after full epoch or terminal path 79 | if terminal or self._path_length >= self._max_path_length: 80 | last_path = { 81 | field_name: np.array(values) 82 | for field_name, values in self._current_path.items() 83 | } 84 | 85 | self.pool.add_path(last_path) 86 | self._last_n_paths.appendleft(last_path) 87 | 88 | self._max_path_return = max(self._max_path_return, 89 | self._path_return) 90 | self._last_path_return = self._path_return 91 | 92 | self.policy.reset() 93 | self._current_observation = None 94 | self._path_length = 0 95 | self._path_return = 0 96 | self._current_path = defaultdict(list) 97 | self._last_action = 
np.zeros(shape=self.env.action_space.shape) 98 | self._n_episodes += 1 99 | else: 100 | self._current_observation = next_observation 101 | self._last_action = action 102 | 103 | return next_observation, reward, terminal, info 104 | 105 | def random_batch(self, batch_size=None, **kwargs): 106 | batch_size = batch_size or self._batch_size 107 | observation_keys = getattr(self.env, 'observation_keys', None) 108 | 109 | return self.pool.random_batch( 110 | batch_size, observation_keys=observation_keys, **kwargs) 111 | 112 | def get_diagnostics(self): 113 | diagnostics = super(SimpleSampler, self).get_diagnostics() 114 | diagnostics.update({ 115 | 'max-path-return': self._max_path_return, 116 | 'last-path-return': self._last_path_return, 117 | 'episodes': self._n_episodes, 118 | 'total-samples': self._total_samples, 119 | }) 120 | 121 | return diagnostics 122 | 123 | -------------------------------------------------------------------------------- /samplers/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | def get_cposampler(*args, **kwargs): 6 | from samplers.cpo_sampler import CpoSampler 7 | sampler = CpoSampler( 8 | *args, 9 | **kwargs) 10 | 11 | return sampler 12 | 13 | SAMPLERS_FUNCTIONS = { 14 | 'CPOSampler' : get_cposampler, 15 | } 16 | 17 | 18 | def get_sampler_from_params(params, *args, **kwargs): 19 | 20 | sampler_params = params['sampler_params'] 21 | sampler_type = sampler_params['type'] 22 | 23 | sampler_args = deepcopy(sampler_params.get('args', ())) 24 | sampler_kwargs = deepcopy(sampler_params.get('kwargs', {})) 25 | 26 | sampler = SAMPLERS_FUNCTIONS[sampler_type]( 27 | *sampler_args, *args, **sampler_kwargs, **kwargs) 28 | 29 | return sampler -------------------------------------------------------------------------------- /scripts/console_scripts.py: -------------------------------------------------------------------------------- 1 | """A command line interface that exposes softlearning examples to user. 2 | 3 | This package exposes the functions in examples.instrument module to the user 4 | through a cli, which allows seamless runs of examples in different modes (e.g. 5 | locally, in google compute engine, or ec2). 6 | 7 | 8 | There are two types of cli commands in this file (each have their corresponding 9 | function in examples.instrument): 10 | 1. run_example_* methods, which run the experiments by invoking 11 | `tune.run_experiments` function. 12 | 2. launch_example_* methods, which are helpers function to submit an 13 | example to be run in the cloud. In practice, these launch a cluster, 14 | and then run the `run_example_cluster` method with the provided 15 | arguments and options. 
16 | """ 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import logging 23 | 24 | import click 25 | 26 | from utilities.instrument import ( 27 | run_example_dry, 28 | run_example_local, 29 | run_example_debug, 30 | run_example_cluster, 31 | launch_example_cluster, 32 | launch_example_gce, 33 | launch_example_ec2) 34 | 35 | 36 | logging.basicConfig(level=logging.INFO) 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(logging.INFO) 39 | 40 | def add_options(options): 41 | def decorator(f): 42 | for option in options[::-1]: 43 | click.decorators._param_memo(f, option) 44 | return f 45 | return decorator 46 | 47 | 48 | @click.group() 49 | def cli(): 50 | pass 51 | 52 | 53 | @cli.command( 54 | name='run_example_dry', 55 | context_settings={'ignore_unknown_options': True}) 56 | @click.argument("example_module_name", required=True, type=str) 57 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 58 | def run_example_dry_cmd(example_module_name, example_argv): 59 | """Print the variant spec and related information of an example.""" 60 | return run_example_dry(example_module_name, example_argv) 61 | 62 | 63 | @cli.command( 64 | name='run_local', 65 | context_settings={'ignore_unknown_options': True}) 66 | @click.argument("example_module_name", required=True, type=str) 67 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 68 | def run_example_local_cmd(example_module_name, example_argv): 69 | """Run example locally, potentially parallelizing across cpus/gpus.""" 70 | return run_example_local(example_module_name, example_argv) 71 | 72 | 73 | @cli.command( 74 | name='run_example_debug', 75 | context_settings={'ignore_unknown_options': True}) 76 | @click.argument("example_module_name", required=True, type=str) 77 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 78 | def run_example_debug_cmd(example_module_name, example_argv): 79 | """The debug mode limits tune trial runs to enable use of debugger.""" 80 | return run_example_debug(example_module_name, example_argv) 81 | 82 | @cli.command( 83 | name='run_example_cluster', 84 | context_settings={'ignore_unknown_options': True}) 85 | 86 | @click.argument("example_module_name", required=True, type=str) 87 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 88 | def run_example_cluster_cmd(example_module_name, example_argv): 89 | """Run example on cluster mode. 90 | 91 | This functions is very similar to the local mode, except that it 92 | correctly sets the redis address to make ray/tune work on a cluster. 
93 | """ 94 | run_example_cluster(example_module_name, example_argv) 95 | 96 | @cli.command( 97 | name='launch_example_cluster', 98 | context_settings={ 99 | 'allow_extra_args': True, 100 | 'ignore_unknown_options': True 101 | }) 102 | @click.argument("example_module_name", required=True, type=str) 103 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 104 | @click.option( 105 | "--config_file", 106 | required=False, 107 | type=str) 108 | @click.option( 109 | "--stop/--no-stop", 110 | is_flag=True, 111 | default=True, 112 | help="Stop the cluster after the command finishes running.") 113 | @click.option( 114 | "--start/--no-start", 115 | is_flag=True, 116 | default=True, 117 | help="Start the cluster if needed.") 118 | @click.option( 119 | "--screen/--no-screen", 120 | is_flag=True, 121 | default=False, 122 | help="Run the command in a screen.") 123 | @click.option( 124 | "--tmux/--no-tmux", 125 | is_flag=True, 126 | default=True, 127 | help="Run the command in tmux.") 128 | @click.option( 129 | "--override-cluster-name", 130 | required=False, 131 | type=str, 132 | help="Override the configured cluster name.") 133 | @click.option( 134 | "--port-forward", required=False, type=int, help="Port to forward.") 135 | def launch_example_cluster_cmd(*args, **kwargs): 136 | """Launches the example on autoscaled ray cluster through ray exec_cmd. 137 | 138 | This handles basic validation and sanity checks for the experiment, and 139 | then executes the command on autoscaled ray cluster. If necessary, it will 140 | also fill in more useful defaults for our workflow (i.e. for tmux and 141 | override_cluster_name). 142 | """ 143 | return launch_example_cluster(*args, **kwargs) 144 | 145 | 146 | @cli.command( 147 | name='launch_example_gce', 148 | context_settings={ 149 | 'allow_extra_args': True, 150 | 'ignore_unknown_options': True 151 | }) 152 | @add_options(launch_example_cluster_cmd.params) 153 | def launch_example_gce_cmd(*args, **kwargs): 154 | """Forwards call to `launch_example_cluster` after adding gce defaults. 155 | 156 | This optionally sets the ray autoscaler configuration file to the default 157 | gce configuration file, and then calls `launch_example_cluster` to 158 | execute the original command on autoscaled gce cluster by parsing the args. 159 | 160 | See `launch_example_cluster` for further details. 161 | """ 162 | return launch_example_gce(*args, **kwargs) 163 | 164 | 165 | @cli.command( 166 | name='launch_example_ec2', 167 | context_settings={ 168 | 'allow_extra_args': True, 169 | 'ignore_unknown_options': True 170 | }) 171 | @add_options(launch_example_cluster_cmd.params) 172 | def launch_example_ec2_cmd(*args, **kwargs): 173 | """Forwards call to `launch_example_cluster` after adding ec2 defaults. 174 | 175 | This optionally sets the ray autoscaler configuration file to the default 176 | ec2 configuration file, and then calls `launch_example_cluster` to 177 | execute the original command on autoscaled ec2 cluster by parsing the args. 178 | 179 | See `launch_example_cluster` for further details. 
180 | """ 181 | return launch_example_ec2(*args, **kwargs) 182 | 183 | cli.add_command(run_example_local_cmd) 184 | cli.add_command(run_example_dry_cmd) 185 | cli.add_command(run_example_cluster_cmd) 186 | 187 | # Alias for run_example_local 188 | cli.add_command(run_example_local_cmd, name='launch_example_local') 189 | # Alias for run_example_dry 190 | cli.add_command(run_example_dry_cmd, name='launch_example_dry') 191 | # Alias for run_example_debug 192 | cli.add_command(run_example_debug_cmd, name='launch_example_debug') 193 | cli.add_command(launch_example_cluster_cmd) 194 | cli.add_command(launch_example_gce_cmd) 195 | cli.add_command(launch_example_ec2_cmd) 196 | 197 | 198 | def main(): 199 | return cli() 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | -------------------------------------------------------------------------------- /scripts/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import glob 4 | import pickle 5 | import sys 6 | import pdb 7 | import importlib 8 | from dotmap import DotMap 9 | 10 | import tensorflow as tf 11 | import ray 12 | from ray import tune 13 | from ray.autoscaler.commands import exec_cluster 14 | 15 | from envs.utils import get_env_from_params 16 | from algorithms.utils import get_algorithm_from_params 17 | from policies.utils import get_policy_from_params 18 | from buffers.utils import get_buffer_from_params 19 | from samplers.utils import get_sampler_from_params 20 | from utilities.utils import set_seed, initialize_tf_variables 21 | from utilities.instrument import create_trial_name_creator 22 | 23 | class SimpleExperiment(tune.Trainable): 24 | def _setup(self, params): 25 | self._params = params 26 | 27 | #### set up tf session 28 | set_seed(params['run_params']['seed']) 29 | gpu_options = tf.GPUOptions(allow_growth=True) 30 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 31 | tf.keras.backend.set_session(session) 32 | 33 | self._session = tf.keras.backend.get_session() 34 | self.train_generator = None 35 | self._built = False 36 | 37 | def _stop(self): 38 | tf.reset_default_graph() 39 | tf.keras.backend.clear_session() 40 | 41 | def _build(self): 42 | """ 43 | called by tune to build algorithm 44 | """ 45 | 46 | #### set up building blocks for algorithm 47 | params = copy.deepcopy(self._params) 48 | env_params = params['environment_params'] 49 | env = self.env = ( 50 | get_env_from_params(env_params)) 51 | 52 | buffer = self.buffer = ( 53 | get_buffer_from_params(params, env)) 54 | 55 | sampler = self.sampler = get_sampler_from_params(params) 56 | 57 | policy = self.policy = get_policy_from_params( 58 | params, env, self._session) 59 | 60 | #### build algorithm 61 | self.algorithm = get_algorithm_from_params( 62 | variant=self._params, 63 | env=env, 64 | policy=policy, 65 | buffer=buffer, 66 | sampler=sampler, 67 | session=self._session) 68 | 69 | #### finalize graph 70 | initialize_tf_variables(self._session, only_uninitialized=True) 71 | tf.get_default_graph().finalize() 72 | 73 | #### set train generator function 74 | self.train_generator = self.algorithm.train() 75 | self._built = True 76 | 77 | def _train(self): 78 | if not self._built: 79 | self._build() 80 | 81 | diagnostics = next(self.train_generator) 82 | return diagnostics 83 | 84 | def main(argv=None): 85 | """ 86 | run simple ray tune experiment. 87 | 88 | Please provide config file location, e.g. 
89 | 90 | 91 | """ 92 | assert argv[0] is not None, "Please provide config file location, e.g." 93 | 94 | #### create 95 | base_module = 'configs.baseconfig' 96 | base_module = importlib.import_module(base_module) 97 | 98 | #### tune configs 99 | trial_name_template = 'seed:{trial.config[run_params][seed]}' 100 | trial_name_creator = create_trial_name_creator(trial_name_template) ## generator for trial name (determines logdir) 101 | gpus=1 ## gpus to be used 102 | trial_gpus=1 ## gpus to be used in trial 103 | mode='local' ## local or remote, currently only local supported 104 | 105 | config=str(argv[0]) ## config file location 106 | 107 | exp_config = DotMap(dict( 108 | gpus=gpus, 109 | trial_gpus=trial_gpus, 110 | mode=mode, 111 | config=config, 112 | )) 113 | 114 | ### build the experiment 115 | exp_config = base_module.get_variant_spec(exp_config) ## merge base config and config file to final config 116 | exp_id = exp_config.get('exp_name') ## name of the experiment 117 | exp_class = SimpleExperiment ## tune trainable class that runs the experiments 118 | local_dir = os.path.join(exp_config.get('log_dir'), exp_config.get('task')) ## directory for tf summaries, configs etc. 119 | 120 | ### define experiment 121 | experiment = { 122 | exp_id:{ 123 | 'run': exp_class, 124 | 'config': exp_config, 125 | 'local_dir': local_dir, 126 | 'trial_name_creator': trial_name_creator, 127 | } 128 | } 129 | 130 | ### initialize ray und run experiments 131 | ray.init( 132 | num_gpus=gpus, 133 | local_mode=True, 134 | object_store_memory=100 * 1024 * 1024, #@anyboby TODO: test the memory config 135 | ) 136 | 137 | tune.run_experiments( 138 | experiment, 139 | server_port=4321, 140 | ) 141 | 142 | if __name__ == '__main__': 143 | main(argv=sys.argv[1:]) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | setup( 5 | name='cmbpo', 6 | packages=find_packages(), 7 | version='0.0.1', 8 | description='Constrained Model-based policy optimization', 9 | long_description=open('./README.md').read(), 10 | author='Moritz Zanger', 11 | author_email='zanger.moritz@gmail.com', 12 | entry_points={ 13 | 'console_scripts': ( 14 | 'cmbpo=scripts.console_scripts:main', 15 | ) 16 | }, 17 | requires=(), 18 | zip_safe=True, 19 | license='MIT' 20 | ) 21 | -------------------------------------------------------------------------------- /utilities/logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import pdb 4 | 5 | 6 | 7 | def update_dict(dict_a, dict_b, weight_a=.5, weight_b=.5): 8 | """ 9 | creates new updated dict and adds entries according to weights. 
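    e.g. update_dict({'a': 1.}, {'a': 3.}) returns {'a': 2.0} with the default weights;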
10 | for both weights = 1 the entries are added 11 | """ 12 | dict_a_cp = dict(dict_a) 13 | dict_a_cp.update(dict_b) 14 | for k,v in dict_b.items(): 15 | if k in dict_a.keys(): 16 | dict_a_cp[k] = weight_b*dict_b[k] + weight_a*dict_a[k] 17 | return dict_a_cp 18 | 19 | class Progress: 20 | 21 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100): 22 | self.total = total 23 | self.name = name 24 | self.ncol = ncol 25 | self.max_length = max_length 26 | self.indent = indent 27 | self.line_width = line_width 28 | self._speed_update_freq = speed_update_freq 29 | 30 | self._step = 0 31 | self._prev_line = '\033[F' 32 | self._clear_line = ' ' * self.line_width 33 | 34 | self._pbar_size = self.ncol * self.max_length 35 | self._complete_pbar = '#' * self._pbar_size 36 | self._incomplete_pbar = ' ' * self._pbar_size 37 | 38 | self.lines = [''] 39 | self.fraction = '{} / {}'.format(0, self.total) 40 | 41 | self.resume() 42 | 43 | 44 | def update(self, n=1): 45 | self._step += n 46 | if self._step % self._speed_update_freq == 0: 47 | self._time0 = time.time() 48 | self._step0 = self._step 49 | 50 | def resume(self): 51 | self._skip_lines = 1 52 | print('\n', end='') 53 | self._time0 = time.time() 54 | self._step0 = self._step 55 | 56 | def pause(self): 57 | self._clear() 58 | self._skip_lines = 1 59 | 60 | def set_description(self, params=[]): 61 | 62 | ############ 63 | # Position # 64 | ############ 65 | self._clear() 66 | 67 | ########### 68 | # Percent # 69 | ########### 70 | percent, fraction = self._format_percent(self._step, self.total) 71 | self.fraction = fraction 72 | 73 | ######### 74 | # Speed # 75 | ######### 76 | speed = self._format_speed(self._step) 77 | 78 | ########## 79 | # Params # 80 | ########## 81 | num_params = len(params) 82 | nrow = math.ceil(num_params / self.ncol) 83 | params_split = self._chunk(params, self.ncol) 84 | params_string, lines = self._format(params_split) 85 | self.lines = lines 86 | 87 | 88 | description = '{} | {}{}'.format(percent, speed, params_string) 89 | print(description) 90 | self._skip_lines = nrow + 1 91 | 92 | def append_description(self, descr): 93 | self.lines.append(descr) 94 | 95 | def _clear(self): 96 | position = self._prev_line * self._skip_lines 97 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)]) 98 | print(position, end='') 99 | print(empty) 100 | print(position, end='') 101 | 102 | def _format_percent(self, n, total): 103 | if total: 104 | percent = n / float(total) 105 | 106 | complete_entries = int(percent * self._pbar_size) 107 | incomplete_entries = self._pbar_size - complete_entries 108 | 109 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries] 110 | fraction = '{} / {}'.format(n, total) 111 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100)) 112 | else: 113 | fraction = '{}'.format(n) 114 | string = '{} iterations'.format(n) 115 | return string, fraction 116 | 117 | def _format_speed(self, n): 118 | num_steps = n - self._step0 119 | t = time.time() - self._time0 120 | speed = num_steps / t 121 | string = '{:.1f} Hz'.format(speed) 122 | if num_steps > 0: 123 | self._speed = string 124 | return string 125 | 126 | def _chunk(self, l, n): 127 | return [l[i:i+n] for i in range(0, len(l), n)] 128 | 129 | def _format(self, chunks): 130 | lines = [self._format_chunk(chunk) for chunk in chunks] 131 | lines.insert(0,'') 132 | padding = '\n' + ' '*self.indent 133 | string = padding.join(lines) 134 | 
return string, lines 135 | 136 | def _format_chunk(self, chunk): 137 | line = ' | '.join([self._format_param(param) for param in chunk]) 138 | return line 139 | 140 | def _format_param(self, param): 141 | k, v = param 142 | return '{} : {}'.format(k, v)[:self.max_length] 143 | 144 | def stamp(self): 145 | if self.lines != ['']: 146 | params = ' | '.join(self.lines) 147 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed) 148 | self._clear() 149 | print(string, end='\n') 150 | self._skip_lines = 1 151 | else: 152 | self._clear() 153 | self._skip_lines = 0 154 | 155 | def close(self): 156 | self.pause() 157 | 158 | class Silent: 159 | 160 | def __init__(self, *args, **kwargs): 161 | pass 162 | 163 | def __getattr__(self, attr): 164 | return lambda *args: None 165 | 166 | 167 | if __name__ == '__main__': 168 | silent = Silent() 169 | silent.update() 170 | silent.stamp() 171 | 172 | num_steps = 1000 173 | progress = Progress(num_steps) 174 | for i in range(num_steps): 175 | progress.update() 176 | params = [ 177 | ['A', '{:06d}'.format(i)], 178 | ['B', '{:06d}'.format(i)], 179 | ['C', '{:06d}'.format(i)], 180 | ['D', '{:06d}'.format(i)], 181 | ['E', '{:06d}'.format(i)], 182 | ['F', '{:06d}'.format(i)], 183 | ['G', '{:06d}'.format(i)], 184 | ['H', '{:06d}'.format(i)], 185 | ] 186 | progress.set_description(params) 187 | time.sleep(0.01) 188 | progress.close() 189 | -------------------------------------------------------------------------------- /utilities/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from utilities.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. _`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 
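        Gradients are flattened and concatenated, summed across MPI workers with Allreduce,
        divided by the number of processes, and split back into the original per-variable shapes.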
48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /utilities/mpi_tools.py: -------------------------------------------------------------------------------- 1 | 2 | from mpi4py import MPI 3 | import os, subprocess, sys 4 | import numpy as np 5 | 6 | 7 | def mpi_fork(n, bind_to_core=False): 8 | """ 9 | Re-launches the current script with workers linked by MPI. 10 | 11 | Also, terminates the original process that launched it. 12 | 13 | Taken almost without modification from the Baselines function of the 14 | `same name`_. 15 | 16 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 17 | 18 | Args: 19 | n (int): Number of process to split into. 20 | 21 | bind_to_core (bool): Bind each MPI process to a core. 22 | """ 23 | if n<=1: 24 | return 25 | if os.getenv("IN_MPI") is None: 26 | env = os.environ.copy() 27 | env.update( 28 | MKL_NUM_THREADS="1", 29 | OMP_NUM_THREADS="1", 30 | IN_MPI="1" 31 | ) 32 | args = ["mpirun", "-np", str(n)] 33 | if bind_to_core: 34 | args += ["-bind-to", "core"] 35 | args += [sys.executable] + sys.argv 36 | subprocess.check_call(args, env=env) 37 | sys.exit() 38 | 39 | 40 | def msg(m, string=''): 41 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 42 | 43 | def proc_id(): 44 | """Get rank of calling process.""" 45 | return MPI.COMM_WORLD.Get_rank() 46 | 47 | def allreduce(*args, **kwargs): 48 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 49 | 50 | def num_procs(): 51 | """Count active MPI processes.""" 52 | return MPI.COMM_WORLD.Get_size() 53 | 54 | def broadcast(x, root=0): 55 | MPI.COMM_WORLD.Bcast(x, root=root) 56 | 57 | def mpi_op(x, op): 58 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 59 | x = np.asarray(x, dtype=np.float32) 60 | buff = np.zeros_like(x, dtype=np.float32) 61 | allreduce(x, buff, op=op) 62 | return buff[0] if scalar else buff 63 | 64 | def mpi_sum(x): 65 | return mpi_op(x, MPI.SUM) 66 | 67 | def mpi_avg(x): 68 | """Average a scalar or vector over MPI processes.""" 69 | return mpi_sum(x) / num_procs() 70 | 71 | def mpi_statistics_scalar(x, with_min_and_max=False): 72 | """ 73 | Get mean/std and optional min/max of scalar x across MPI processes. 
74 | 75 | Args: 76 | x: An array containing samples of the scalar to produce statistics 77 | for. 78 | 79 | with_min_and_max (bool): If true, return min and max of x in 80 | addition to mean and std. 81 | """ 82 | x = np.array(x, dtype=np.float32) 83 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 84 | mean = global_sum / global_n 85 | 86 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 87 | std = np.sqrt(global_sum_sq / global_n) # compute global std 88 | 89 | if with_min_and_max: 90 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 91 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 92 | return mean, std, global_min, global_max 93 | return mean, std -------------------------------------------------------------------------------- /utilities/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. """ 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | serializables = {} 10 | for k,v in obj.items(): 11 | if is_json_serializable(k) and is_json_serializable(v): 12 | serializables[convert_json(k)]=convert_json(v) 13 | 14 | return serializables 15 | 16 | elif isinstance(obj, tuple): 17 | return (convert_json(x) for x in obj) 18 | 19 | elif isinstance(obj, list): 20 | return [convert_json(x) for x in obj] 21 | 22 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 23 | return convert_json(obj.__name__) 24 | 25 | elif hasattr(obj,'__dict__') and obj.__dict__: 26 | obj_dict = {convert_json(k): convert_json(v) 27 | for k,v in obj.__dict__.items()} 28 | return {str(obj): obj_dict} 29 | 30 | return str(obj) 31 | 32 | def is_json_serializable(v): 33 | try: 34 | json.dumps(v) 35 | return True 36 | except: 37 | return False -------------------------------------------------------------------------------- /utilities/trust_region.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from utilities.utils import EPS 5 | """ 6 | Tensorflow utilities for trust region optimization 7 | """ 8 | 9 | def flat_concat(xs): 10 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 11 | 12 | def flat_grad(f, params): 13 | return flat_concat(tf.gradients(xs=params, ys=f)) 14 | 15 | def hessian_vector_product(f, params): 16 | # for H = grad**2 f, compute Hx 17 | g = flat_grad(f, params) 18 | x = tf.placeholder(tf.float32, shape=g.shape) 19 | return x, flat_grad(tf.reduce_sum(g*x), params) 20 | 21 | def assign_params_from_flat(x, params): 22 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 23 | splits = tf.split(x, [flat_size(p) for p in params]) 24 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 25 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 26 | 27 | 28 | """ 29 | Conjugate gradient 30 | """ 31 | 32 | def cg(Ax, b, cg_iters=10): 33 | x = np.zeros_like(b) 34 | r = b.copy() # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
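    # the initial residual also serves as the first search direction; subsequent
    # directions are made conjugate via the (r_dot_new / r_dot_old) update below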
35 | p = r.copy() 36 | r_dot_old = np.dot(r,r) 37 | for _ in range(cg_iters): 38 | z = Ax(p) 39 | alpha = r_dot_old / (np.dot(p, z) + EPS) 40 | x += alpha * p 41 | r -= alpha * z 42 | r_dot_new = np.dot(r,r) 43 | p = r + (r_dot_new / r_dot_old) * p 44 | r_dot_old = r_dot_new 45 | return x --------------------------------------------------------------------------------
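A minimal usage sketch for the conjugate-gradient helper above (illustrative, not part of the repository): cg solves H x = g for a symmetric positive-definite H that is available only through matrix-vector products, which is how the trust-region update supplies Hessian-vector products. The toy matrix and tolerance below are assumptions for demonstration.

import numpy as np
from utilities.trust_region import cg

# H is only ever touched through a matvec closure, mirroring hessian_vector_product
H = np.array([[4.0, 1.0],
              [1.0, 3.0]], dtype=np.float32)
g = np.array([1.0, 2.0], dtype=np.float32)

x = cg(lambda v: H @ v, g, cg_iters=10)     # approx. [0.0909, 0.6364]
assert np.allclose(H @ x, g, atol=1e-4)     # the 2x2 system is solved within a few iterations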