├── .gitignore
├── README.md
├── algorithms
│   ├── cmbpo.py
│   ├── rl_algorithm.py
│   └── utils.py
├── buffers
│   ├── cpobuffer.py
│   ├── modelbuffer.py
│   └── utils.py
├── cmbpo.yml
├── configs
│   ├── baseconfig
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── main.py
│   │   └── utils.py
│   ├── cmbpo_antsafe.py
│   ├── cmbpo_hcs.py
│   ├── cmbpo_hs.py
│   ├── cpo_hcs.py
│   └── trpo_hcs.py
├── envs
│   ├── __init__.py
│   ├── mujoco_safety_gym
│   │   ├── __init__.py
│   │   └── envs
│   │       ├── __init__.py
│   │       ├── ant.py
│   │       ├── ant_viz.py
│   │       ├── assets
│   │       │   ├── ant.xml
│   │       │   ├── ant_viz.xml
│   │       │   ├── fetch
│   │       │   │   ├── pick_and_place.xml
│   │       │   │   ├── push.xml
│   │       │   │   ├── reach.xml
│   │       │   │   ├── robot.xml
│   │       │   │   ├── shared.xml
│   │       │   │   └── slide.xml
│   │       │   ├── half_cheetah.xml
│   │       │   ├── hopper.xml
│   │       │   ├── humanoid.xml
│   │       │   └── textures
│   │       │       ├── block.png
│   │       │       └── block_hidden.png
│   │       ├── fetch
│   │       │   ├── pick_and_place.py
│   │       │   ├── push.py
│   │       │   ├── reach.py
│   │       │   └── slide.py
│   │       ├── fetch_env.py
│   │       ├── half_cheetah.py
│   │       ├── hopper.py
│   │       ├── humanoid.py
│   │       ├── mujoco_env.py
│   │       └── robot_env.py
│   ├── utils.py
│   └── wrappers
│       ├── __init__.py
│       └── normalize_action.py
├── models
│   ├── base_model.py
│   ├── fake_env.py
│   ├── pens
│   │   ├── __init__.py
│   │   ├── fc.py
│   │   ├── logger.py
│   │   ├── pe.py
│   │   ├── pe_factory.py
│   │   └── utils.py
│   └── statics.py
├── network
│   └── ac_network.py
├── policies
│   ├── base_policy.py
│   ├── cpo_policy.py
│   └── utils.py
├── requirements.txt
├── samplers
│   ├── __init__.py
│   ├── base_sampler.py
│   ├── cpo_sampler.py
│   ├── model_sampler.py
│   ├── simple_sampler.py
│   └── utils.py
├── scripts
│   ├── console_scripts.py
│   └── run.py
├── setup.py
└── utilities
    ├── instrument.py
    ├── logging.py
    ├── logx.py
    ├── mpi_tf.py
    ├── mpi_tools.py
    ├── serialization_utils.py
    ├── trust_region.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | *.stl
3 | 
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 | 
9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 | /environment/src/
31 | /src/
32 | /softlearning/environments/rllab/
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | # soft learning specific things 111 | *.swp 112 | .idea 113 | *.mp4 114 | data/ 115 | vis/ 116 | tmp/ 117 | vendor/* 118 | .pkl 119 | 120 | 121 | .mujoco/ 122 | .vscode/ 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Constrained Model-Based Policy Optimization 2 | 3 |
7 | 
8 | This repository contains code for Constrained Model-Based Policy Optimization (CMBPO), a model-based version of Constrained Policy Optimization (Achiam et al.). Installation, usage, and code examples for reproducing the experiments described in [Safe Continuous Control with Constrained Model-Based Policy Optimization](https://arxiv.org/abs/2104.06922?context=cs) are provided below.
9 | 
10 | # Prerequisites
11 | 
12 | 1. The simulation experiments using [mujoco-py](https://github.com/openai/mujoco-py) require a working install of [MuJoCo 2.0](https://www.roboti.us/license.html) and a valid license.
13 | 2. We use conda environments for the installation (tested on conda 4.6 - 4.10); please refer to [Anaconda](https://docs.anaconda.com/anaconda/install/) for instructions.
14 | 
15 | # Installation
16 | 
17 | 1. Clone this repository
18 | ```
19 | git clone https://github.com/anyboby/Constrained-Model-Based-Policy-Optimization.git
20 | ```
21 | 2. Create a conda environment using the cmbpo yml-file
22 | ```sh
23 | cd Constrained-Model-Based-Policy-Optimization/
24 | conda env create -f cmbpo.yml
25 | conda activate cmbpo
26 | pip install -e .
27 | ```
28 | This should create a conda environment named 'cmbpo' with the necessary packages and modules. The number of required modules is small, so in case of installation trouble it is worth taking a look at the [cmbpo.yml](cmbpo.yml) and [requirements.txt](requirements.txt) files.
29 | 
30 | # Usage
31 | To start an experiment with cmbpo, run
32 | ```sh
33 | cmbpo run_local configs.baseconfig --config=configs.cmbpo_hcs --gpus=1 --trial-gpus=1
34 | ```
35 | 
36 | `--config` specifies the configuration file for the experiment (here: CMBPO for HalfCheetahSafe)\
37 | `--gpus` specifies the number of GPUs to use
38 | 
39 | A list of all available flags is provided in [baseconfig/utils](configs/baseconfig/utils.py). As of writing, only local execution is supported. For further options, refer to the ray documentation.
40 | 
41 | The `cmbpo` command uses the [console scripts](scripts/console_scripts.py) as an entry point for running experiments. A simple workflow for running experiments with ray-tune is illustrated in [run.py](scripts/run.py), which can be executed with
42 | ```sh
43 | python scripts/run.py configs.cmbpo_hcs
44 | ```
45 | 
46 | ## Algorithms
47 | Constrained Model-Based Policy Optimization combines Constrained Policy Optimization with model-based data augmentation and reconciles constraint satisfaction with the model errors this introduces.
48 | 
49 | This repository can therefore also be used to run experiments with model-free versions of Constrained Policy Optimization and Trust-Region Policy Optimization by configuring the `use_model` and `constrain_cost` flags accordingly in the experiment configurations (see [CPO - HalfCheetahSafe](configs/cpo_hcs.py) and [TRPO - HalfCheetahSafe](configs/trpo_hcs.py)):
50 | ```py
51 | 'use_model': False, # set to True for model-based
52 | 'constrain_cost': False, # set to True for cost-constrained optimization
53 | ```
54 | 
55 | ## Adding new environments and running custom experiments
56 | Different environments can be tested by creating a config file in the [configs](configs/) directory. OpenAI Gym environments can be loaded directly with the corresponding parameters, for example:
57 | ```py
58 | 'universe': 'gym',
59 | 'task': 'HalfCheetahSafe-v2',
60 | ```
61 | Environments from other sources require an entry in the `ENVS_FUNCTIONS` dict in the [environment utils](envs/utils.py) that specifies how to create an instance of the environment. For example, the Gym environments are specified with the following entries:
62 | ```py
63 | def get_gym_env():
64 |     import gym
65 | 
66 |     return gym.make
67 | 
68 | ENVS_FUNCTIONS = {
69 |     'gym':get_gym_env()
70 | }
71 | ```
72 | 
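An environment from another source can be registered analogously. The following is a minimal sketch that assumes a hypothetical `my_envs` package and a `'my_universe'` key (both purely illustrative); the entry only has to provide a callable that builds the environment for a given task, analogous to `gym.make` above:
```py
def get_custom_env():
    from my_envs import MyCustomEnv  # hypothetical package, for illustration only

    # return a callable that maps a task name to an environment instance
    return lambda task, **kwargs: MyCustomEnv(task, **kwargs)

ENVS_FUNCTIONS = {
    'gym': get_gym_env(),
    'my_universe': get_custom_env(),  # selected via 'universe': 'my_universe' in a config
}
```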
73 | ## Model-Learning with custom environments
74 | When using a model with custom environments, the model requires a few interfaces to function with the provided code. A learned (or handcrafted) model should inherit from the [base model](models/base_model.py) and specify whether rewards, costs, and termination functions are predicted alongside the dynamics.
75 | 
76 | By default our algorithm learns to predict rewards but assumes handcrafted cost- and termination-functions `c(s,a,s')` and `t(s,a,s')`. When adding a new environment, these functions should be defined (if not provided by the model) in the [statics](models/statics.py) file. For example, a default termination function that continues episodes for all states looks like this:
77 | ```py
78 | def no_done(obs, act, next_obs):
79 |     assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
80 | 
81 |     done = np.zeros(shape=obs.shape[:-1], dtype=np.bool) # always false
82 |     done = done[...,None]
83 |     return done
84 | ```
85 | The static functions should then be linked to the environment's task name, such that the [Fake Environment](models/fake_env.py) correctly discovers them:
86 | ```py
87 | TERMS_BY_TASK = {
88 |     'default':no_done,
89 |     'HalfCheetah-v2':no_done,
90 | }
91 | ```
92 | 
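Handcrafted cost functions follow the same pattern as the termination functions above. Below is a minimal sketch for a hypothetical velocity limit; the threshold, the observation index, and the `COSTS_BY_TASK` registry name are assumptions for illustration only; the actual interfaces and registries are defined in [statics](models/statics.py):
```py
import numpy as np

def velocity_cost(obs, act, next_obs):
    # illustrative only: cost of 1 whenever a (hypothetical) velocity
    # entry at index 0 of the next observation exceeds a threshold
    assert len(obs.shape) == len(next_obs.shape) == len(act.shape)

    cost = (np.abs(next_obs[..., 0]) > 1.0).astype(np.float32)
    cost = cost[..., None]
    return cost

COSTS_BY_TASK = {  # assumed registry name, analogous to TERMS_BY_TASK
    'default': velocity_cost,
}
```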
93 | ## Hyperparameters
94 | Hyperparameters for a new experiment can be defined in the [configs](configs/) folder. Our config files generally follow this structure:
95 | ```py
96 | params = {
97 |     'universe': 'gym',
98 |     'task': 'HalfCheetahSafe-v2',
99 |     'algorithm_params': {...},
100 |     'policy_params':{...},
101 |     'buffer_params': {...},
102 |     'sampler_params': {...},
103 |     'run_params': {...},
104 | }
105 | ```
106 | Parameters specified in a config file overwrite the [base config](configs/baseconfig/base.py) file. For new algorithms or a new suite of environments, it might be practical to directly change the base config.
107 | 
108 | In addition to model parameters and policy parameters, the main parameters of concern in CMBPO define the rollout and sampling behavior of the algorithm.
109 | ```py
110 | 'n_initial_exploration_steps': int(10000), ### number of initial exploration steps for model-learning and
111 |                                            #   determining uncertainty calibration measurements
112 | 'sampling_alpha': 2,                       ### temperature for Boltzmann sampling
113 | 'rollout_mode' : 'uncertainty',            ### model rollouts terminate based on per-step uncertainty
114 | 'rollout_schedule': [10, 500, 5, 30],      ### if rollout_mode:'schedule' this schedule is defined as
115 |                                            #   [min_epoch, max_epoch, min_horizon, max_horizon]
116 |                                            ##  if rollout_mode:'uncertainty', 'min_horizon' is used as
117 |                                            #   the initial rollout horizon and adapted throughout
118 |                                            #   training based on per-step uncertainty estimates
119 |                                            #   (KL-Divergence).
120 | 'batch_size_policy': 50000,                ### batch size per policy update
121 | 'initial_real_samples_per_epoch': 1500,    ### initial number of real samples per policy update,
122 |                                            #   adapted throughout training based on average uncertainty
123 |                                            #   estimates (mean KL-Divergence).
124 | 'min_real_samples_per_epoch': 500,         ### absolute minimum number of real samples per policy update
125 | ```
126 | ## Logging
127 | A range of measurements is logged automatically in tensorboard, and the parameter configuration is saved as a JSON file. The location for summaries and checkpoints can be defined by specifying a `'log_dir'` in the configuration files. By default, this location will be set to `'~/ray_cmbpo/{env-task}/defaults/{seed}'` and can be accessed with tensorboard by
128 | ```sh
129 | tensorboard --logdir ~/ray_cmbpo//defaults/
130 | ```
131 | 
132 | # Acknowledgments
133 | Several sections of this repository contain code from other repositories, notably from [Tuomas Haarnoja](https://scholar.google.com/citations?user=VT7peyEAAAAJ&hl=en), [Kristian Hartikainen's softlearning](https://github.com/rail-berkeley/softlearning), [Michael Janner's MBPO](https://github.com/JannerM/mbpo), [Kurtland Chua's handful-of-trials](https://github.com/kchua/handful-of-trials), and CPO by [Joshua Achiam and Alex Ray](https://github.com/openai/safety-starter-agents).
134 | 
--------------------------------------------------------------------------------
/algorithms/rl_algorithm.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from collections import OrderedDict
3 | from itertools import count
4 | import gtimer as gt
5 | import math
6 | import os
7 | import pdb
8 | 
9 | import tensorflow as tf
10 | import numpy as np
11 | 
12 | from utilities.utils import save_video
13 | 
14 | 
15 | class RLAlgorithm(tf.contrib.checkpoint.Checkpointable):
16 |     """Abstract RLAlgorithm.
17 | 
18 |     Implements the _train and _evaluate methods to be used
19 |     by classes inheriting from RLAlgorithm.
20 |     """
21 | 
22 |     def __init__(
23 |             self,
24 |             sampler,
25 |             n_epochs=int(10e7),
26 |             n_initial_exploration_steps=0,
27 |             initial_exploration_policy=None,
28 |             epoch_length=1000,
29 |             eval_n_episodes=10,
30 |             eval_deterministic=True,
31 |             eval_render_mode=None,
32 |             video_save_frequency=0,
33 |             session=None,
34 |     ):
35 |         """
36 |         Args:
37 |             n_epochs (`int`): Number of epochs to run the training for.
38 |             n_initial_exploration_steps: Number of steps in the beginning to
39 |                 take using actions drawn from a separate exploration policy.
40 |             initial_exploration_policy: policy to follow during initial
41 |                 exploration hook
42 |             epoch_length (`int`): Epoch length.
43 |             eval_n_episodes (`int`): Number of rollouts to evaluate.
44 |             eval_deterministic (`bool`): Whether or not to run the policy in
45 |                 deterministic mode when evaluating policy.
46 |             eval_render_mode (`str`): Mode to render evaluation rollouts in.
47 |                 None to disable rendering.
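            video_save_frequency (`int`): If greater than 0, evaluation
                rollouts are rendered to `rgb_array` frames and saved as
                videos at this frequency.
            session (`tf.Session`): TensorFlow session to use; defaults to
                the current `tf.keras` backend session.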
48 | """ 49 | self.sampler = sampler 50 | 51 | self._n_epochs = n_epochs 52 | self._epoch_length = epoch_length 53 | self._n_initial_exploration_steps = n_initial_exploration_steps 54 | self._initial_exploration_policy = initial_exploration_policy 55 | 56 | self._eval_n_episodes = eval_n_episodes 57 | self._eval_deterministic = eval_deterministic 58 | self._video_save_frequency = video_save_frequency 59 | 60 | if self._video_save_frequency > 0: 61 | assert eval_render_mode != 'human', ( 62 | "RlAlgorithm cannot render and save videos at the same time") 63 | self._eval_render_mode = 'rgb_array' 64 | else: 65 | self._eval_render_mode = eval_render_mode 66 | 67 | self._session = session or tf.keras.backend.get_session() 68 | 69 | self._epoch = 0 70 | self._timestep = 0 71 | self._num_train_steps = 0 72 | 73 | def _initial_exploration_hook(self, env, initial_exploration_policy, pool): 74 | if self._n_initial_exploration_steps < 1: return 75 | 76 | if not initial_exploration_policy: 77 | raise ValueError( 78 | "Initial exploration policy must be provided when" 79 | " n_initial_exploration_steps > 0.") 80 | 81 | self.sampler.initialize(env, initial_exploration_policy, pool) 82 | while pool.size < self._n_initial_exploration_steps: 83 | self.sampler.sample() 84 | 85 | def _training_before_hook(self): 86 | """Method called before the actual training loops.""" 87 | pass 88 | 89 | def _training_after_hook(self): 90 | """Method called after the actual training loops.""" 91 | pass 92 | 93 | def _timestep_before_hook(self, *args, **kwargs): 94 | """Hook called at the beginning of each timestep.""" 95 | pass 96 | 97 | def _timestep_after_hook(self, *args, **kwargs): 98 | """Hook called at the end of each timestep.""" 99 | pass 100 | 101 | def _epoch_before_hook(self): 102 | """Hook called at the beginning of each epoch.""" 103 | self._train_steps_this_epoch = 0 104 | 105 | def _epoch_after_hook(self, *args, **kwargs): 106 | """Hook called at the end of each epoch.""" 107 | pass 108 | 109 | def _training_batch(self, batch_size=None): 110 | return self.sampler.random_batch(batch_size) 111 | 112 | def _evaluation_batch(self, *args, **kwargs): 113 | return self._training_batch(*args, **kwargs) 114 | 115 | @property 116 | def _training_started(self): 117 | return self._total_timestep > 0 118 | 119 | @property 120 | def _total_timestep(self): 121 | total_timestep = self._epoch * self._epoch_length + self._timestep 122 | return total_timestep 123 | 124 | def _train(self): 125 | """Return a generator that performs RL training. 
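
        Implementations are expected to yield a diagnostics dict once per
        epoch; `ExperimentRunner._train` advances the generator with
        `next()` and returns the diagnostics to ray.tune.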
126 |         """
127 |         raise NotImplementedError
128 | 
129 |     @abc.abstractmethod
130 |     def get_diagnostics(self):
131 |         raise NotImplementedError
132 | 
133 |     @property
134 |     def ready_to_train(self):
135 |         return self.sampler.batch_ready()
136 | 
137 |     def _do_sampling(self, timestep):
138 |         return self.sampler.sample()
139 | 
140 |     @property
141 |     def tf_saveables(self):
142 |         return {}
143 | 
144 |     def __getstate__(self):
145 |         state = {
146 |             '_epoch_length': self._epoch_length,
147 |             '_epoch': (
148 |                 self._epoch + int(self._timestep >= self._epoch_length)),
149 |             '_timestep': self._timestep % self._epoch_length,
150 |         }
151 | 
152 |         return state
153 | 
154 |     def __setstate__(self, state):
155 |         self.__dict__.update(state)
156 | 
--------------------------------------------------------------------------------
/algorithms/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from dotmap import DotMap
3 | from collections import OrderedDict
4 | 
5 | def create_CMBPO_algorithm(variant, *args, **kwargs):
6 |     from algorithms.cmbpo import CMBPO
7 |     algorithm = CMBPO(*args, **kwargs)
8 | 
9 |     return algorithm
10 | 
11 | 
12 | ALGORITHM_CLASSES = {
13 |     'CMBPO': create_CMBPO_algorithm,
14 | }
15 | 
16 | 
17 | def get_algorithm_from_params(variant,
18 |                               *args,
19 |                               **kwargs):
20 |     algorithm_params = variant['algorithm_params']
21 |     algorithm_type = algorithm_params['type']
22 |     algorithm_kwargs = deepcopy(algorithm_params['kwargs'])
23 |     # @anyboby, workaround for local_example_debug mode: for some reason we get a DotMap
24 |     # instead of an OrderedDict as algorithm_kwargs, which doesn't work with double-asterisk unpacking!
25 |     if isinstance(algorithm_kwargs, DotMap):
26 |         algorithm_kwargs = algorithm_kwargs.toDict()
27 | 
28 |     algorithm = ALGORITHM_CLASSES[algorithm_type](
29 |         variant, *args, **algorithm_kwargs, **kwargs)
30 | 
31 |     return algorithm
32 | 
--------------------------------------------------------------------------------
/buffers/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | 
3 | def get_cpobuffer(env, *args, **kwargs):
4 |     from buffers.cpobuffer import CPOBuffer
5 | 
6 |     buffer = CPOBuffer(
7 |         *args,
8 |         observation_space=env.observation_space,
9 |         action_space=env.action_space,
10 |         **kwargs)
11 | 
12 |     return buffer
13 | 
14 | BUFFER_FUNCTIONS = {
15 |     'CPOBuffer': get_cpobuffer,
16 | }
17 | 
18 | def get_buffer_from_params(params, env, *args, **kwargs):
19 |     buffer_params = params['buffer_params']
20 |     buffer_type = buffer_params['type']
21 |     buffer_kwargs = deepcopy(buffer_params['kwargs'])
22 | 
23 |     buffer = BUFFER_FUNCTIONS[buffer_type](
24 |         env,
25 |         *args,
26 |         **buffer_kwargs,
27 |         **kwargs)
28 | 
29 |     return buffer
30 | 
--------------------------------------------------------------------------------
/cmbpo.yml:
--------------------------------------------------------------------------------
1 | name: cmbpo
2 | channels:
3 |   - anaconda
4 |   - defaults
5 | dependencies:
6 |   - click=7.0
7 |   - matplotlib=3.3.4
8 |   - mpi4py=3.0.3
9 |   - pip=21.0.1
10 |   - python=3.6.12
11 |   - requests=2.20.1
12 |   - tensorflow-gpu=1.14.0
13 |   - pip:
14 |     - conda-env-export==0.3.2
15 |     - dotmap==1.3.8
16 |     - gtimer==1.0.0b5
17 |     - gym==0.18.0
18 |     - joblib==0.14.1
19 |     - mkl-fft==1.2.0
20 |     - mkl-random==1.1.0
21 |     - mkl-service==2.3.0
22 |     - mujoco-py==2.0.2.13
23 |     - olefile==0.46
24 |     - pyOpenSSL==19.1.0
25 |     - PySocks==1.7.1
26 |     - PyYAML==5.4.1
27 |     - ray==0.6.4
28 | - sip==4.19.24 29 | - tensorflow==1.14.0 30 | - tornado==6.0.4 31 | prefix: /home/mo/anaconda3/envs/cmbpo_test2 32 | -------------------------------------------------------------------------------- /configs/baseconfig/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides functions that are utilized by the command line interface. 2 | 3 | In particular, the examples are exposed to the command line interface 4 | (defined in `scripts.console_scripts`) through the 5 | `get_trainable_class`, `get_variant_spec`, and `get_parser` functions. 6 | """ 7 | 8 | 9 | def get_trainable_class(*args, **kwargs): 10 | from .main import ExperimentRunner 11 | return ExperimentRunner 12 | 13 | def get_params_from_file(filepath, params_name='params'): 14 | import importlib 15 | from dotmap import DotMap 16 | module = importlib.import_module(filepath) 17 | params = getattr(module, params_name) 18 | params = DotMap(params) 19 | return params 20 | 21 | def get_variant_spec(command_line_args, *args, **kwargs): 22 | from .base import get_variant_spec 23 | import importlib 24 | params = get_params_from_file(command_line_args.config) 25 | variant_spec = get_variant_spec(command_line_args, *args, params, **kwargs) 26 | return variant_spec 27 | 28 | def get_parser(): 29 | from .utils import get_parser 30 | parser = get_parser() 31 | return parser 32 | -------------------------------------------------------------------------------- /configs/baseconfig/base.py: -------------------------------------------------------------------------------- 1 | from ray import tune 2 | import numpy as np 3 | import pdb 4 | 5 | from utilities.utils import deep_update 6 | 7 | M = 256 #256 8 | 9 | NUM_COUPLING_LAYERS = 2 10 | 11 | DEFAULT_MAX_PATH_LENGTH = 1000 12 | 13 | CPO_POLICY_PARAMS_BASE = { 14 | 'type': 'CPOPolicy', 15 | 'kwargs': { 16 | 'a_hidden_layer_sizes': (M, M), # policy network hidden layers 17 | 'constrain_cost': True, # constrain_cost=False will perform TRPO updates 18 | 'vf_lr': 3e-4, # learn rate for value learning 19 | 'vf_hidden_layer_sizes':(M,M), # nn hidden layers for vf 20 | 'vf_epochs': 8, # number of training epochs for values 21 | 'vf_batch_size': 2048, # minibatches for value training 22 | 'vf_ensemble_size': 3, # vf ensemble size 23 | 'vf_elites': 2, # vf elites 24 | 'vf_activation': 'swish', # activation function 25 | 'vf_loss': 'MSE', # choose from 'NLL', 'MSPE' (inc. var); 'MSE' ; 'Huber' 26 | 'vf_decay': 1e-6, # decay for nn regularization 27 | 'vf_clipping': False, # clip losses for a trust-region like vf update 28 | 'vf_kl_cliprange': 0.0, # only applicable if vf_clippping=True 29 | 'ent_reg': 0, # 5e-3 # exploration bonus for maintaining pol. 
entropy 30 | 'target_kl': 0.01, # trust region diameter 31 | 'cost_lim': 10, # cost limit for whole task length 32 | 'cost_lam': .5, # gae lambda 33 | 'cost_gamma': 0.97, # discounts 34 | 'lam': .95, # gae lambda 35 | 'gamma': 0.99, # discounts 36 | 'epoch_length': tune.sample_from(lambda spec: ( 37 | spec.get('config', spec) 38 | ['algorithm_params']['kwargs']['epoch_length'] 39 | )), 40 | 'max_path_length': tune.sample_from(lambda spec: ( 41 | spec.get('config', spec) 42 | ['sampler_params']['kwargs']['max_path_length'] 43 | )), 44 | 'log_dir': tune.sample_from(lambda spec: ( 45 | spec.get('config', spec) 46 | ['log_dir'] 47 | )), 48 | } 49 | } 50 | 51 | POLICY_PARAMS_BASE = { 52 | 'CPOPolicy' : CPO_POLICY_PARAMS_BASE, 53 | } 54 | 55 | POLICY_PARAMS_BASE.update({ 56 | 'cpopolicy': POLICY_PARAMS_BASE['CPOPolicy'] 57 | }) 58 | 59 | ALGORITHM_PARAMS = { 60 | 'CMBPO': { 61 | 'type': 'CMBPO', 62 | 'kwargs': { 63 | 'task': tune.sample_from(lambda spec: ( 64 | spec.get('config', spec) 65 | ['environment_params']['task'] 66 | )), 67 | 'n_env_interacts': int(10e6), 68 | 'epoch_length': 50000, 69 | 'eval_render_mode': 'human', 70 | 'eval_n_episodes': 1, 71 | 'eval_every_n_steps': 5e3, 72 | 'eval_deterministic': False, 73 | 'n_initial_exploration_steps': int(10000), # number of initial exploration steps for model-learning and 74 | # determining uncertainty calibration measurements 75 | #### it is crucial to choose a model that doesn't overfit when trained too often on seen data 76 | ## for model architecture finding: 1. play around with the start samples to find an architecture, that doesn't really overfit 77 | # 2. m_train_freq in can somewhat limit overfitting, but is only treating the symptom 78 | # 3. try finding a balance between the size of new samples per number of 79 | # updates of the model network (with m_train_freq) 80 | 'use_model': True, 81 | 'm_hidden_dims':(512,512), # hidden layer size of model bnn 82 | 'm_loss_type': 'MSPE', 83 | 'm_use_scaler_in': True, 84 | 'm_use_scaler_out': True, 85 | 'm_lr': 1e-3, 86 | 'm_train_freq': 4000, # model is only trained every (self._timestep % self._model_train_freq==0) steps (terminates when stops improving) 87 | 'rollout_batch_size': 1.0e3, # rollout_batch_size is the size of randomly chosen states to start from when rolling out model 88 | 'm_networks': 7, # size of model network ensemble 89 | 'm_elites': 5, # best networks to select from num_networks 90 | 'max_model_t': None, # a timeout for model training (e.g. for speeding up wallclock time) 91 | 'sampling_alpha': 2, # temperature for boltzman-sampling 92 | 'rollout_mode' : 'uncertainty', # 93 | 'rollout_schedule': [10, 500, 5, 30], # if rollout_mode:'schedule' this schedule is defined as 94 | #[min_epoch, max_epoch, min_horizon, max_horizon] 95 | # if rollout_mode:'uncertainty', 'min_horizon' is used as 96 | # the initial rollout horizon and adapted throughout 97 | # training based on per-step uncertainty estimates 98 | # (KL-Divergence). 
99 | 'maxroll': 35, # maximum rollout horizon 100 | 'batch_size_policy': 50000, # batch size per policy update 101 | 'initial_real_samples_per_epoch': 15000, # number of real samples contained in first batch 102 | 'min_real_samples_per_epoch': 500, # absolute minimum of samples 103 | } 104 | }, 105 | } 106 | 107 | BUFFER_PARAMS_PER_ALGO = { 108 | 'CMBPO': { 109 | 'type': 'CPOBuffer', 110 | 'preprocess_type': 'default', 111 | 'kwargs': { 112 | 'size': tune.sample_from(lambda spec: ( 113 | spec.get('config', spec) 114 | ['algorithm_params']['kwargs']['epoch_length'] 115 | )), 116 | 'archive_size': tune.sample_from(lambda spec: ( 117 | { 118 | 'SimpleReplayPool': int(1e6), 119 | 'CPOBuffer':int(3e5), 120 | }.get( 121 | spec.get('config', spec) 122 | ['buffer_params']['type'], 123 | int(1e6)) 124 | )), 125 | } 126 | }, 127 | } 128 | 129 | SAMPLER_PARAMS_PER_ALGO = { 130 | 'default': { 131 | 'type':'CPOSampler', 132 | 'kwargs':{ 133 | 'max_path_length': DEFAULT_MAX_PATH_LENGTH, 134 | 'render_mode': None, 135 | }, 136 | }, 137 | 'CMBPO': { 138 | 'type':'CPOSampler', 139 | 'kwargs':{ 140 | 'max_path_length': DEFAULT_MAX_PATH_LENGTH, 141 | 'render_mode': None, 142 | }, 143 | } 144 | } 145 | 146 | RUN_PARAMS = { 147 | 'seed': tune.sample_from( 148 | lambda spec: np.random.randint(0, 10000)), 149 | 'checkpoint_at_end': True, 150 | 'checkpoint_frequency': 50, 151 | 'checkpoint_buffer': False, 152 | } 153 | 154 | ENV_PARAMS = { 155 | 'normalize_actions':False, 156 | 'kwargs':{} 157 | } 158 | 159 | def get_variant_spec(args, params): 160 | assert hasattr(params, 'universe') and \ 161 | hasattr(params, 'task') and \ 162 | hasattr(params, 'algorithm') and \ 163 | hasattr(params, 'policy') 164 | 165 | universe, task = params.universe, params.task 166 | ENV_PARAMS.update({ 167 | 'universe': universe, 168 | 'task': task, 169 | }) 170 | 171 | algorithm, policy = params.algorithm_params.type, params.policy_params.type 172 | base_spec = { 173 | 'log_dir': f'~/ray_{algorithm.lower()}', 174 | 'exp_name': 'defaults', 175 | 'environment_params': ENV_PARAMS, 176 | 'policy_params': POLICY_PARAMS_BASE[policy], 177 | 'algorithm_params': ALGORITHM_PARAMS[algorithm], 178 | 'buffer_params': BUFFER_PARAMS_PER_ALGO[algorithm], 179 | 'sampler_params': SAMPLER_PARAMS_PER_ALGO[algorithm], 180 | 'run_params': RUN_PARAMS, 181 | } 182 | 183 | variant_spec = deep_update( 184 | base_spec, 185 | params 186 | ) 187 | return variant_spec 188 | -------------------------------------------------------------------------------- /configs/baseconfig/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import glob 4 | import pickle 5 | import sys 6 | import pdb 7 | 8 | import tensorflow as tf 9 | from ray import tune 10 | 11 | from envs.utils import get_env_from_params 12 | from algorithms.utils import get_algorithm_from_params 13 | from policies.utils import get_policy_from_params 14 | from buffers.utils import get_buffer_from_params 15 | from samplers.utils import get_sampler_from_params 16 | 17 | from utilities.utils import set_seed, initialize_tf_variables 18 | from utilities.instrument import run_example_local, run_example_debug 19 | 20 | class ExperimentRunner(tune.Trainable): 21 | def _setup(self, variant): 22 | set_seed(variant['run_params']['seed']) 23 | 24 | self._variant = variant 25 | gpu_options = tf.GPUOptions(allow_growth=True) 26 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 27 | tf.keras.backend.set_session(session) 28 | 
self._session = tf.keras.backend.get_session() 29 | 30 | self.train_generator = None 31 | self._built = False 32 | 33 | def _stop(self): 34 | tf.reset_default_graph() 35 | tf.keras.backend.clear_session() 36 | 37 | def _build(self): 38 | """ 39 | called by tune to build algorithm 40 | """ 41 | variant = copy.deepcopy(self._variant) 42 | 43 | env_params = variant['environment_params'] 44 | env = self.env = ( 45 | get_env_from_params(env_params)) 46 | 47 | buffer = self.buffer = ( 48 | get_buffer_from_params(variant, env)) 49 | sampler = self.sampler = get_sampler_from_params(variant) 50 | policy = self.policy = get_policy_from_params( 51 | variant, env, self._session) 52 | 53 | #### build algorithm 54 | self.algorithm = get_algorithm_from_params( 55 | variant=self._variant, 56 | env=env, 57 | policy=policy, 58 | buffer=buffer, 59 | sampler=sampler, 60 | session=self._session) 61 | 62 | initialize_tf_variables(self._session, only_uninitialized=True) 63 | 64 | # add graph since ray doesn't seem to automatically add that 65 | graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph) 66 | graph_writer.flush() 67 | graph_writer.close() 68 | 69 | #### finalize graph 70 | tf.get_default_graph().finalize() 71 | self._built = True 72 | 73 | 74 | def _train(self): 75 | if not self._built: 76 | self._build() 77 | 78 | if self.train_generator is None: 79 | self.train_generator = self.algorithm.train() 80 | 81 | diagnostics = next(self.train_generator) 82 | 83 | return diagnostics 84 | 85 | def _pickle_path(self, checkpoint_dir): 86 | return os.path.join(checkpoint_dir, 'checkpoint.pkl') 87 | 88 | def _replay_pool_pickle_path(self, checkpoint_dir): 89 | return os.path.join(checkpoint_dir, 'replay_pool.pkl') 90 | 91 | def _tf_checkpoint_prefix(self, checkpoint_dir): 92 | return os.path.join(checkpoint_dir, 'checkpoint') 93 | 94 | def _get_tf_checkpoint(self): 95 | tf_checkpoint = tf.train.Checkpoint(**self.algorithm.tf_saveables) 96 | 97 | return tf_checkpoint 98 | 99 | def _save_replay_pool(self, checkpoint_dir): 100 | replay_pool_pickle_path = self._replay_pool_pickle_path( 101 | checkpoint_dir) 102 | self.buffer.save_latest_experience(replay_pool_pickle_path) 103 | 104 | def _restore_replay_pool(self, current_checkpoint_dir): 105 | experiment_root = os.path.dirname(current_checkpoint_dir) 106 | 107 | experience_paths = [ 108 | self._replay_pool_pickle_path(checkpoint_dir) 109 | for checkpoint_dir in sorted(glob.iglob( 110 | os.path.join(experiment_root, 'checkpoint_*'))) 111 | ] 112 | for experience_path in experience_paths: 113 | self.buffer.load_experience(experience_path) 114 | 115 | def _save(self, checkpoint_dir): 116 | """Implements the saving logic. 117 | @anyboby: implementation very cmbpo specific saving methods, not optimal! 118 | but general interfaces seem hard to implement due to all the different 119 | frameworks (Keras, tf, pickling etc.) 120 | """ 121 | 122 | ## only saves model atm 123 | self.policy_path = self.policy.save(checkpoint_dir) ### @anyboby: this saves all tf objects 124 | self.algorithm.save(checkpoint_dir) 125 | 126 | if self._variant['run_params'].get('checkpoint_replay_pool', False): 127 | self._save_replay_pool(checkpoint_dir) 128 | 129 | return os.path.join(checkpoint_dir, '') 130 | 131 | def _restore(self, checkpoint_dir): 132 | raise NotImplementedError 133 | 134 | def main(argv=None): 135 | """Run ExperimentRunner locally on ray. 
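
    The same entry point is exposed through the `cmbpo` console script
    (see scripts/console_scripts.py and the README), e.g.:

        cmbpo run_local configs.baseconfig --config=configs.cmbpo_hcs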
136 | """ 137 | run_example_local(__package__, argv) 138 | 139 | if __name__ == '__main__': 140 | main(argv=sys.argv[1:]) -------------------------------------------------------------------------------- /configs/baseconfig/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import argparse 3 | from distutils.util import strtobool 4 | import json 5 | 6 | from ray.tune import sample_from 7 | 8 | def add_ray_init_args(parser): 9 | 10 | def init_help_string(help_string): 11 | return help_string + " Passed to `ray.init`." 12 | 13 | parser.add_argument( 14 | '--cpus', 15 | type=int, 16 | default=None, 17 | help=init_help_string("Cpus to allocate to ray process.")) 18 | parser.add_argument( 19 | '--gpus', 20 | type=int, 21 | default=None, 22 | help=init_help_string("Gpus to allocate to ray process.")) 23 | parser.add_argument( 24 | '--resources', 25 | type=json.loads, 26 | default=None, 27 | help=init_help_string("Resources to allocate to ray process.")) 28 | parser.add_argument( 29 | '--include-webui', 30 | type=str, 31 | default=False, 32 | help=init_help_string("Boolean flag indicating whether to start the" 33 | "web UI, which is a Jupyter notebook.")) 34 | parser.add_argument( 35 | '--temp-dir', 36 | type=str, 37 | default=None, 38 | help=init_help_string("If provided, it will specify the root temporary" 39 | " directory for the Ray process.")) 40 | 41 | return parser 42 | 43 | 44 | def add_ray_tune_args(parser): 45 | 46 | def tune_help_string(help_string): 47 | return help_string + " Passed to `tune.run_experiments`." 48 | 49 | parser.add_argument( 50 | '--resources-per-trial', 51 | type=json.loads, 52 | default={}, 53 | help=tune_help_string("Resources to allocate for each trial.")) 54 | parser.add_argument( 55 | '--trial-gpus', 56 | type=float, 57 | default=None, 58 | help=("Resources to allocate for each trial. Passed" 59 | " to `tune.run_experiments`.")) 60 | parser.add_argument( 61 | '--trial-extra-cpus', 62 | type=int, 63 | default=None, 64 | help=("Extra CPUs to reserve in case the trials need to" 65 | " launch additional Ray actors that use CPUs.")) 66 | parser.add_argument( 67 | '--trial-extra-gpus', 68 | type=float, 69 | default=None, 70 | help=("Extra GPUs to reserve in case the trials need to" 71 | " launch additional Ray actors that use GPUs.")) 72 | parser.add_argument( 73 | '--num-samples', 74 | default=1, 75 | type=int, 76 | help=tune_help_string("Number of times to repeat each trial.")) 77 | parser.add_argument( 78 | '--upload-dir', 79 | type=str, 80 | default='', 81 | help=tune_help_string("Optional URI to sync training results to (e.g." 82 | " s3:// or gs://).")) 83 | parser.add_argument( 84 | '--trial-name-template', 85 | type=str, 86 | default='seed:{trial.config[run_params][seed]}', 87 | help=tune_help_string( 88 | "Optional string template for trial name. For example:" 89 | " '{trial.trial_id}-seed={trial.config[run_params][seed]}'")) 90 | parser.add_argument( 91 | '--trial-cpus', 92 | type=int, 93 | default=multiprocessing.cpu_count(), 94 | help=tune_help_string("Resources to allocate for each trial.")) 95 | parser.add_argument( 96 | '--checkpoint-frequency', 97 | type=int, 98 | default=None, 99 | help=tune_help_string( 100 | "How many training iterations between checkpoints." 101 | " A value of 0 (default) disables checkpointing. 
If set," 102 | " takes precedence over variant['run_params']" 103 | "['checkpoint_frequency'].")) 104 | parser.add_argument( 105 | '--checkpoint-at-end', 106 | type=lambda x: bool(strtobool(x)), 107 | default=None, 108 | help=tune_help_string( 109 | "Whether to checkpoint at the end of the experiment. If set," 110 | " takes precedence over variant['run_params']" 111 | "['checkpoint_at_end'].")) 112 | parser.add_argument( 113 | '--max-failures', 114 | default=3, 115 | type=int, 116 | help=tune_help_string( 117 | "Try to recover a trial from its last checkpoint at least this " 118 | "many times. Only applies if checkpointing is enabled.")) 119 | parser.add_argument( 120 | '--restore', 121 | type=str, 122 | default=None, 123 | help=tune_help_string( 124 | "Path to checkpoint. Only makes sense to set if running 1 trial." 125 | " Defaults to None.")) 126 | parser.add_argument( 127 | '--with-server', 128 | type=str, 129 | default=False, 130 | help=tune_help_string("Starts a background Tune server. Needed for" 131 | " using the Client API.")) 132 | 133 | return parser 134 | 135 | 136 | def get_parser(): 137 | parser = argparse.ArgumentParser() 138 | 139 | parser.add_argument( 140 | '--config', 141 | type=str) 142 | 143 | parser.add_argument( 144 | '--checkpoint-replay-pool', 145 | type=lambda x: bool(strtobool(x)), 146 | default=None, 147 | help=("Whether a checkpoint should also saved the replay" 148 | " pool. If set, takes precedence over" 149 | " variant['run_params']['checkpoint_replay_pool']." 150 | " Note that the replay pool is saved (and " 151 | " constructed) piece by piece so that each" 152 | " experience is saved only once.")) 153 | 154 | parser.add_argument( 155 | '--policy', 156 | type=str, 157 | choices=('cpopolicy'), 158 | default='cpopolicy') 159 | 160 | parser.add_argument( 161 | '--mode', type=str, default='local') 162 | parser.add_argument( 163 | '--confirm-remote', 164 | type=lambda x: bool(strtobool(x)), 165 | nargs='?', 166 | const=True, 167 | default=True, 168 | help="Whether or not to query yes/no on remote run.") 169 | 170 | parser.add_argument( 171 | '--video-save-frequency', 172 | type=int, 173 | default=None, 174 | help="Save frequency for videos.") 175 | 176 | parser = add_ray_init_args(parser) 177 | parser = add_ray_tune_args(parser) 178 | 179 | return parser 180 | 181 | def variant_equals(*keys): 182 | def get_from_spec(spec): 183 | # TODO(hartikainen): This may break in some cases. ray.tune seems to 184 | # add a 'config' key at the top of the spec, whereas `generate_variants` 185 | # does not. 186 | node = spec.get('config', spec) 187 | for key in keys: 188 | node = node[key] 189 | 190 | return node 191 | 192 | return sample_from(get_from_spec) 193 | -------------------------------------------------------------------------------- /configs/cmbpo_antsafe.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'AntSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(10000), 16 | #### it is crucial to choose a model that doesn't overfit when trained too often on seen data 17 | ## for model architecture finding: 1. play around with the start samples to find an architecture, that doesn't really overfit 18 | # 2. 
m_train_freq in can somewhat limit overfitting, but is only treating the symptom 19 | # 3. try finding a balance between the size of new samples per number of 20 | # updates of the model network (with m_train_freq) 21 | 'use_model': True, 22 | 'm_hidden_dims':(512,512), # hidden layer size of model bnn 23 | 'm_loss_type': 'MSPE', 24 | 'm_use_scaler_in': True, 25 | 'm_use_scaler_out': True, 26 | 'm_lr': 1e-3, 27 | 'm_train_freq': 4000, # model is only trained every (self._timestep % self._model_train_freq==0) steps (terminates when stops improving) 28 | 'rollout_batch_size': 1.0e3, # rollout_batch_size is the size of randomly chosen states to start from when rolling out model 29 | 'm_networks': 7, # size of model network ensemble 30 | 'm_elites': 5, # best networks to select from num_networks 31 | 'max_model_t': None, # a timeout for model training (e.g. for speeding up wallclock time) 32 | 'sampling_alpha': 2, 33 | 'rollout_mode' : 'uncertainty', #### choose from 'schedule', or 'uncertainty' 34 | 'rollout_schedule': [10, 500, 5, 30], #[15, 100, 1, 15], # min_epoch, max_epoch, min_length, max_length = self._rollout_schedule 35 | # increases rollout length from min_length to max_length over 36 | # range of (min_epoch, max_epoch) 37 | ### Only applies if rollout_mode=='schedule' 38 | 'maxroll': 35, # maximum rollout horizon 39 | 'batch_size_policy': 50000, # batch size before policy is updates 40 | 'initial_real_samples_per_epoch': 20000, # number of real samples contained in first batch 41 | 'min_real_samples_per_epoch': 500, # absolute minimum of samples 42 | } 43 | }, 44 | 'policy_params':{ 45 | 'type':'cpopolicy', 46 | 'kwargs':{ 47 | 'constrain_cost': True, # constrain_cost=False will perform TRPO updates 48 | 'a_hidden_layer_sizes': (128, 128), # policy network hidden layers 49 | 'vf_lr': 3e-4, # learn rate for value learning 50 | 'vf_hidden_layer_sizes':(128,128), # nn hidden layers for vf 51 | 'vf_epochs': 8, # number of training epochs for values 52 | 'vf_batch_size': 2048, # minibatches for value training 53 | 'vf_ensemble_size': 3, # vf ensemble size 54 | 'vf_elites': 2, # vf elites 55 | 'vf_activation': 'swish', # activation function 56 | 'vf_loss': 'MSE', # choose from 'NLL', 'MSPE' (inc. var); 'MSE' ; 'Huber' 57 | 'vf_decay': 1e-6, # decay for nn regularization 58 | 'vf_clipping': False, # clip losses for a trust-region like vf update 59 | 'vf_kl_cliprange': 0.0, # only applicable if vf_clippping=True 60 | 'ent_reg': 0, # 5e-3 # exploration bonus for maintaining pol. 
entropy 61 | 'target_kl': 0.01, # trust region diameter 62 | 'cost_lim': 10, 63 | 'cost_lam': .5, # gae lambda 64 | 'cost_gamma': 0.97, # discounts 65 | 'lam': .95, # gae lambda 66 | 'gamma': 0.99, # discounts 67 | } 68 | }, 69 | 'buffer_params': {}, 70 | 'sampler_params': { 71 | 'kwargs':{ 72 | 'render_mode':None, #'human' 73 | } 74 | }, 75 | 'run_params': {}, 76 | } -------------------------------------------------------------------------------- /configs/cmbpo_hcs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HalfCheetahSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(10000), 16 | 'use_model': True, 17 | 'm_hidden_dims':(512,512), 18 | 'm_loss_type': 'MSPE', 19 | 'm_use_scaler_in': True, 20 | 'm_use_scaler_out': True, 21 | 'm_lr': 1e-3, 22 | 'm_train_freq': 4000, 23 | 'rollout_batch_size': 1.0e3, 24 | 'm_networks': 7, 25 | 'm_elites': 5, 26 | 'max_model_t': None, 27 | 'sampling_alpha': 2, 28 | 'rollout_mode' : 'uncertainty', 29 | 'rollout_schedule': [10, 500, 5, 30], 30 | 'maxroll': 35, 31 | 'batch_size_policy': 50000, 32 | 'initial_real_samples_per_epoch': 15000, 33 | 'min_real_samples_per_epoch': 500, 34 | } 35 | }, 36 | 'policy_params':{ 37 | 'type':'cpopolicy', 38 | 'kwargs':{ 39 | 'constrain_cost': False, 40 | 'a_hidden_layer_sizes': (128, 128), 41 | 'vf_lr': 3e-4, 42 | 'vf_hidden_layer_sizes':(128,128), 43 | 'vf_epochs': 8, 44 | 'vf_batch_size': 2048, 45 | 'vf_ensemble_size': 3, 46 | 'vf_elites': 2, 47 | 'vf_activation': 'swish', 48 | 'vf_loss': 'MSE', 49 | 'vf_decay': 1e-6, 50 | 'vf_clipping': False, 51 | 'vf_kl_cliprange': 0.0, 52 | 'ent_reg': 0, # 5e-3 53 | 'target_kl': 0.01, 54 | 'lam': .95, 55 | 'gamma': 0.99, 56 | } 57 | }, 58 | 'buffer_params': {}, 59 | 'sampler_params': { 60 | 'kwargs':{ 61 | 'render_mode':'human', 62 | } 63 | }, 64 | 'run_params': {}, 65 | } -------------------------------------------------------------------------------- /configs/cmbpo_hs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HumanoidSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(10000), 16 | 'use_model': True, 17 | 'm_hidden_dims':(512,512), 18 | 'm_loss_type': 'MSPE', 19 | 'm_use_scaler_in': True, 20 | 'm_use_scaler_out': True, 21 | 'm_lr': 1e-3, 22 | 'm_train_freq': 4000, 23 | 'rollout_batch_size': 1.0e3, 24 | 'm_networks': 7, 25 | 'm_elites': 5, 26 | 'max_model_t': None, 27 | 'sampling_alpha': 2, 28 | 'rollout_mode' : 'uncertainty', 29 | 'rollout_schedule': [10, 500, 5, 30], 30 | 'maxroll': 35, 31 | 'batch_size_policy': 50000, 32 | 'initial_real_samples_per_epoch': 15000, 33 | 'min_real_samples_per_epoch': 500, 34 | } 35 | }, 36 | 'policy_params':{ 37 | 'type':'cpopolicy', 38 | 'kwargs':{ 39 | 'constrain_cost': False, 40 | 'a_hidden_layer_sizes': (128, 128), 41 | 'vf_lr': 3e-4, 42 | 'vf_hidden_layer_sizes':(128,128), 43 | 'vf_epochs': 8, 44 | 'vf_batch_size': 2048, 45 | 'vf_ensemble_size': 3, 46 | 'vf_elites': 2, 47 | 'vf_activation': 'swish', 48 | 
'vf_loss': 'MSE', 49 | 'vf_decay': 1e-6, 50 | 'vf_clipping': False, 51 | 'vf_kl_cliprange': 0.0, 52 | 'ent_reg': 0, # 5e-3 53 | 'target_kl': 0.01, 54 | 'lam': .95, 55 | 'gamma': 0.99, 56 | } 57 | }, 58 | 'buffer_params': {}, 59 | 'sampler_params': { 60 | 'kwargs':{ 61 | 'render_mode':'human', 62 | } 63 | }, 64 | 'run_params': {}, 65 | } -------------------------------------------------------------------------------- /configs/cpo_hcs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HalfCheetahSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_every_n_steps': 5e3, 15 | 'n_initial_exploration_steps': int(0), 16 | 'use_model': False, 17 | 'batch_size_policy': 35000, 18 | } 19 | }, 20 | 'policy_params':{ 21 | 'type':'cpopolicy', 22 | 'kwargs':{ 23 | 'constrain_cost': True, 24 | 'a_hidden_layer_sizes': (128, 128), 25 | 'vf_lr': 3e-4, 26 | 'vf_hidden_layer_sizes':(128,128), 27 | 'vf_epochs': 8, 28 | 'vf_batch_size': 2048, 29 | 'vf_ensemble_size': 3, 30 | 'vf_elites': 2, 31 | 'vf_activation': 'swish', 32 | 'vf_loss': 'MSE', 33 | 'vf_decay': 1e-6, 34 | 'vf_clipping': False, 35 | 'vf_kl_cliprange': 0.0, 36 | 'ent_reg': 0, # 5e-3 37 | 'target_kl': 0.01, 38 | 'lam': .95, 39 | 'gamma': 0.99, 40 | } 41 | }, 42 | 'buffer_params': {}, 43 | 'sampler_params': { 44 | 'kwargs':{ 45 | 'render_mode':None, 46 | } 47 | }, 48 | 'run_params': {}, 49 | } -------------------------------------------------------------------------------- /configs/trpo_hcs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | params = { 4 | 'universe': 'gym', 5 | 'task': 'HalfCheetahSafe-v2', 6 | 'environment_params': { 7 | 'normalize_actions': True, 8 | }, 9 | 'algorithm_params': { 10 | 'type': 'CMBPO', 11 | 'kwargs':{ 12 | 'n_env_interacts': int(10e6), 13 | 'epoch_length': 50000, 14 | 'eval_render_mode': 'human', 15 | 'eval_n_episodes': 1, 16 | 'eval_every_n_steps': 5e3, 17 | 'eval_deterministic': False, 18 | 'n_initial_exploration_steps': int(0), 19 | 'use_model': False, 20 | 'batch_size_policy': 25000, 21 | } 22 | }, 23 | 'policy_params':{ 24 | 'type':'cpopolicy', 25 | 'kwargs':{ 26 | 'constrain_cost': False, 27 | 'a_hidden_layer_sizes': (128, 128), 28 | 'vf_lr': 3e-4, 29 | 'vf_hidden_layer_sizes':(128,128), 30 | 'vf_epochs': 8, 31 | 'vf_batch_size': 2048, 32 | 'vf_ensemble_size': 3, 33 | 'vf_elites': 2, 34 | 'vf_activation': 'swish', 35 | 'vf_loss': 'MSE', 36 | 'vf_decay': 1e-6, 37 | 'vf_clipping': False, 38 | 'vf_kl_cliprange': 0.0, 39 | 'ent_reg': 0, 40 | 'target_kl': 0.01, 41 | 'cost_lim': 10, 42 | 'cost_lam': .5, 43 | 'cost_gamma': 0.97, 44 | 'lam': .95, 45 | 'gamma': 0.99, 46 | } 47 | }, 48 | 'buffer_params': {}, 49 | 'sampler_params': { 50 | 'kwargs':{ 51 | 'render_mode':None, 52 | } 53 | }, 54 | 'run_params': {}, 55 | } -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/__init__.py -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/__init__.py: 
-------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | import gym 3 | 4 | import os 5 | import sys 6 | dirpath = os.path.dirname(os.path.dirname(__file__)) 7 | sys.path.append(dirpath) 8 | 9 | env_specs = gym.envs.registry.env_specs 10 | 11 | if 'HumanoidSafe-v2' not in env_specs: 12 | register( 13 | id='HumanoidSafe-v2', 14 | entry_point='mujoco_safety_gym.envs:HumanoidEnv', 15 | max_episode_steps=1000, 16 | ) 17 | if 'AntSafe-v2' not in env_specs: 18 | register( 19 | id='AntSafe-v2', 20 | entry_point='mujoco_safety_gym.envs:AntEnv', 21 | max_episode_steps=1000, 22 | ) 23 | if 'AntSafeVisualize-v2' not in env_specs: 24 | register( 25 | id='AntSafeVisualize-v2', 26 | entry_point='mujoco_safety_gym.envs:AntEnvVisualize', 27 | max_episode_steps=1000, 28 | ) 29 | if 'HopperSafe-v2' not in env_specs: 30 | register( 31 | id='HopperSafe-v2', 32 | entry_point='mujoco_safety_gym.envs:HopperEnv', 33 | max_episode_steps=1000, 34 | ) 35 | if 'HalfCheetahSafe-v2' not in env_specs: 36 | register( 37 | id='HalfCheetahSafe-v2', 38 | entry_point='mujoco_safety_gym.envs:HalfCheetahEnv', 39 | max_episode_steps=1000, 40 | ) 41 | if 'FetchPushSafety-v0' not in env_specs: 42 | register( 43 | id='FetchPushSafety-v0', 44 | entry_point='mujoco_safety_gym.envs:FetchPushEnv', 45 | max_episode_steps=1000, 46 | ) 47 | if 'FetchReachSafety-v0' not in env_specs: 48 | register( 49 | id='FetchReachSafety-v0', 50 | entry_point='mujoco_safety_gym.envs:FetchReachEnv', 51 | max_episode_steps=1000, 52 | ) 53 | if 'FetchSlideSafety-v0' not in env_specs: 54 | register( 55 | id='FetchSlideSafety-v0', 56 | entry_point='mujoco_safety_gym.envs:FetchSlideEnv', 57 | max_episode_steps=1000, 58 | ) -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from mujoco_safety_gym.envs.humanoid import HumanoidEnv 2 | from mujoco_safety_gym.envs.ant import AntEnv 3 | from mujoco_safety_gym.envs.ant_viz import AntEnvVisualize 4 | from mujoco_safety_gym.envs.half_cheetah import HalfCheetahEnv 5 | from mujoco_safety_gym.envs.hopper import HopperEnv 6 | from mujoco_safety_gym.envs.fetch.push import FetchPushEnv 7 | from mujoco_safety_gym.envs.fetch.reach import FetchReachEnv 8 | from mujoco_safety_gym.envs.fetch.slide import FetchSlideEnv 9 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.get_body_com("torso")[0] 13 | self.do_simulation(a, self.frame_skip) 14 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 15 | xposafter = self.get_body_com("torso")[0] 16 | forward_reward = (xposafter - xposbefore)/self.dt 17 | ctrl_cost = .5 * np.square(a).sum() 18 | contact_cost = 0.5 * 1e-3 * np.sum( 19 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 20 | survive_reward = 1.0 21 | 22 | ### safety stuff 23 | yposafter = 
self.get_body_com("torso")[1] 24 | ywall = np.array([-5,5]) 25 | if xposafter<20: 26 | y_walldist = yposafter - xposafter*np.tan(30/360*2*np.pi)+ywall 27 | elif xposafter>20 and xposafter<60: 28 | y_walldist = yposafter + (xposafter-40)*np.tan(30/360*2*np.pi) - ywall 29 | elif xposafter>60 and xposafter<100: 30 | y_walldist = yposafter - (xposafter-80)*np.tan(30/360*2*np.pi) + ywall 31 | else: 32 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall 33 | 34 | obj_cost = (abs(y_walldist)<1.8).any()*1.0 35 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 36 | 37 | body_quat = self.data.get_body_xquat('torso') 38 | z_rot = 1-2*(body_quat[1]**2+body_quat[2]**2) ### normally xx-rotation, not sure what axes mujoco uses 39 | state = self.state_vector() 40 | notdone = np.isfinite(state).all() \ 41 | and state[2] >= 0.2 and state[2] <= 1.0 \ 42 | and z_rot >= -0.7 43 | done = not notdone 44 | done_cost = done*1.0 45 | cost = np.clip(obj_cost+done_cost, 0, 1) 46 | ob = self._get_obs() 47 | return ob, reward, done, dict( 48 | reward_forward=forward_reward, 49 | reward_ctrl=-ctrl_cost, 50 | reward_contact=-contact_cost, 51 | reward_survive=survive_reward, 52 | cost_obj = obj_cost, 53 | cost_done = done_cost, 54 | cost = cost, 55 | ) 56 | 57 | def _get_obs(self): 58 | x = self.sim.data.qpos.flat[0] 59 | y = self.sim.data.qpos.flat[1] 60 | if x<20: 61 | y_off = y - x*np.tan(30/360*2*np.pi) 62 | elif x>20 and x<60: 63 | y_off = y + (x-40)*np.tan(30/360*2*np.pi) 64 | elif x>60 and x<100: 65 | y_off = y - (x-80)*np.tan(30/360*2*np.pi) 66 | else: 67 | y_off = y - 20*np.tan(30/360*2*np.pi) 68 | 69 | return np.concatenate([ 70 | self.sim.data.qpos.flat[2:-42], 71 | self.sim.data.qvel.flat[:-36], 72 | [x/5], 73 | [y_off], 74 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 75 | ]) 76 | 77 | def reset_model(self): 78 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 79 | qpos[-42:] = self.init_qpos[-42:] 80 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 81 | qvel[-36:] = self.init_qvel[-36:] 82 | self.set_state(qpos, qvel) 83 | return self._get_obs() 84 | 85 | 86 | def viewer_setup(self): 87 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/ant_viz.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | class AntEnvVisualize(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'ant_viz.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.get_body_com("torso")[0] 13 | self.do_simulation(a, self.frame_skip) 14 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 15 | xposafter = self.get_body_com("torso")[0] 16 | forward_reward = (xposafter - xposbefore)/self.dt 17 | ctrl_cost = .5 * np.square(a).sum() 18 | contact_cost = 0.5 * 1e-3 * np.sum( 19 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 20 | survive_reward = 1.0 21 | 22 | ### safety stuff 23 | yposafter = self.get_body_com("torso")[1] 24 | ywall = np.array([-5,5]) 25 | if xposafter<20: 26 | y_walldist = yposafter - xposafter*np.tan(30/360*2*np.pi)+ywall 27 | elif xposafter>20 and xposafter<60: 28 | y_walldist = 
yposafter + (xposafter-40)*np.tan(30/360*2*np.pi) - ywall 29 | elif xposafter>60 and xposafter<100: 30 | y_walldist = yposafter - (xposafter-80)*np.tan(30/360*2*np.pi) + ywall 31 | else: 32 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall 33 | 34 | obj_cost = (abs(y_walldist)<1.8).any()*1.0 35 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 36 | 37 | body_quat = self.data.get_body_xquat('torso') 38 | z_rot = 1-2*(body_quat[1]**2+body_quat[2]**2) ### normally xx-rotation, not sure what axes mujoco uses 39 | state = self.state_vector() 40 | notdone = np.isfinite(state).all() \ 41 | and state[2] >= 0.2 and state[2] <= 1.0 \ 42 | and z_rot >= -0.7 43 | done = not notdone 44 | done_cost = done*1.0 45 | cost = np.clip(obj_cost+done_cost, 0, 1) 46 | ob = self._get_obs() 47 | return ob, reward, done, dict( 48 | reward_forward=forward_reward, 49 | reward_ctrl=-ctrl_cost, 50 | reward_contact=-contact_cost, 51 | reward_survive=survive_reward, 52 | cost_obj = obj_cost, 53 | cost_done = done_cost, 54 | cost = cost, 55 | ) 56 | 57 | def _get_obs(self): 58 | x = self.sim.data.qpos.flat[0] 59 | y = self.sim.data.qpos.flat[1] 60 | 61 | x2 = self.sim.data.qpos.flat[15] 62 | y2 = self.sim.data.qpos.flat[16] 63 | 64 | x3 = self.sim.data.qpos.flat[30] 65 | y3 = self.sim.data.qpos.flat[31] 66 | 67 | if x<20: 68 | y_off = y - x*np.tan(30/360*2*np.pi) 69 | elif x>20 and x<60: 70 | y_off = y + (x-40)*np.tan(30/360*2*np.pi) 71 | elif x>60 and x<100: 72 | y_off = y - (x-80)*np.tan(30/360*2*np.pi) 73 | else: 74 | y_off = y - 20*np.tan(30/360*2*np.pi) 75 | 76 | qpos1 = self.sim.data.qpos.flat[2:15] 77 | qvel1 = self.sim.data.qvel.flat[:14] 78 | 79 | if x2<20: 80 | y_off2 = y2- x2*np.tan(30/360*2*np.pi) 81 | elif x2>20 and x<60: 82 | y_off2 = y2 + (x2-40)*np.tan(30/360*2*np.pi) 83 | elif x2>60 and x<100: 84 | y_off2 = y2 - (x2-80)*np.tan(30/360*2*np.pi) 85 | else: 86 | y_off2 = y2 - 20*np.tan(30/360*2*np.pi) 87 | 88 | qpos2 = self.sim.data.qpos.flat[17:30] 89 | qvel2 = self.sim.data.qvel.flat[14:28] 90 | 91 | if x3<20: 92 | y_off3 = y3 - x3*np.tan(30/360*2*np.pi) 93 | elif x3>20 and x<60: 94 | y_off3 = y3 + (x3-40)*np.tan(30/360*2*np.pi) 95 | elif x3>60 and x<100: 96 | y_off3 = y3 - (x3-80)*np.tan(30/360*2*np.pi) 97 | else: 98 | y_off3 = y3 - 20*np.tan(30/360*2*np.pi) 99 | 100 | qpos3 = self.sim.data.qpos.flat[32:45] 101 | qvel3 = self.sim.data.qvel.flat[28:42] 102 | 103 | return np.concatenate([ 104 | qpos1, 105 | qvel1, 106 | [x/5], 107 | [y_off], 108 | qpos2, 109 | qvel2, 110 | [x2/5], 111 | [y_off2], 112 | qpos3, 113 | qvel3, 114 | [x3/5], 115 | [y_off3], 116 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 117 | ]) 118 | 119 | def reset_model(self): 120 | # qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 121 | # qpos[-42:] = self.init_qpos[-42:] 122 | # qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 123 | # qvel[-36:] = self.init_qvel[-36:] 124 | qpos = self.init_qpos 125 | qvel = self.init_qvel 126 | self.set_state(qpos, qvel) 127 | return self._get_obs() 128 | 129 | 130 | def viewer_setup(self): 131 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/assets/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 129 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/assets/fetch/pick_and_place.xml: 
-------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 36 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/push.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 43 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/reach.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 38 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/robot.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 121 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/shared.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 67 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/slide.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 33 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/half_cheetah.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 111 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/hopper.xml: -------------------------------------------------------------------------------- [XML model definition not preserved in this text export; 65 lines] --------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/textures/block.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/mujoco_safety_gym/envs/assets/textures/block.png -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/assets/textures/block_hidden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/mujoco_safety_gym/envs/assets/textures/block_hidden.png -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/pick_and_place.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gym import utils 3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew 4 | 5 | 6 | # Ensure we get the path separator correct on windows 7 | MODEL_XML_PATH = os.path.join('fetch', 'pick_and_place.xml') 8 | 9 | 10 | class FetchPickAndPlaceEnv(FetchEnvNew, utils.EzPickle): 11 | def __init__(self, reward_type='sparse'): 12 | initial_qpos = { 13 | 'robot0:slide0': 0.405, 14 | 'robot0:slide1': 0.48, 15 | 'robot0:slide2': 0.0, 16 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], 17 | } 18 | FetchEnvNew.__init__( 19 | self, MODEL_XML_PATH, has_object=True, block_gripper=False, n_substeps=20, 20 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0, 21 | obj_range=0.15, target_range=0.15, distance_threshold=0.05, additional_objects=False, 22 | number_of_objects = 0, initial_qpos=initial_qpos, reward_type=reward_type) 23 | utils.EzPickle.__init__(self) 24 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/push.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gym import utils 3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew 4 | 5 | 6 | # Ensure we get the path separator correct on windows 7 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml') 8 | 9 | 10 | class FetchPushEnv(FetchEnvNew, utils.EzPickle): 11 | def __init__(self, reward_type='sparse', additional_objects=False, number_of_objects=5): 12 | initial_qpos = { 13 | 'robot0:slide0': 0.405, 14 | 'robot0:slide1': 0.48, 15 | 'robot0:slide2': 0.0, 16 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.] 
17 | } 18 | FetchEnvNew.__init__( 19 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 20 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, 21 | obj_range=0.10, target_range=0.15, distance_threshold=0.05, additional_objects=additional_objects, 22 | number_of_objects = number_of_objects, initial_qpos=initial_qpos, reward_type=reward_type) 23 | utils.EzPickle.__init__(self) 24 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/reach.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gym import utils 3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew 4 | 5 | 6 | # Ensure we get the path separator correct on windows 7 | MODEL_XML_PATH = os.path.join('fetch', 'reach.xml') 8 | 9 | 10 | class FetchReachEnv(FetchEnvNew, utils.EzPickle): 11 | def __init__(self, reward_type='sparse', additional_objects=False, number_of_objects=5): 12 | initial_qpos = { 13 | 'robot0:slide0': 0.405, 14 | 'robot0:slide1': 0.48, 15 | 'robot0:slide2': 0.0, 16 | } 17 | FetchEnvNew.__init__( 18 | self, MODEL_XML_PATH, has_object=False, block_gripper=True, n_substeps=20, 19 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, 20 | obj_range=0.1, target_range=0.2, distance_threshold=0.05, additional_objects=additional_objects, 21 | number_of_objects = number_of_objects, initial_qpos=initial_qpos, reward_type=reward_type) 22 | utils.EzPickle.__init__(self) -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/fetch/slide.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from gym import utils 5 | from mujoco_safety_gym.envs import fetch_env 6 | 7 | 8 | # Ensure we get the path separator correct on windows 9 | MODEL_XML_PATH = os.path.join('fetch', 'slide.xml') 10 | 11 | 12 | class FetchSlideEnv(fetch_env.FetchEnvNew, utils.EzPickle): 13 | def __init__(self, reward_type='sparse'): 14 | initial_qpos = { 15 | 'robot0:slide0': 0.05, 16 | 'robot0:slide1': 0.48, 17 | 'robot0:slide2': 0.0, 18 | 'object0:joint': [1.7, 1.1, 0.41, 1., 0., 0., 0.], 19 | } 20 | fetch_env.FetchEnvNew.__init__( 21 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 22 | gripper_extra_height=-0.02, target_in_the_air=False, target_offset=np.array([0.4, 0.0, 0.0]), 23 | obj_range=0.1, target_range=0.3, distance_threshold=0.05, additional_objects=False, 24 | number_of_objects = 0, initial_qpos=initial_qpos, reward_type=reward_type) 25 | utils.EzPickle.__init__(self) 26 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mujoco_safety_gym.envs import mujoco_env 4 | import mujoco_py as mjp 5 | from gym import error, spaces 6 | 7 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, action): 13 | xposbefore = self.sim.data.qpos[1] 14 | 15 | t = self.data.time 16 | wall_act = .02*np.sin(t/3)**2 - .004 17 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 
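        #### The model has one extra actuator that drives the moving wall obstacle:
        #### the agent's action space drops that last control dim (see _set_action_space
        #### below), and the scripted, time-varying wall force wall_act is appended here
        #### before the simulation step.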
18 | action_p_wall = np.concatenate((np.squeeze(action), [wall_act])) 19 | 20 | self.do_simulation(action_p_wall, self.frame_skip) 21 | xposafter = self.sim.data.qpos[1] 22 | 23 | wallpos = self.data.get_geom_xpos("obj_geom")[0] 24 | wallvel = self.data.get_body_xvelp("obj1")[0] 25 | xdist = wallpos-xposafter 26 | obj_cost = int(np.abs(xdist)<2) 27 | if obj_cost>0: 28 | self.model.geom_rgba[9] = [1.0, 0, 0, 1.0] 29 | else: 30 | self.model.geom_rgba[9] = [1.0, 0.5, 0.5, .8] 31 | ob = self._get_obs() 32 | reward_ctrl = - 0.1 * np.square(action).sum() 33 | reward_run = (xposafter - xposbefore)/self.dt 34 | reward = reward_ctrl + reward_run 35 | done = False 36 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl, cost=obj_cost) 37 | 38 | def _get_obs(self): 39 | wallvel = self.data.get_body_xvelp("obj1")[0] 40 | wall_f = .02*np.sin(self.data.time/3)**2 - .004 41 | xdist = (self.data.get_geom_xpos("obj_geom")[0]-self.sim.data.qpos[1])/10 42 | 43 | return np.concatenate([ 44 | self.sim.data.qpos.flat[2:], 45 | self.sim.data.qvel.flat[1:], 46 | [wallvel], 47 | [wall_f], 48 | np.clip([xdist], -5, 5), 49 | ]) 50 | 51 | def reset_model(self): 52 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 53 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 54 | self.set_state(qpos, qvel) 55 | return self._get_obs() 56 | 57 | def viewer_setup(self): 58 | self.viewer.cam.distance = self.model.stat.extent * 0.5 59 | 60 | 61 | def _set_action_space(self): 62 | bounds = self.model.actuator_ctrlrange.copy().astype(np.float32) 63 | low, high = bounds.T 64 | low, high = low[:-1], high[:-1] 65 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) 66 | return self.action_space 67 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4) 9 | utils.EzPickle.__init__(self) 10 | self.last_mocx = 5 #### vel readings are super noisy for mocap weld 11 | 12 | def step(self, a): 13 | posbefore = self.sim.data.qpos[3] 14 | t = self.data.time 15 | pos = (t + np.sin(t)) + 3 16 | self.data.set_mocap_pos('mocap1', [pos, 0, 0.5]) 17 | 18 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 19 | self.do_simulation(a, self.frame_skip) 20 | posafter, height, ang = self.sim.data.qpos[3:6] 21 | alive_bonus = 1.0 22 | 23 | mocapx = self.sim.data.qpos[0] 24 | xdist = mocapx-posafter 25 | cost = int(np.abs(xdist)<1) 26 | 27 | reward = (posafter - posbefore) / self.dt 28 | reward += alive_bonus 29 | reward -= 1e-3 * np.square(a).sum() 30 | s = self.state_vector() 31 | done = not (np.isfinite(s).all() and (np.abs(s[5:]) < 100).all() and 32 | (height > .7) and (abs(ang) < .2)) 33 | ob = self._get_obs() 34 | return ob, reward, done, {'cost':cost} 35 | 36 | def _get_obs(self): 37 | x = self.sim.data.qpos[3] 38 | mocapx = self.sim.data.qpos[0] 39 | mocvel = 1 + np.cos(self.data.time) 40 | mocacc = -np.sin(self.data.time) 41 | return np.concatenate([ 42 | self.sim.data.qpos.flat[4:], 43 | np.clip(self.sim.data.qvel[3:].flat, -10, 10), 44 | [mocvel], 45 
| [mocacc], 46 | [mocapx-x], 47 | ]) 48 | 49 | def reset_model(self): 50 | qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq) 51 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 52 | self.set_state(qpos, qvel) 53 | return self._get_obs() 54 | 55 | def last_mocap_x(self): 56 | 57 | return self.last_mocx 58 | 59 | def viewer_setup(self): 60 | self.viewer.cam.trackbodyid = 2 61 | self.viewer.cam.distance = self.model.stat.extent * 0.75 62 | self.viewer.cam.lookat[2] = 1.15 63 | self.viewer.cam.elevation = -20 64 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_safety_gym.envs import mujoco_env 3 | from gym import utils 4 | import mujoco_py as mjp 5 | 6 | def mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 10 | 11 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 12 | def __init__(self): 13 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 14 | utils.EzPickle.__init__(self) 15 | 16 | def _get_obs(self): 17 | data = self.sim.data 18 | x = data.qpos.flat[0] 19 | y = data.qpos.flat[1] 20 | if x<20: 21 | y_off = y - x*np.tan(30/360*2*np.pi) 22 | elif x>20 and x<60: 23 | y_off = y + (x-40)*np.tan(30/360*2*np.pi) 24 | elif x>60 and x<100: 25 | y_off = y - (x-80)*np.tan(30/360*2*np.pi) 26 | else: 27 | y_off = y - 20*np.tan(30/360*2*np.pi) 28 | 29 | return np.concatenate([data.qpos.flat[2:-42], 30 | data.qvel.flat[:-36], 31 | [x/5], 32 | [y_off]]) 33 | 34 | # return np.concatenate([data.qpos.flat[2:], 35 | # data.qvel.flat, 36 | # data.cinert.flat, 37 | # data.cvel.flat, 38 | # data.qfrc_actuator.flat, 39 | # data.cfrc_ext.flat]) 40 | 41 | def step(self, a): 42 | pos_before = mass_center(self.model, self.sim) 43 | self.do_simulation(a, self.frame_skip) 44 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200 45 | pos_after = mass_center(self.model, self.sim) 46 | alive_bonus = 5.0 47 | data = self.sim.data 48 | lin_vel_cost = 1.25 * (pos_after - pos_before) / self.dt 49 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 50 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 51 | quad_impact_cost = min(quad_impact_cost, 10) 52 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 53 | 54 | yposafter = self.get_body_com("torso")[1] 55 | ywall = np.array([-5,5]) 56 | if pos_after<20: 57 | y_walldist = yposafter - pos_after*np.tan(30/360*2*np.pi)+ywall 58 | elif pos_after>20 and pos_after<60: 59 | y_walldist = yposafter + (pos_after-40)*np.tan(30/360*2*np.pi) - ywall 60 | elif pos_after>60 and pos_after<100: 61 | y_walldist = yposafter - (pos_after-80)*np.tan(30/360*2*np.pi) + ywall 62 | else: 63 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall 64 | obj_cost = (abs(y_walldist)<2).any()*1.0 65 | 66 | 67 | qpos = self.sim.data.qpos 68 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 69 | 70 | done_cost = done*1.0 71 | cost = np.clip(obj_cost+done_cost, 0, 1) 72 | 73 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, 74 | reward_quadctrl=-quad_ctrl_cost, 75 | reward_alive=alive_bonus, 76 | reward_impact=-quad_impact_cost, 77 | cost_obj = obj_cost, 78 | cost_done = 
done_cost, 79 | cost = cost, 80 | ) 81 | 82 | def reset_model(self): 83 | c = 0.01 84 | # self.set_state( 85 | # self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 86 | # self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 87 | # ) 88 | # return self._get_obs() 89 | qpos = self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq) 90 | qpos[-42:] = self.init_qpos[-42:] 91 | qvel = self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 92 | qvel[-36:] = self.init_qvel[-36:] 93 | self.set_state(qpos, qvel) 94 | return self._get_obs() 95 | 96 | 97 | def viewer_setup(self): 98 | self.viewer.cam.trackbodyid = 1 99 | self.viewer.cam.distance = self.model.stat.extent * 1.0 100 | self.viewer.cam.lookat[2] = 2.0 101 | self.viewer.cam.elevation = -20 102 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/mujoco_env.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import os 3 | 4 | 5 | from gym import error, spaces 6 | from gym.utils import seeding 7 | import numpy as np 8 | from os import path 9 | import gym 10 | 11 | try: 12 | import mujoco_py 13 | except ImportError as e: 14 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 15 | 16 | DEFAULT_SIZE = 500 17 | 18 | 19 | def convert_observation_to_space(observation): 20 | if isinstance(observation, dict): 21 | space = spaces.Dict(OrderedDict([ 22 | (key, convert_observation_to_space(value)) 23 | for key, value in observation.items() 24 | ])) 25 | elif isinstance(observation, np.ndarray): 26 | low = np.full(observation.shape, -float('inf'), dtype=np.float32) 27 | high = np.full(observation.shape, float('inf'), dtype=np.float32) 28 | space = spaces.Box(low, high, dtype=observation.dtype) 29 | else: 30 | raise NotImplementedError(type(observation), observation) 31 | 32 | return space 33 | 34 | 35 | class MujocoEnv(gym.Env): 36 | """Superclass for all MuJoCo environments. 
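    Loads the model XML, owns the MjSim instance and provides common stepping,
    rendering and state utilities. Concrete environments implement the hooks listed
    under 'methods to override' below (reset_model, viewer_setup).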
37 | """ 38 | 39 | def __init__(self, model_path, frame_skip): 40 | if model_path.startswith("/"): 41 | fullpath = model_path 42 | else: 43 | fullpath = os.path.join(os.path.dirname(__file__), "./assets", model_path) 44 | if not path.exists(fullpath): 45 | raise IOError("File %s does not exist" % fullpath) 46 | self.frame_skip = frame_skip 47 | self.model = mujoco_py.load_model_from_path(fullpath) 48 | self.sim = mujoco_py.MjSim(self.model) 49 | self.data = self.sim.data 50 | self.viewer = None 51 | self._viewers = {} 52 | 53 | self.metadata = { 54 | 'render.modes': ['human', 'rgb_array', 'depth_array'], 55 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 56 | } 57 | 58 | self.init_qpos = self.sim.data.qpos.ravel().copy() 59 | self.init_qvel = self.sim.data.qvel.ravel().copy() 60 | 61 | self._set_action_space() 62 | 63 | action = self.action_space.sample() 64 | observation, _reward, done, _info = self.step(action) 65 | # assert not done 66 | 67 | self._set_observation_space(observation) 68 | 69 | self.seed() 70 | 71 | def _set_action_space(self): 72 | bounds = self.model.actuator_ctrlrange.copy().astype(np.float32) 73 | low, high = bounds.T 74 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) 75 | return self.action_space 76 | 77 | def _set_observation_space(self, observation): 78 | self.observation_space = convert_observation_to_space(observation) 79 | return self.observation_space 80 | 81 | def seed(self, seed=None): 82 | self.np_random, seed = seeding.np_random(seed) 83 | return [seed] 84 | 85 | # methods to override: 86 | # ---------------------------- 87 | 88 | def reset_model(self): 89 | """ 90 | Reset the robot degrees of freedom (qpos and qvel). 91 | Implement this in each subclass. 92 | """ 93 | raise NotImplementedError 94 | 95 | def viewer_setup(self): 96 | """ 97 | This method is called when the viewer is initialized. 98 | Optionally implement this method, if you need to tinker with camera position 99 | and so forth. 
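        For example, the environments in this package set camera attributes such as
        self.viewer.cam.distance, trackbodyid and elevation here.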
100 | """ 101 | pass 102 | 103 | # ----------------------------- 104 | 105 | def reset(self): 106 | self.sim.reset() 107 | ob = self.reset_model() 108 | return ob 109 | 110 | def set_state(self, qpos, qvel): 111 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 112 | old_state = self.sim.get_state() 113 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 114 | old_state.act, old_state.udd_state) 115 | self.sim.set_state(new_state) 116 | self.sim.forward() 117 | 118 | @property 119 | def dt(self): 120 | return self.model.opt.timestep * self.frame_skip 121 | 122 | def do_simulation(self, ctrl, n_frames): 123 | self.sim.data.ctrl[:] = ctrl 124 | for _ in range(n_frames): 125 | self.sim.step() 126 | 127 | def render(self, 128 | mode='human', 129 | width=DEFAULT_SIZE, 130 | height=DEFAULT_SIZE, 131 | camera_id=None, 132 | camera_name=None): 133 | if mode == 'rgb_array': 134 | if camera_id is not None and camera_name is not None: 135 | raise ValueError("Both `camera_id` and `camera_name` cannot be" 136 | " specified at the same time.") 137 | 138 | no_camera_specified = camera_name is None and camera_id is None 139 | if no_camera_specified: 140 | camera_name = 'track' 141 | 142 | if camera_id is None and camera_name in self.model._camera_name2id: 143 | camera_id = self.model.camera_name2id(camera_name) 144 | 145 | self._get_viewer(mode).render(width, height, camera_id=camera_id) 146 | # window size used for old mujoco-py: 147 | data = self._get_viewer(mode).read_pixels(width, height, depth=False) 148 | # original image is upside-down, so flip it 149 | return data[::-1, :, :] 150 | elif mode == 'depth_array': 151 | self._get_viewer(mode).render(width, height) 152 | # window size used for old mujoco-py: 153 | # Extract depth part of the read_pixels() tuple 154 | data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1] 155 | # original image is upside-down, so flip it 156 | return data[::-1, :] 157 | elif mode == 'human': 158 | self._get_viewer(mode).render() 159 | 160 | def close(self): 161 | if self.viewer is not None: 162 | # self.viewer.finish() 163 | self.viewer = None 164 | self._viewers = {} 165 | 166 | def _get_viewer(self, mode): 167 | self.viewer = self._viewers.get(mode) 168 | if self.viewer is None: 169 | if mode == 'human': 170 | self.viewer = mujoco_py.MjViewer(self.sim) 171 | elif mode == 'rgb_array' or mode == 'depth_array': 172 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1) 173 | 174 | self.viewer_setup() 175 | self._viewers[mode] = self.viewer 176 | return self.viewer 177 | 178 | def get_body_com(self, body_name): 179 | return self.data.get_body_xpos(body_name) 180 | 181 | def state_vector(self): 182 | return np.concatenate([ 183 | self.sim.data.qpos.flat, 184 | self.sim.data.qvel.flat 185 | ]) 186 | 187 | def place_random_objects(self): 188 | for i in range(9): 189 | random_color_array = np.append(np.random.uniform(0, 1, size=3), 1) 190 | random_pos_array = np.append(np.random.uniform(-10., 10., size=2), 0.5) 191 | site_id = self.sim.model.geom_name2id('obj' + str(i)) 192 | self.sim.model.geom_rgba[site_id] = random_color_array 193 | self.sim.model.geom_pos[site_id] = random_pos_array 194 | -------------------------------------------------------------------------------- /envs/mujoco_safety_gym/envs/robot_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import numpy as np 4 | 5 | import gym 6 | from gym import error, spaces 7 | from 
gym.utils import seeding 8 | 9 | try: 10 | import mujoco_py 11 | except ImportError as e: 12 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 13 | 14 | DEFAULT_SIZE = 500 15 | 16 | class RobotEnv(gym.GoalEnv): 17 | def __init__(self, model_path, initial_qpos, n_actions, n_substeps): 18 | if model_path.startswith('/'): 19 | fullpath = model_path 20 | else: 21 | fullpath = os.path.join(os.path.dirname(__file__), 'assets', model_path) 22 | if not os.path.exists(fullpath): 23 | raise IOError('File {} does not exist'.format(fullpath)) 24 | 25 | model = mujoco_py.load_model_from_path(fullpath) 26 | self.sim = mujoco_py.MjSim(model, nsubsteps=n_substeps) 27 | self.viewer = None 28 | self._viewers = {} 29 | 30 | self.metadata = { 31 | 'render.modes': ['human', 'rgb_array'], 32 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 33 | } 34 | 35 | self.seed() 36 | self._env_setup(initial_qpos=initial_qpos) 37 | self.initial_state = copy.deepcopy(self.sim.get_state()) 38 | 39 | self.goal = self._sample_goal() 40 | obs = self._get_obs() 41 | self.action_space = spaces.Box(-1., 1., shape=(n_actions,), dtype='float32') 42 | self.observation_space = spaces.Dict(dict( 43 | desired_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'), 44 | achieved_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'), 45 | observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32'), 46 | )) 47 | 48 | @property 49 | def dt(self): 50 | return self.sim.model.opt.timestep * self.sim.nsubsteps 51 | 52 | # Env methods 53 | # ---------------------------- 54 | 55 | def seed(self, seed=None): 56 | self.np_random, seed = seeding.np_random(seed) 57 | return [seed] 58 | 59 | def step(self, action): 60 | # if (action.shape < (4,) and np.ndim(action.shape) == 1): 61 | # action = np.append(action, np.zeros(4 - action.shape[0])) 62 | action = np.clip(action, self.action_space.low, self.action_space.high) 63 | self._set_action(action) 64 | self.sim.step() 65 | self._step_callback() 66 | obs = self._get_obs() 67 | 68 | done = False 69 | info = { 70 | 'is_success': self._is_success(obs['achieved_goal'], self.goal), 71 | 'cost': self._compute_costs(obs), 72 | } 73 | reward = self.compute_reward(obs['achieved_goal'], self.goal, info) 74 | return obs, reward, done, info 75 | 76 | def reset(self, **kwargs): 77 | # Attempt to reset the simulator. Since we randomize initial conditions, it 78 | # is possible to get into a state with numerical issues (e.g. due to penetration or 79 | # Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). 80 | # In this case, we just keep randomizing until we eventually achieve a valid initial 81 | # configuration. 
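        # Note: the goal is sampled once per reset() call, while _reset_sim below may be
        # retried several times until it reports a valid simulator state.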
82 | super(RobotEnv, self).reset() 83 | did_reset_sim = False 84 | self.goal = self._sample_goal().copy() 85 | while not did_reset_sim: 86 | did_reset_sim = self._reset_sim(**kwargs) 87 | obs = self._get_obs() 88 | return obs 89 | 90 | def close(self): 91 | if self.viewer is not None: 92 | # self.viewer.finish() 93 | self.viewer = None 94 | self._viewers = {} 95 | 96 | def render(self, mode='human', width=DEFAULT_SIZE, height=DEFAULT_SIZE): 97 | self._render_callback() 98 | if mode == 'rgb_array': 99 | self._get_viewer(mode).render(width, height) 100 | # window size used for old mujoco-py: 101 | data = self._get_viewer(mode, True).read_pixels(width, height, depth=False) 102 | # original image is upside-down, so flip it 103 | return data[::-1, :, :] 104 | elif mode == 'human': 105 | self._get_viewer(mode).render() 106 | 107 | def _get_viewer(self, mode, cam_fixed=False): 108 | self.viewer = self._viewers.get(mode) 109 | if self.viewer is None: 110 | if mode == 'human': 111 | self.viewer = mujoco_py.MjViewer(self.sim) 112 | elif mode == 'rgb_array': 113 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, 0) 114 | cam_fixed = True 115 | self._viewer_setup(cam_fixed) 116 | self._viewers[mode] = self.viewer 117 | return self.viewer 118 | 119 | # Extension methods 120 | # ---------------------------- 121 | 122 | def _reset_sim(self, **kwargs): 123 | """Resets a simulation and indicates whether or not it was successful. 124 | If a reset was unsuccessful (e.g. if a randomized state caused an error in the 125 | simulation), this method should indicate such a failure by returning False. 126 | In such a case, this method will be called again to attempt a the reset again. 127 | """ 128 | self.sim.set_state(self.initial_state) 129 | self.sim.forward() 130 | return True 131 | 132 | def _get_obs(self): 133 | """Returns the observation. 134 | """ 135 | raise NotImplementedError() 136 | 137 | def _set_action(self, action): 138 | """Applies the given action to the simulation. 139 | """ 140 | raise NotImplementedError() 141 | 142 | def _is_success(self, achieved_goal, desired_goal): 143 | """Indicates whether or not the achieved goal successfully achieved the desired goal. 144 | """ 145 | raise NotImplementedError() 146 | 147 | def _sample_goal(self): 148 | """Samples a new goal and returns it. 149 | """ 150 | raise NotImplementedError() 151 | 152 | def _env_setup(self, initial_qpos): 153 | """Initial configuration of the environment. Can be used to configure initial state 154 | and extract information from the simulation. 155 | """ 156 | pass 157 | 158 | def _viewer_setup(self, cam_fixed=False): 159 | """Initial configuration of the viewer. Can be used to set the camera position, 160 | for example. 161 | """ 162 | pass 163 | 164 | def _render_callback(self): 165 | """A custom callback that is called before rendering. Can be used 166 | to implement custom visualizations. 167 | """ 168 | pass 169 | 170 | def _step_callback(self): 171 | """A custom callback that is called after stepping the simulation. Can be used 172 | to enforce additional constraints on the simulation state. 
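        (e.g. re-clamping selected joint positions after every simulation step)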
173 | """ 174 | pass 175 | 176 | def _compute_costs(self, obs): 177 | """Calculate the costs for the given observation 178 | """ 179 | pass 180 | -------------------------------------------------------------------------------- /envs/utils.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import envs.mujoco_safety_gym 3 | from wrappers import NormalizeActionWrapper 4 | 5 | def get_gym_env(): 6 | import gym 7 | import envs.mujoco_safety_gym 8 | 9 | return gym.make 10 | 11 | def get_safety_gym(): 12 | import safety_gym 13 | 14 | return gym.make 15 | 16 | ENVS_FUNCTIONS = { 17 | 'gym':get_gym_env() 18 | } 19 | 20 | def get_environment(universe, task, environment_kwargs): 21 | env = ENVS_FUNCTIONS[universe](task, **environment_kwargs) 22 | return env 23 | 24 | def get_env_from_params(env_params): 25 | universe = env_params['universe'] 26 | task = env_params['task'] 27 | environment_kwargs = env_params.get('kwargs', {}).copy() 28 | 29 | env = get_environment(universe, task, environment_kwargs) 30 | 31 | #### @anyboby maybe write something nicer for wrappers 32 | if env_params.get('normalize_actions', False): 33 | env = NormalizeActionWrapper(env) 34 | 35 | return env 36 | -------------------------------------------------------------------------------- /envs/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalize_action import NormalizeActionWrapper -------------------------------------------------------------------------------- /envs/wrappers/normalize_action.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | 5 | 6 | __all__ = ['NormalizeActionWrapper'] 7 | 8 | class NormalizeActionWrapper(gym.ActionWrapper): 9 | """Rescale the action space of the environment.""" 10 | 11 | def action(self, action): 12 | if not isinstance(self.env.action_space, spaces.Box): 13 | return action 14 | 15 | # rescale the action 16 | low, high = self.env.action_space.low, self.env.action_space.high 17 | scaled_action = low + (action + 1.0) * (high - low) / 2.0 18 | scaled_action = np.clip(scaled_action, low, high) 19 | 20 | return scaled_action 21 | 22 | def reverse_action(self, action): 23 | raise NotImplementedError 24 | 25 | normalize = NormalizeActionWrapper 26 | -------------------------------------------------------------------------------- /models/base_model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | class BaseModel(abc.ABC): 4 | 5 | @abc.abstractmethod 6 | def predict(self, x): 7 | """ Make predictions, should return (mean, var) if model is probabilistic or mean else""" 8 | raise NotImplementedError 9 | 10 | @abc.abstractmethod 11 | def train(self, x, y, ): 12 | """ Make predictions, should return (mean, var) if model is probabilistic or mean else""" 13 | raise NotImplementedError 14 | 15 | @abc.abstractproperty 16 | def is_probabilistic(self): 17 | """ indicates whether model predictions are probabilistic or deterministic """ 18 | raise NotImplementedError 19 | 20 | @abc.abstractproperty 21 | def is_ensemble(self): 22 | """ indicates whether model is an ensemble """ 23 | raise NotImplementedError 24 | 25 | @abc.abstractproperty 26 | def in_dim(self): 27 | """ dimension of inputs """ 28 | raise NotImplementedError 29 | 30 | @abc.abstractproperty 31 | def out_dim(self): 32 | """ dimension of outputs """ 33 | raise 
NotImplementedError 34 | 35 | class EnsembleModel(BaseModel): 36 | @abc.abstractmethod 37 | def predict_ensemble(self, x): 38 | """ Make predictions of whole ensemble, output shape should be (ensemble, batch_size, y_shape)""" 39 | raise NotImplementedError 40 | 41 | @abc.abstractmethod 42 | def elite_inds(self,): 43 | """ Returns indices of the elite models""" 44 | raise NotImplementedError -------------------------------------------------------------------------------- /models/fake_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import pdb 4 | 5 | from models.pens.pe_factory import build_PE, format_samples_for_dyn, format_samples_for_cost 6 | from models.pens.utils import average_dkl, median_dkl 7 | from models.statics import (REWS_BY_TASK, COST_BY_TASK, TERMS_BY_TASK) 8 | 9 | from itertools import count 10 | import warnings 11 | import time 12 | 13 | EPS = 1e-8 14 | 15 | class FakeEnv: 16 | 17 | def __init__(self, 18 | true_environment, 19 | task, 20 | model, 21 | predicts_delta, 22 | predicts_rew, 23 | predicts_cost, 24 | ): 25 | """ 26 | Creates a fake environment that emulates common RL env methodology: 27 | Args: 28 | true_environment(`env`): true environment, used for shapes 29 | task(`str`): name of the task, used to locate static fallback functions for r, c, or term 30 | model(`BaseModel`): dynamics model, should inherit from BaseModel and implement the corresponding 31 | methods, 32 | inputs dim should be (obs_dim + act_dim,) 33 | predicts_delta(`bool`): Does the model predict state-changes or absolute next-states? 34 | predicts_rew(`bool`): Does the model predict rewards? 35 | If yes: rewards should be included in outputs after dynamics, 36 | i.e.: dim(outputs) = (..., (next_obs, r)) 37 | predicts_cost(`bool`): Does the model predict costs? 38 | If yes: costs should be included in outputs after dynamics and rewards (if applicable), 39 | i.e.: dim(outputs) = (..., (next_obs, r, c)) 40 | """ 41 | self.env = true_environment 42 | self.obs_dim = np.prod(self.observation_space.shape) 43 | self.act_dim = np.prod(self.action_space.shape) 44 | self._task = task 45 | 46 | self._model = model 47 | self._uses_ensemble = self._model.is_ensemble 48 | self._is_probabilistic = self._model.is_probabilistic 49 | 50 | self._predicts_delta = predicts_delta 51 | self._predicts_rew = predicts_rew 52 | self._predicts_cost = predicts_cost 53 | 54 | #### create fake env from model 55 | self.input_dim = self._model.in_dim 56 | self.output_dim = self._model.out_dim 57 | 58 | @property 59 | def observation_space(self): 60 | return self.env.observation_space 61 | 62 | @property 63 | def action_space(self): 64 | return self.env.action_space 65 | 66 | def step(self, obs, act, deterministic=True): 67 | assert len(obs.shape) == len(act.shape) 68 | assert obs.shape[-1]==self.obs_dim and act.shape[-1]==self.act_dim 69 | 70 | ### check dimensionality of obs 71 | obs_depth = len(obs.shape) 72 | if obs_depth == 1: 73 | obs = obs[None] 74 | act = act[None] 75 | return_single=True 76 | else: 77 | return_single = False 78 | 79 | 80 | ### create model inputs 81 | inputs = np.concatenate((obs, act), axis=-1) 82 | 83 | ### if 3D-inputs, we shuffle so different models predict at every step 84 | if obs_depth==3: 85 | inputs, shuffle_indxs = self.forward_shuffle(inputs) 86 | 87 | ### predict 88 | if self._uses_ensemble: 89 | pred = self._model.predict_ensemble(inputs) #### dyn_vars gives ep. 
vars for 90 | else: 91 | pred = self._model.predict(inputs) 92 | 93 | ### split predictions if probabilistic 94 | if self._is_probabilistic: 95 | pred_mean, pred_var = pred 96 | else: 97 | pred_mean, pred_var = pred, np.zeros_like(pred) 98 | 99 | ### shuffle back 100 | if obs_depth==3: 101 | pred_mean, pred_var = self.inverse_shuffle(pred_mean, shuffle_indxs), self.inverse_shuffle(pred_var, shuffle_indxs) 102 | 103 | #### probabilistic transitions if var is predicted and deterministic is passed 104 | pred_std = np.sqrt(pred_var) 105 | if not deterministic: 106 | next_obs = pred_mean[...,:self.obs_dim] + pred_std[...,:self.obs_dim] 107 | else: 108 | next_obs = pred_mean[...,:self.obs_dim] 109 | 110 | #### extract uncertainty measures 111 | if self._uses_ensemble: 112 | ens_ep_var = np.var(next_obs, axis=0) 113 | ens_dkl_path = np.mean(average_dkl(next_obs, pred_std[...,:self.obs_dim]), axis=-1) ##@anyboby gives ugly numbers if var=0 114 | ens_dkl_mean = np.mean(ens_dkl_path) 115 | else: 116 | ens_ep_var = 0 117 | ens_dkl_path = np.zeros(shape=obs.shape[1]) 118 | ens_dkl_mean = 0 119 | 120 | #### choose one model from ensemble randomly, if ensemble and not 3d inputs 121 | if self._uses_ensemble and obs_depth<3: 122 | _, batch_size, _ = next_obs.shape 123 | model_inds = self.random_inds(batch_size) ## only elites 124 | batch_inds = np.arange(0, batch_size) 125 | next_obs = next_obs[model_inds, batch_inds] 126 | else: 127 | next_obs = next_obs 128 | 129 | #### add to obs if delta predictions 130 | if self._predicts_delta: 131 | next_obs += obs 132 | 133 | #### extract rew, cost, or call fallback functions for terms, rews and costs 134 | if TERMS_BY_TASK.get(self._task, None): 135 | terms = TERMS_BY_TASK[self._task](obs, act, next_obs) 136 | else: 137 | terms = TERMS_BY_TASK['default'](obs, act, next_obs) 138 | 139 | if self._predicts_cost: 140 | c = pred_mean[...,-1:] 141 | c = c[model_inds, batch_inds] 142 | pred_mean = pred_mean[...,:-1] 143 | elif COST_BY_TASK.get(self._task, None): 144 | c = COST_BY_TASK[self._task](obs, act, next_obs) 145 | else: 146 | c = np.zeros_like(terms) 147 | 148 | if self._predicts_rew: 149 | r = pred_mean[...,-1:] 150 | r = r[model_inds, batch_inds] 151 | pred_mean = pred_mean[...,:-1] 152 | elif REWS_BY_TASK.get(self._task, None): 153 | r = REWS_BY_TASK[self._task](obs, act, next_obs) 154 | 155 | assert r is not None, \ 156 | "Please provide either static functions or predictions for rewards, costs and terms" 157 | 158 | if return_single: 159 | next_obs = next_obs[0] 160 | r = r[0] 161 | c = c[0] 162 | terms = terms[0] 163 | 164 | info = { 165 | 'ensemble_dkl_mean' : ens_dkl_mean, 166 | 'ensemble_dkl_path' : ens_dkl_path, 167 | 'ensemble_ep_var' : ens_ep_var, 168 | 'rew':r, 169 | 'cost':c, 170 | } 171 | 172 | return next_obs, r, terms, info 173 | 174 | def random_inds(self, size): 175 | if self._model.is_ensemble: 176 | return np.random.choice(self._model.elite_inds, (size)) 177 | else: 178 | return np.random.choice([0], (size)) 179 | 180 | def forward_shuffle(self, ndarray): 181 | """ 182 | shuffles ndarray forward along axis 0 with random elite indices, 183 | Returns shuffled copy of ndarray and indices with which was shuffled 184 | """ 185 | idxs = np.random.permutation(ndarray.shape[0]) 186 | shuffled = ndarray[idxs] 187 | return shuffled, idxs 188 | 189 | def inverse_shuffle(self, ndarray, idxs): 190 | """ 191 | inverses a shuffle of ndarray forward along axis 0, given the used indices. 
192 | Returns unshuffled copy of ndarray 193 | """ 194 | unshuffled = ndarray[idxs] 195 | return unshuffled 196 | 197 | def close(self): 198 | pass 199 | -------------------------------------------------------------------------------- /models/pens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/models/pens/__init__.py -------------------------------------------------------------------------------- /models/pens/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import pdb 4 | 5 | 6 | 7 | def update_dict(dict_a, dict_b, weight_a=.5, weight_b=.5): 8 | """ 9 | creates new updated dict and adds entries according to weights. 10 | for both weights = 1 the entries are added 11 | """ 12 | dict_a_cp = dict(dict_a) 13 | dict_a_cp.update(dict_b) 14 | for k,v in dict_b.items(): 15 | if k in dict_a.keys(): 16 | dict_a_cp[k] = weight_b*dict_b[k] + weight_a*dict_a[k] 17 | return dict_a_cp 18 | 19 | class Progress: 20 | 21 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100): 22 | self.total = total 23 | self.name = name 24 | self.ncol = ncol 25 | self.max_length = max_length 26 | self.indent = indent 27 | self.line_width = line_width 28 | self._speed_update_freq = speed_update_freq 29 | 30 | self._step = 0 31 | self._prev_line = '\033[F' 32 | self._clear_line = ' ' * self.line_width 33 | 34 | self._pbar_size = self.ncol * self.max_length 35 | self._complete_pbar = '#' * self._pbar_size 36 | self._incomplete_pbar = ' ' * self._pbar_size 37 | 38 | self.lines = [''] 39 | self.fraction = '{} / {}'.format(0, self.total) 40 | 41 | self.resume() 42 | 43 | 44 | def update(self, n=1): 45 | self._step += n 46 | if self._step % self._speed_update_freq == 0: 47 | self._time0 = time.time() 48 | self._step0 = self._step 49 | 50 | def resume(self): 51 | self._skip_lines = 1 52 | print('\n', end='') 53 | self._time0 = time.time() 54 | self._step0 = self._step 55 | 56 | def pause(self): 57 | self._clear() 58 | self._skip_lines = 1 59 | 60 | def set_description(self, params=[]): 61 | 62 | ############ 63 | # Position # 64 | ############ 65 | self._clear() 66 | 67 | ########### 68 | # Percent # 69 | ########### 70 | percent, fraction = self._format_percent(self._step, self.total) 71 | self.fraction = fraction 72 | 73 | ######### 74 | # Speed # 75 | ######### 76 | speed = self._format_speed(self._step) 77 | 78 | ########## 79 | # Params # 80 | ########## 81 | num_params = len(params) 82 | nrow = math.ceil(num_params / self.ncol) 83 | params_split = self._chunk(params, self.ncol) 84 | params_string, lines = self._format(params_split) 85 | self.lines = lines 86 | 87 | 88 | description = '{} | {}{}'.format(percent, speed, params_string) 89 | print(description) 90 | self._skip_lines = nrow + 1 91 | 92 | def append_description(self, descr): 93 | self.lines.append(descr) 94 | 95 | def _clear(self): 96 | position = self._prev_line * self._skip_lines 97 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)]) 98 | print(position, end='') 99 | print(empty) 100 | print(position, end='') 101 | 102 | def _format_percent(self, n, total): 103 | if total: 104 | percent = n / float(total) 105 | 106 | complete_entries = int(percent * self._pbar_size) 107 | incomplete_entries = self._pbar_size - complete_entries 108 
| 109 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries] 110 | fraction = '{} / {}'.format(n, total) 111 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100)) 112 | else: 113 | fraction = '{}'.format(n) 114 | string = '{} iterations'.format(n) 115 | return string, fraction 116 | 117 | def _format_speed(self, n): 118 | num_steps = n - self._step0 119 | t = time.time() - self._time0 120 | speed = num_steps / t 121 | string = '{:.1f} Hz'.format(speed) 122 | if num_steps > 0: 123 | self._speed = string 124 | return string 125 | 126 | def _chunk(self, l, n): 127 | return [l[i:i+n] for i in range(0, len(l), n)] 128 | 129 | def _format(self, chunks): 130 | lines = [self._format_chunk(chunk) for chunk in chunks] 131 | lines.insert(0,'') 132 | padding = '\n' + ' '*self.indent 133 | string = padding.join(lines) 134 | return string, lines 135 | 136 | def _format_chunk(self, chunk): 137 | line = ' | '.join([self._format_param(param) for param in chunk]) 138 | return line 139 | 140 | def _format_param(self, param): 141 | k, v = param 142 | return '{} : {}'.format(k, v)[:self.max_length] 143 | 144 | def stamp(self): 145 | if self.lines != ['']: 146 | params = ' | '.join(self.lines) 147 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed) 148 | self._clear() 149 | print(string, end='\n') 150 | self._skip_lines = 1 151 | else: 152 | self._clear() 153 | self._skip_lines = 0 154 | 155 | def close(self): 156 | self.pause() 157 | 158 | class Silent: 159 | 160 | def __init__(self, *args, **kwargs): 161 | pass 162 | 163 | def __getattr__(self, attr): 164 | return lambda *args: None 165 | 166 | 167 | if __name__ == '__main__': 168 | silent = Silent() 169 | silent.update() 170 | silent.stamp() 171 | 172 | num_steps = 1000 173 | progress = Progress(num_steps) 174 | for i in range(num_steps): 175 | progress.update() 176 | params = [ 177 | ['A', '{:06d}'.format(i)], 178 | ['B', '{:06d}'.format(i)], 179 | ['C', '{:06d}'.format(i)], 180 | ['D', '{:06d}'.format(i)], 181 | ['E', '{:06d}'.format(i)], 182 | ['F', '{:06d}'.format(i)], 183 | ['G', '{:06d}'.format(i)], 184 | ['H', '{:06d}'.format(i)], 185 | ] 186 | progress.set_description(params) 187 | time.sleep(0.01) 188 | progress.close() 189 | -------------------------------------------------------------------------------- /models/pens/pe_factory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ma as ma 3 | import tensorflow as tf 4 | 5 | import copy 6 | from .fc import FC 7 | from .pe import PE 8 | 9 | def build_PE(in_dim, 10 | out_dim, 11 | name='BNN', 12 | hidden_dims=(200, 200, 200), 13 | num_networks=7, 14 | num_elites=5, 15 | loss = 'MSPE', 16 | activation = 'swish', 17 | output_activation = None, 18 | decay=1e-4, 19 | lr = 1e-3, 20 | lr_decay = None, 21 | decay_steps=None, 22 | use_scaler_in = False, 23 | use_scaler_out = False, 24 | clip_loss = False, 25 | kl_cliprange = 0.1, 26 | max_logvar = .5, 27 | min_logvar = -6, 28 | session=None): 29 | """ 30 | Constructs a tf probabilistic ensemble model. 31 | Args: 32 | loss: Choose from 'MSPE', 'NLL', 'MSE', 'Huber', or 'CE'. 
33 | choosing MSPE or NLL will construct a model with variance output 34 | """ 35 | print('[PE] dim in / out: {} / {} | Hidden dim: {}'.format(in_dim, out_dim, hidden_dims)) 36 | #print('[ BNN ] Input Layer dim: {} | Output Layer dim: {} '.format(obs_dim_in+act_dim+prior_dim, obs_dim_out+rew_dim)) 37 | params = {'name': name, 38 | 'loss':loss, 39 | 'num_networks': num_networks, 40 | 'num_elites': num_elites, 41 | 'sess': session, 42 | 'use_scaler_in': use_scaler_in, 43 | 'use_scaler_out': use_scaler_out, 44 | 'clip_loss': clip_loss, 45 | 'kl_cliprange':kl_cliprange, 46 | 'max_logvar':max_logvar, 47 | 'min_logvar':min_logvar, 48 | } 49 | model = PE(params) 50 | model.add(FC(hidden_dims[0], input_dim=in_dim, activation=activation, weight_decay=decay/4)) # def dec: 0.000025)) 51 | 52 | for hidden_dim in hidden_dims[1:]: 53 | model.add(FC(hidden_dim, activation=activation, weight_decay=decay/2)) # def dec: 0.00005)) 54 | 55 | model.add(FC(out_dim, activation=output_activation, weight_decay=decay)) # def dec: 0.0001 56 | 57 | opt_params = {"learning_rate":lr} if lr_decay is None else {"learning_rate":lr, 58 | "learning_rate_decay":lr_decay, 59 | "decay_steps":decay_steps} 60 | model.finalize(tf.train.AdamOptimizer, opt_params, lr_decay=lr_decay) 61 | 62 | total_parameters = 0 63 | for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name): 64 | # shape is an array of tf.Dimension 65 | shape = variable.get_shape() 66 | variable_parameters = 1 67 | for dim in shape: 68 | variable_parameters *= dim.value 69 | total_parameters += variable_parameters 70 | print('[ Probabilistic Ensemble ] Total trainable Parameteres: {} '.format(total_parameters)) 71 | 72 | return model 73 | 74 | def format_samples_for_dyn(samples, append_r=True, append_c=False, noise=None): 75 | """ 76 | formats samples to fit training, specifically returns: 77 | inputs, outputs: 78 | 79 | inputs = np.concatenate((observations, act, priors), axis=-1) 80 | outputs = np.concatenate(delta_observations, rewards ,costs), axis=-1) 81 | 82 | where rewards and costs are optional 83 | """ 84 | obs = samples['observations'] 85 | act = samples['actions'] 86 | next_obs = samples['next_observations'] 87 | terms = np.squeeze(samples['terminals'])[..., None] 88 | 89 | delta_obs = next_obs - obs 90 | 91 | #### ----END preprocess samples for model training in safety gym -----#### 92 | inputs = np.concatenate((obs, act), axis=-1) 93 | 94 | outputs = delta_obs 95 | 96 | if append_r: 97 | rew = np.squeeze(samples['rewards'])[..., None] 98 | outputs = np.concatenate((outputs, rew), axis=-1) 99 | 100 | if append_c: 101 | costs = np.squeeze(samples['costs'])[..., None] 102 | outputs = np.concatenate((outputs, costs), axis=-1) 103 | 104 | # add noise 105 | if noise: 106 | inputs = _add_noise(inputs, noise) ### noise helps (sometimes) 107 | 108 | return inputs, outputs 109 | 110 | 111 | ### @anyboby, try to include this in the model rather than separately 112 | def format_samples_for_cost(samples, oversampling=False, one_hot = True, num_classes=2, noise=None): 113 | """ 114 | formats samples to fit training for cost, specifically returns: 115 | (obs, act, next_obs) 116 | 117 | Args: 118 | one_hot: determines whether targets are structured as classification or regression 119 | one_hot: True will output targets with shape [batch_size, num_classes] 120 | one_hot: False wil output targets with shape [batch_size,] and scalar targets 121 | """ 122 | next_obs = samples['next_observations'] 123 | obs = samples['observations'] 124 | cost = 
samples['costs'] 125 | act = samples['actions'] 126 | 127 | if one_hot: 128 | cost_one_hot = np.zeros(shape=(len(cost), num_classes)) 129 | batch_indcs = np.arange(0, len(cost)) 130 | costs = cost.astype(int) 131 | cost_one_hot[(batch_indcs, costs)] = 1 132 | outputs = cost_one_hot 133 | else: 134 | outputs = cost[:, None] 135 | 136 | inputs = np.concatenate((obs, act, next_obs), axis=-1) 137 | ## ________________________________ ## 138 | ## oversample cost classes ## 139 | ## ________________________________ ## 140 | if oversampling: 141 | if len(outputs[np.where(costs>0)[0]])>0: 142 | imbalance_ratio = len(outputs[np.where(costs==0)[0]])//len(outputs[np.where(costs>0)[0]]) 143 | extra_outputs = np.tile(outputs[np.where(costs>0)[0]], (1+imbalance_ratio//3,1)) ## don't need to overdo it 144 | outputs = np.concatenate((outputs, extra_outputs), axis=0) 145 | extra_inputs = np.tile(inputs[np.where(costs>0)[0]], (1+imbalance_ratio//3,1)) 146 | extra_inputs = _add_noise(extra_inputs, 0.0001) 147 | inputs = np.concatenate((inputs, extra_inputs), axis=0) 148 | 149 | ### ______ add noise _____ ### 150 | if noise: 151 | inputs = _add_noise(inputs, noise) ### noise helps 152 | 153 | return inputs, outputs 154 | 155 | def _add_noise(data_inp, noiseToSignal): 156 | data= copy.deepcopy(data_inp) 157 | mean_data = np.mean(data, axis = 0) 158 | std_of_noise = mean_data*noiseToSignal 159 | for j in range(mean_data.shape[0]): 160 | if(std_of_noise[j]>0): 161 | data[:,j] = np.copy(data[:,j]+np.random.normal(0, np.absolute(std_of_noise[j]), (data.shape[0],))) 162 | return data 163 | 164 | def reset_model(model): 165 | model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name) 166 | model.sess.run(tf.initialize_vars(model_vars)) 167 | -------------------------------------------------------------------------------- /models/pens/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | EPS = 1e-10 8 | 9 | def get_required_argument(dotmap, key, message, default=None): 10 | val = dotmap.get(key, default) 11 | if val is default: 12 | raise ValueError(message) 13 | return val 14 | 15 | def gaussian_kl_np(mu0, log_std0, mu1, log_std1): 16 | """interprets each entry in mu_i and log_std_i as independent, 17 | preserves shape 18 | output clipped to {0, 1e10} 19 | """ 20 | var0, var1 = np.exp(2 * log_std0), np.exp(2 * log_std1) 21 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1+EPS) - 1) + log_std1 - log_std0 22 | all_kls = pre_sum 23 | #all_kls = np.mean(all_kls) 24 | all_kls = np.clip(all_kls, 0, 1/EPS) ### for stability 25 | return all_kls 26 | 27 | def gaussian_jsd_np(mu0, log_std0, mu1, log_std1): 28 | pass 29 | 30 | def average_dkl(mu, std): 31 | """ 32 | Calculates the average kullback leiber divergences of multiple univariate gaussian distributions. 33 | 34 | K(P1,…Pk) = 1/(k(k−1)) ∑_[k_(i,j)=1] DKL(Pi||Pj) 35 | 36 | (Andrea Sgarro, Informational divergence and the dissimilarity of probability distributions.) 37 | 38 | expects the distributions along axis 0, and samples along axis 1. 
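    e.g. mu and std of shape (num_models, batch_size, dim) yield an output of shape
    (batch_size, dim).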
39 | Output is reduced by axis 0 40 | 41 | Args: 42 | mu: array-like means 43 | std: array-like stds 44 | """ 45 | ## clip log 46 | log_std = np.log(std) 47 | log_std = np.clip(log_std, -100, 1e8) 48 | assert len(mu.shape)>=2 and len(log_std.shape)>=2 49 | num_models = len(mu) 50 | d_kl = None 51 | for i in range(num_models): 52 | for j in range(num_models): 53 | if d_kl is None: 54 | d_kl = gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j]) 55 | else: d_kl+= gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j]) 56 | d_kl = d_kl/(num_models*(num_models-1)+EPS) 57 | return d_kl 58 | 59 | def median_dkl(mu, std): 60 | """ 61 | Calculates the median kullback leiber divergences of multiple univariate gaussian distributions. 62 | 63 | K(P1,…Pk) = 1/(k(k−1)) ∑_[k_(i,j)=1] DKL(Pi||Pj) 64 | 65 | (Andrea Sgarro, Informational divergence and the dissimilarity of probability distributions.) 66 | 67 | expects the distributions along axis 0, and samples along axis 1. 68 | Output is reduced by axis 0 69 | 70 | Args: 71 | mu: array-like means 72 | std: array-like stds 73 | """ 74 | ## clip log 75 | log_std = np.log(std) 76 | log_std = np.clip(log_std, -100, 1e8) 77 | assert len(mu.shape)>=2 and len(log_std.shape)>=2 78 | num_models = len(mu) 79 | d_kl = np.zeros(shape=(num_models*(num_models-1),) + mu.shape[1:]) 80 | n = 0 81 | for i in range(num_models): 82 | for j in range(num_models): 83 | if i != j: 84 | d_kl[n] = gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j]) 85 | n += 1 86 | d_kl_med = np.median(d_kl, axis=0) 87 | return d_kl_med 88 | 89 | 90 | class TensorStandardScaler: 91 | """Helper class for automatically normalizing inputs into the network. 92 | """ 93 | def __init__(self, x_dim, name='Scaler'): 94 | """Initializes a scaler. 95 | 96 | Arguments: 97 | x_dim (int): The dimensionality of the inputs into the scaler. 98 | 99 | Returns: None. 100 | """ 101 | self.fitted = False 102 | with tf.variable_scope(name): 103 | self.count = tf.get_variable( 104 | name=name+'_count', shape=(), initializer=tf.constant_initializer(0), 105 | trainable=False 106 | ) 107 | 108 | self.mu = tf.get_variable( 109 | name=name+'_mu', shape=[1, x_dim], initializer=tf.constant_initializer(0.0), 110 | trainable=False 111 | ) 112 | self.var = tf.get_variable( 113 | name=name+'_std', shape=[1, x_dim], initializer=tf.constant_initializer(1.0), 114 | trainable=False 115 | ) 116 | 117 | self.cached_count, self.cached_mu, self.cached_var = 0, np.zeros([1, x_dim]), np.ones([1, x_dim]) 118 | 119 | def fit(self, data): 120 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 121 | another for assigning the standard deviation of the data to the internal standard deviation. 122 | This function must be called within a 'with .as_default()' block. 123 | 124 | Arguments: 125 | data (np.ndarray): A numpy array containing the input 126 | 127 | Returns: None. 128 | """ 129 | batch_count = data.shape[0] 130 | batch_mu = np.mean(data, axis=0, keepdims=True) 131 | batch_var = np.var(data, axis=0, keepdims=True) 132 | new_mean, new_var, new_count = self.running_mean_var_from_batch(batch_mu, batch_var, batch_count) 133 | #sigma[sigma < 1e-8] = 1.0 134 | self.mu.load(new_mean) 135 | self.var.load(new_var) 136 | self.count.load(new_count) 137 | self.fitted = True 138 | self.cache() 139 | 140 | def transform(self, data): 141 | """Transforms the input matrix data using the parameters of this scaler. 
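        Computes (data - mu) / max(sqrt(var), 1e-2) using the running statistics
        fitted so far.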
142 | 143 | can be adjusted to scale with a factor, to control sensitivity to ood data: 144 | d = (d-mu)/sigma = d + (d-mu)/sigma - d = d + (d(1-sigma)-mu)/sigma 145 | and the version with scaling factor thus becomes 146 | d = d + sc_factor*(d(1-sigma)-mu)/sigma 147 | 148 | Arguments: 149 | data (np.array): A numpy array containing the points to be transformed. 150 | sc_factor: Factor to what degree the original dataset is transformed 151 | 152 | Returns: (np.array) The transformed dataset. 153 | 154 | 155 | """ 156 | scaled_transform = (data-self.mu)/(tf.maximum(tf.sqrt(self.var), 1e-2)) 157 | return scaled_transform 158 | 159 | def inverse_transform(self, data): 160 | """Undoes the transformation performed by this scaler. 161 | 162 | Arguments: 163 | data (np.array): A numpy array containing the points to be transformed. 164 | 165 | Returns: (np.array) The transformed dataset. 166 | """ 167 | return (tf.maximum(tf.sqrt(self.var), 1e-2)) * data + self.mu 168 | 169 | def inverse_transform_var(self, data): 170 | """Undoes the transformation performed by this scaler for variances. 171 | 172 | Arguments: 173 | data (np.array): A numpy array containing the points to be transformed. 174 | 175 | Returns: (np.array) The transformed dataset. 176 | """ 177 | return tf.square(tf.maximum(tf.sqrt(self.var), 1e-2)) * data 178 | 179 | def inverse_transform_logvar(self, data): 180 | """Undoes the transformation performed by this scaler for variances. 181 | 182 | Arguments: 183 | data (np.array): A numpy array containing the points to be transformed. 184 | 185 | Returns: (np.array) The transformed dataset. 186 | """ 187 | return 2*tf.log(tf.maximum(tf.sqrt(self.var), 1e-2)) + data 188 | 189 | def get_vars(self): 190 | """Returns a list of variables managed by this object. 191 | 192 | Returns: (list) The list of variables. 193 | """ 194 | return [self.mu, self.var] 195 | 196 | def get_mu(self): 197 | return self.mu 198 | 199 | def get_var(self): 200 | return self.var 201 | 202 | def cache(self): 203 | """Caches current values of this scaler. 204 | 205 | Returns: None. 206 | """ 207 | self.cached_mu = self.mu.eval() 208 | self.cached_var = self.var.eval() 209 | self.cached_count = self.count.eval() 210 | 211 | def load_cache(self): 212 | """Loads values from the cache 213 | 214 | Returns: None. 
215 | """ 216 | self.mu.load(self.cached_mu) 217 | self.var.load(self.cached_var) 218 | self.count.load(self.cached_count) 219 | 220 | def running_mean_var_from_batch(self, batch_mean, batch_var, batch_count): 221 | delta = batch_mean - self.cached_mu 222 | tot_count = self.cached_count + batch_count 223 | 224 | new_mean = self.cached_mu + delta * batch_count / tot_count 225 | m_a = self.cached_var * self.cached_count 226 | m_b = batch_var * batch_count 227 | M2 = m_a + m_b + np.square(delta) * self.cached_count * batch_count / tot_count 228 | new_var = M2 / tot_count 229 | new_count = tot_count 230 | 231 | return new_mean, new_var, new_count 232 | -------------------------------------------------------------------------------- /models/statics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def no_done(obs, act, next_obs): 4 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 5 | 6 | done = np.zeros(shape=obs.shape[:-1], dtype=np.bool) #np.array([False]).repeat(len(obs)) 7 | done = done[...,None] 8 | return done 9 | 10 | def hcs_cost_f(obs, act, next_obs): 11 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 12 | 13 | xdist = next_obs[...,-1]*10 14 | obj_cost = np.array((np.abs(xdist)<2.0), dtype=np.float32)[..., None] 15 | return obj_cost 16 | 17 | def antsafe_term_fn(obs, act, next_obs): 18 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 19 | 20 | z = next_obs[..., 0] 21 | body_quat = next_obs[...,1:5] 22 | z_rot = 1-2*(body_quat[...,1]**2+body_quat[...,2]**2) 23 | 24 | notdone = np.isfinite(next_obs).all(axis=-1) \ 25 | * (z >= 0.2) \ 26 | * (z <= 1.0) \ 27 | * z_rot >= -0.7 28 | 29 | done = ~notdone 30 | done = done[...,None] 31 | return done 32 | 33 | def antsafe_c_fn(obs, act, next_obs): 34 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) 35 | 36 | z = next_obs[..., 0] 37 | body_quat = next_obs[...,1:5] 38 | z_rot = 1-2*(body_quat[...,1]**2+body_quat[...,2]**2) 39 | y_dist = next_obs[..., -1:] 40 | 41 | obj_cost = np.any(abs(y_dist)>3.2, axis=-1)[...,None]*1.0 42 | 43 | notdone = np.isfinite(next_obs).all(axis=-1) \ 44 | * (z >= 0.2) \ 45 | * (z <= 1.0) \ 46 | * z_rot >= -0.7 47 | 48 | done = ~notdone 49 | done = done[...,None] 50 | 51 | done_cost = done*1.0 52 | cost = np.clip(done_cost+obj_cost, 0, 1) 53 | return cost 54 | 55 | 56 | TERMS_BY_TASK = { 57 | 'default':no_done, 58 | 'HalfCheetah-v2':no_done, 59 | 'HalfCheetahSafe-v2':no_done, 60 | 'AntSafe-v2':antsafe_term_fn, 61 | } 62 | 63 | REWS_BY_TASK = { 64 | 65 | } 66 | 67 | COST_BY_TASK = { 68 | 'HalfCheetahSafe-v2':hcs_cost_f, 69 | 'AntSafe-v2':antsafe_c_fn, 70 | } -------------------------------------------------------------------------------- /network/ac_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Box, Discrete 4 | from utilities.utils import combined_shape, EPS 5 | 6 | """ 7 | Network utils 8 | """ 9 | 10 | def placeholder(dim=None): 11 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 12 | 13 | def placeholders(*args): 14 | return [placeholder(dim) for dim in args] 15 | 16 | def placeholder_from_space(space): 17 | if isinstance(space, Box): 18 | return placeholder(space.shape) 19 | elif isinstance(space, Discrete): 20 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 21 | raise NotImplementedError('bad space {}'.format(space)) 22 | 23 | def 
placeholders_from_spaces(*args): 24 | return [placeholder_from_space(space) for space in args] 25 | 26 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, ensemble_size = 1): 27 | for h in hidden_sizes[:-1]: 28 | if ensemble_size==1: 29 | x = tf.layers.dense(x, units=h, activation=activation) 30 | else: 31 | x = tf.layers.dense(x, units=(ensemble_size,)+(h,), activation=activation) 32 | x = tf.transpose(x) 33 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 34 | 35 | def get_vars(scope=''): 36 | return [x for x in tf.trainable_variables() if '/'+scope+'/' in x.name] 37 | 38 | def count_vars(scope=''): 39 | v = get_vars(scope) 40 | return sum([np.prod(var.shape.as_list()) for var in v]) 41 | 42 | """ 43 | Gaussian distributions 44 | """ 45 | 46 | def gaussian_likelihood(x, mu, log_std): 47 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 48 | return tf.reduce_sum(pre_sum, axis=1) 49 | 50 | def gaussian_kl(mu0, log_std0, mu1, log_std1): 51 | """Returns average kl divergence between two batches of dists""" 52 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) 53 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0 54 | all_kls = tf.reduce_sum(pre_sum, axis=1) 55 | return tf.reduce_mean(all_kls) 56 | 57 | def gaussian_entropy(log_std): 58 | """Returns average entropy over a batch of dists""" 59 | pre_sum = log_std + 0.5 * np.log(2*np.pi*np.e) 60 | all_ents = tf.reduce_sum(pre_sum, axis=-1) 61 | return tf.reduce_mean(all_ents) 62 | 63 | """ 64 | Categorical distributions 65 | """ 66 | 67 | def categorical_kl(logp0, logp1): 68 | """Returns average kl divergence between two batches of dists""" 69 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) 70 | return tf.reduce_mean(all_kls) 71 | 72 | def categorical_entropy(logp): 73 | """Returns average entropy over a batch of dists""" 74 | all_ents = -tf.reduce_sum(logp * tf.exp(logp), axis=1) 75 | return tf.reduce_mean(all_ents) 76 | 77 | 78 | """ 79 | Policies 80 | """ 81 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 82 | act_dim = action_space.n 83 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 84 | logp_all = tf.nn.log_softmax(logits) 85 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 86 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 87 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 88 | 89 | old_logp_all = placeholder(act_dim) 90 | d_kl = categorical_kl(logp_all, old_logp_all) 91 | ent = categorical_entropy(logp_all) 92 | 93 | pi_info = {'logp_all': logp_all} 94 | pi_info_phs = {'logp_all': old_logp_all} 95 | 96 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent 97 | 98 | 99 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 100 | act_dim = a.shape.as_list()[-1] 101 | 102 | 103 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 104 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 105 | ### @anyboby testing: higher starting std, ppo1 uses log_std=0 at the beginning 106 | # log_std = tf.get_variable(name='log_std', shape=act_dim ,initializer=tf.zeros_initializer(), dtype=tf.float32) 107 | std = tf.exp(log_std) 108 | 109 | pi = mu + tf.random_normal(tf.shape(mu)) * std 110 | logp = gaussian_likelihood(a, mu, log_std) 111 | logp_pi = gaussian_likelihood(pi, mu, 
log_std)
112 | 
113 |     old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim)
114 |     d_kl = gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph)
115 |     ent = gaussian_entropy(log_std)
116 | 
117 |     # broadcast log_std to the batch dimension for pi_info,
118 |     # even though log_std itself does not depend on the input
119 |     log_std_info = tf.tensordot(tf.ones(tf.shape(x)[0]), log_std, axes=0)
120 |     pi_info = {'mu': mu, 'log_std': log_std_info}
121 |     pi_info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph}
122 | 
123 |     return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent
124 | 
125 | 
126 | LOG_STD_MAX = 2
127 | LOG_STD_MIN = -20
128 | 
129 | def mlp_squashed_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
130 |     """
131 |     Experimental code for squashed gaussian policies, not yet tested
132 |     """
133 |     act_dim = a.shape.as_list()[-1]
134 |     net = mlp(x, list(hidden_sizes), activation, activation)
135 |     mu = tf.layers.dense(net, act_dim, activation=output_activation)
136 |     log_std = tf.layers.dense(net, act_dim, activation=None)
137 |     log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
138 | 
139 |     std = tf.exp(log_std)
140 |     u = mu + tf.random_normal(tf.shape(mu)) * std
141 |     pi = tf.tanh(u)
142 | 
143 |     old_mu_ph, old_log_std_ph, u_ph = placeholders(act_dim, act_dim, act_dim)
144 |     d_kl = gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) # kl is invariant to squashing transform
145 | 
146 |     def apply_squashing_func(log_prob, raw_action):
147 |         # change-of-variables correction for a = tanh(u):
148 |         # log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2), evaluated at the pre-squash action u
149 |         log_prob -= tf.reduce_sum(2*(np.log(2) - raw_action - tf.nn.softplus(-2*raw_action)), axis=1)
150 |         return log_prob
151 | 
152 |     # Base log probs
153 |     logp = gaussian_likelihood(u_ph, mu, log_std)
154 |     logp_pi = gaussian_likelihood(u, mu, log_std)
155 | 
156 |     # Squashed log probs
157 |     logp = apply_squashing_func(logp, u_ph)
158 |     logp_pi = apply_squashing_func(logp_pi, u)
159 | 
160 |     # Approximate entropy
161 |     ent = -tf.reduce_mean(logp_pi) # Monte-Carlo estimate; the squashed Gaussian has no closed-form entropy
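    # The squashing correction above relies on the identity
    #   log(1 - tanh(u)^2) = 2*(log(2) - u - softplus(-2*u)),
    # which avoids evaluating log(1 - tanh(u)^2) directly and stays finite for large |u|.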
162 | 163 | pi_info = {'mu': mu, 'log_std': log_std, 'raw_action': u} 164 | pi_info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph, 'raw_action': u_ph} 165 | 166 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent 167 | 168 | 169 | 170 | """ 171 | Actor-Critics 172 | """ 173 | def mlp_actor_critic(x, a, hidden_sizes_a=(64,64), hidden_sizes_c=(64,64), critic_ensemble_size=1, activation=tf.tanh, 174 | output_activation=None, policy=None, action_space=None): 175 | 176 | # default policy builder depends on action space 177 | if policy is None and isinstance(action_space, Box): 178 | policy = mlp_gaussian_policy 179 | elif policy is None and isinstance(action_space, Discrete): 180 | policy = mlp_categorical_policy 181 | 182 | with tf.variable_scope('pi'): 183 | policy_outs = policy(x, a, hidden_sizes_a, activation, output_activation, action_space) 184 | pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent = policy_outs 185 | 186 | with tf.variable_scope('vf'): 187 | v = tf.squeeze(mlp(x, list(hidden_sizes_c)+[1], activation, None, ensemble_size=critic_ensemble_size)) 188 | 189 | with tf.variable_scope('vc'): 190 | vc = tf.squeeze(mlp(x, list(hidden_sizes_c)+[1], activation, None, ensemble_size=critic_ensemble_size)) 191 | 192 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc 193 | 194 | def mlp_actor(x, a, hidden_sizes=(64,64), activation=tf.tanh, 195 | output_activation=None, policy=None, action_space=None): 196 | 197 | # default policy builder depends on action space 198 | if policy is None and isinstance(action_space, Box): 199 | policy = mlp_gaussian_policy 200 | elif policy is None and isinstance(action_space, Discrete): 201 | policy = mlp_categorical_policy 202 | 203 | with tf.variable_scope('pi'): 204 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space) 205 | pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent = policy_outs 206 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent 207 | 208 | def mlp_critic (x, hidden_sizes=(64,64), activation=tf.tanh, 209 | output_activation=None, policy=None, action_space=None, name='V'): 210 | with tf.variable_scope(name+'f'): 211 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None)) 212 | 213 | with tf.variable_scope(name+'c'): 214 | vc = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None)) 215 | 216 | return v, vc 217 | -------------------------------------------------------------------------------- /policies/base_policy.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from collections import OrderedDict 3 | 4 | import numpy as np 5 | 6 | class BasePolicy: 7 | def __init__(self): 8 | self._deterministic = False 9 | 10 | def reset(self): 11 | """Reset and clean the policy.""" 12 | raise NotImplementedError 13 | 14 | def actions(self, conditions): 15 | """Compute (symbolic) actions given conditions (observations)""" 16 | raise NotImplementedError 17 | 18 | def log_pis(self, conditions, actions): 19 | """Compute (symbolic) log probs for given observations and actions.""" 20 | raise NotImplementedError 21 | 22 | def actions_np(self, conditions): 23 | """Compute (numeric) actions given conditions (observations)""" 24 | raise NotImplementedError 25 | 26 | def log_pis_np(self, conditions, actions): 27 | """Compute (numeric) log probs for given observations and actions.""" 28 | raise NotImplementedError 29 | 30 | def get_diagnostics(self, conditions): 31 | """Return diagnostic information of 
the policy. 32 | 33 | Arguments: 34 | conditions: Observations to run the diagnostics for. 35 | Returns: 36 | diagnostics: OrderedDict of diagnostic information. 37 | """ 38 | diagnostics = OrderedDict({}) 39 | return diagnostics -------------------------------------------------------------------------------- /policies/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | def get_cpo_policy(env, session, *args, **kwargs): 4 | from policies.cpo_policy import CPOPolicy 5 | policy = CPOPolicy( 6 | obs_space=env.observation_space, 7 | act_space=env.action_space, 8 | session=session, 9 | *args, 10 | **kwargs) 11 | return policy 12 | 13 | POLICY_FUNCTIONS = { 14 | 'cpopolicy': get_cpo_policy, 15 | } 16 | 17 | 18 | def get_policy(policy_type, *args, **kwargs): 19 | return POLICY_FUNCTIONS[policy_type](*args, **kwargs) 20 | 21 | def get_policy_from_params(params, env, *args, **kwargs): 22 | policy_params = params['policy_params'] 23 | policy_type = policy_params['type'] 24 | policy_kwargs = deepcopy(policy_params['kwargs']) 25 | 26 | policy = POLICY_FUNCTIONS[policy_type]( 27 | env, 28 | *args, 29 | **policy_kwargs, 30 | **kwargs) 31 | 32 | return policy 33 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | conda-env-export==0.3.2 2 | dotmap==1.3.8 3 | gtimer==1.0.0b5 4 | gym==0.18.0 5 | joblib==0.14.1 6 | mkl-fft==1.2.0 7 | mkl-random==1.1.0 8 | mkl-service==2.3.0 9 | mujoco-py==2.0.2.13 10 | olefile==0.46 11 | pyOpenSSL==19.1.0 12 | PySocks==1.7.1 13 | PyYAML==5.4.1 14 | ray==0.6.4 15 | sip==4.19.24 16 | tensorflow==1.14.0 17 | tornado==6.0.4 18 | -------------------------------------------------------------------------------- /samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/samplers/__init__.py -------------------------------------------------------------------------------- /samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import deque, OrderedDict 2 | from itertools import islice 3 | 4 | 5 | class BaseSampler(object): 6 | def __init__(self, 7 | max_path_length, 8 | min_pool_size, 9 | batch_size, 10 | store_last_n_paths=10, 11 | preprocess_type='default'): 12 | self._max_path_length = max_path_length 13 | self._min_pool_size = min_pool_size 14 | self._batch_size = batch_size 15 | self._store_last_n_paths = store_last_n_paths 16 | self._last_n_paths = deque(maxlen=store_last_n_paths) 17 | self._obs_process_type = preprocess_type 18 | self.env = None 19 | self.policy = None 20 | self.pool = None 21 | 22 | def initialize(self, env, policy, pool): 23 | self.env = env 24 | self.policy = policy 25 | self.pool = pool 26 | 27 | def set_policy(self, policy): 28 | self.policy = policy 29 | 30 | def clear_last_n_paths(self): 31 | self._last_n_paths.clear() 32 | 33 | def get_last_n_paths(self, n=None): 34 | if n is None: 35 | n = self._store_last_n_paths 36 | 37 | last_n_paths = tuple(islice(self._last_n_paths, None, n)) 38 | 39 | return last_n_paths 40 | 41 | def sample(self): 42 | raise NotImplementedError 43 | 44 | def batch_ready(self): 45 | enough_samples = self.pool.size >= self._min_pool_size 46 | return enough_samples 47 | 48 | def 
random_batch(self, batch_size=None, **kwargs): 49 | batch_size = batch_size or self._batch_size 50 | return self.pool.random_batch(batch_size, **kwargs) 51 | 52 | def terminate(self): 53 | self.env.close() 54 | 55 | def get_diagnostics(self): 56 | diagnostics = OrderedDict({'pool-size': self.pool.size}) 57 | return diagnostics 58 | 59 | def __getstate__(self): 60 | state = { 61 | key: value for key, value in self.__dict__.items() 62 | if key not in ('env', 'policy', 'pool') 63 | } 64 | 65 | return state 66 | 67 | def __setstate__(self, state): 68 | self.__dict__.update(state) 69 | 70 | self.env = None 71 | self.policy = None 72 | self.pool = None 73 | -------------------------------------------------------------------------------- /samplers/simple_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | from samplers.base_sampler import BaseSampler 9 | 10 | class SimpleSampler(BaseSampler): 11 | def __init__(self, **kwargs): 12 | super(SimpleSampler, self).__init__(**kwargs) 13 | 14 | self._path_length = 0 15 | self._path_return = 0 16 | self._current_path = defaultdict(list) 17 | self._last_path_return = 0 18 | self._max_path_return = -np.inf 19 | self._n_episodes = 0 20 | self._current_observation = None 21 | self._total_samples = 0 22 | self._last_action = None 23 | 24 | def _process_observations(self, 25 | observation, 26 | action, 27 | reward, 28 | cost, 29 | terminal, 30 | next_observation, 31 | info): 32 | 33 | processed_observation = { 34 | 'observations': observation, 35 | 'actions': action, 36 | 'rewards': [reward], 37 | 'cost' : [cost], 38 | 'terminals': [terminal], 39 | 'next_observations': next_observation, 40 | 'infos': info, 41 | } 42 | 43 | return processed_observation 44 | 45 | def sample(self): 46 | if self._current_observation is None: 47 | self._current_observation = np.squeeze(self.env.reset()) 48 | self._last_action = np.zeros(shape=self.env.action_space.shape) 49 | 50 | action = self.policy.actions_np( 51 | self.env.convert_to_active_observation( 52 | self._current_observation)[None] 53 | )[0] 54 | 55 | next_observation, reward, terminal, info = self.env.step(action) 56 | next_observation = np.squeeze(next_observation) 57 | reward = np.squeeze(reward) 58 | terminal = np.squeeze(terminal) 59 | cost = info.get('cost', 0) 60 | 61 | self._path_length += 1 62 | self._path_return += reward 63 | self._total_samples += 1 64 | 65 | processed_sample = self._process_observations( 66 | observation=self._current_observation, 67 | action=action, 68 | reward=reward, 69 | cost=cost, 70 | terminal=terminal, 71 | next_observation=next_observation, 72 | info=info, 73 | ) 74 | 75 | for key, value in processed_sample.items(): 76 | self._current_path[key].append(value) 77 | 78 | #### add to pool only after full epoch or terminal path 79 | if terminal or self._path_length >= self._max_path_length: 80 | last_path = { 81 | field_name: np.array(values) 82 | for field_name, values in self._current_path.items() 83 | } 84 | 85 | self.pool.add_path(last_path) 86 | self._last_n_paths.appendleft(last_path) 87 | 88 | self._max_path_return = max(self._max_path_return, 89 | self._path_return) 90 | self._last_path_return = self._path_return 91 | 92 | self.policy.reset() 93 | self._current_observation = None 94 | self._path_length = 0 95 | self._path_return = 0 96 | self._current_path = defaultdict(list) 97 | self._last_action = 
np.zeros(shape=self.env.action_space.shape) 98 | self._n_episodes += 1 99 | else: 100 | self._current_observation = next_observation 101 | self._last_action = action 102 | 103 | return next_observation, reward, terminal, info 104 | 105 | def random_batch(self, batch_size=None, **kwargs): 106 | batch_size = batch_size or self._batch_size 107 | observation_keys = getattr(self.env, 'observation_keys', None) 108 | 109 | return self.pool.random_batch( 110 | batch_size, observation_keys=observation_keys, **kwargs) 111 | 112 | def get_diagnostics(self): 113 | diagnostics = super(SimpleSampler, self).get_diagnostics() 114 | diagnostics.update({ 115 | 'max-path-return': self._max_path_return, 116 | 'last-path-return': self._last_path_return, 117 | 'episodes': self._n_episodes, 118 | 'total-samples': self._total_samples, 119 | }) 120 | 121 | return diagnostics 122 | 123 | -------------------------------------------------------------------------------- /samplers/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | def get_cposampler(*args, **kwargs): 6 | from samplers.cpo_sampler import CpoSampler 7 | sampler = CpoSampler( 8 | *args, 9 | **kwargs) 10 | 11 | return sampler 12 | 13 | SAMPLERS_FUNCTIONS = { 14 | 'CPOSampler' : get_cposampler, 15 | } 16 | 17 | 18 | def get_sampler_from_params(params, *args, **kwargs): 19 | 20 | sampler_params = params['sampler_params'] 21 | sampler_type = sampler_params['type'] 22 | 23 | sampler_args = deepcopy(sampler_params.get('args', ())) 24 | sampler_kwargs = deepcopy(sampler_params.get('kwargs', {})) 25 | 26 | sampler = SAMPLERS_FUNCTIONS[sampler_type]( 27 | *sampler_args, *args, **sampler_kwargs, **kwargs) 28 | 29 | return sampler -------------------------------------------------------------------------------- /scripts/console_scripts.py: -------------------------------------------------------------------------------- 1 | """A command line interface that exposes softlearning examples to user. 2 | 3 | This package exposes the functions in examples.instrument module to the user 4 | through a cli, which allows seamless runs of examples in different modes (e.g. 5 | locally, in google compute engine, or ec2). 6 | 7 | 8 | There are two types of cli commands in this file (each have their corresponding 9 | function in examples.instrument): 10 | 1. run_example_* methods, which run the experiments by invoking 11 | `tune.run_experiments` function. 12 | 2. launch_example_* methods, which are helpers function to submit an 13 | example to be run in the cloud. In practice, these launch a cluster, 14 | and then run the `run_example_cluster` method with the provided 15 | arguments and options. 
16 | """ 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import logging 23 | 24 | import click 25 | 26 | from utilities.instrument import ( 27 | run_example_dry, 28 | run_example_local, 29 | run_example_debug, 30 | run_example_cluster, 31 | launch_example_cluster, 32 | launch_example_gce, 33 | launch_example_ec2) 34 | 35 | 36 | logging.basicConfig(level=logging.INFO) 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(logging.INFO) 39 | 40 | def add_options(options): 41 | def decorator(f): 42 | for option in options[::-1]: 43 | click.decorators._param_memo(f, option) 44 | return f 45 | return decorator 46 | 47 | 48 | @click.group() 49 | def cli(): 50 | pass 51 | 52 | 53 | @cli.command( 54 | name='run_example_dry', 55 | context_settings={'ignore_unknown_options': True}) 56 | @click.argument("example_module_name", required=True, type=str) 57 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 58 | def run_example_dry_cmd(example_module_name, example_argv): 59 | """Print the variant spec and related information of an example.""" 60 | return run_example_dry(example_module_name, example_argv) 61 | 62 | 63 | @cli.command( 64 | name='run_local', 65 | context_settings={'ignore_unknown_options': True}) 66 | @click.argument("example_module_name", required=True, type=str) 67 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 68 | def run_example_local_cmd(example_module_name, example_argv): 69 | """Run example locally, potentially parallelizing across cpus/gpus.""" 70 | return run_example_local(example_module_name, example_argv) 71 | 72 | 73 | @cli.command( 74 | name='run_example_debug', 75 | context_settings={'ignore_unknown_options': True}) 76 | @click.argument("example_module_name", required=True, type=str) 77 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 78 | def run_example_debug_cmd(example_module_name, example_argv): 79 | """The debug mode limits tune trial runs to enable use of debugger.""" 80 | return run_example_debug(example_module_name, example_argv) 81 | 82 | @cli.command( 83 | name='run_example_cluster', 84 | context_settings={'ignore_unknown_options': True}) 85 | 86 | @click.argument("example_module_name", required=True, type=str) 87 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 88 | def run_example_cluster_cmd(example_module_name, example_argv): 89 | """Run example on cluster mode. 90 | 91 | This functions is very similar to the local mode, except that it 92 | correctly sets the redis address to make ray/tune work on a cluster. 
93 | """ 94 | run_example_cluster(example_module_name, example_argv) 95 | 96 | @cli.command( 97 | name='launch_example_cluster', 98 | context_settings={ 99 | 'allow_extra_args': True, 100 | 'ignore_unknown_options': True 101 | }) 102 | @click.argument("example_module_name", required=True, type=str) 103 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 104 | @click.option( 105 | "--config_file", 106 | required=False, 107 | type=str) 108 | @click.option( 109 | "--stop/--no-stop", 110 | is_flag=True, 111 | default=True, 112 | help="Stop the cluster after the command finishes running.") 113 | @click.option( 114 | "--start/--no-start", 115 | is_flag=True, 116 | default=True, 117 | help="Start the cluster if needed.") 118 | @click.option( 119 | "--screen/--no-screen", 120 | is_flag=True, 121 | default=False, 122 | help="Run the command in a screen.") 123 | @click.option( 124 | "--tmux/--no-tmux", 125 | is_flag=True, 126 | default=True, 127 | help="Run the command in tmux.") 128 | @click.option( 129 | "--override-cluster-name", 130 | required=False, 131 | type=str, 132 | help="Override the configured cluster name.") 133 | @click.option( 134 | "--port-forward", required=False, type=int, help="Port to forward.") 135 | def launch_example_cluster_cmd(*args, **kwargs): 136 | """Launches the example on autoscaled ray cluster through ray exec_cmd. 137 | 138 | This handles basic validation and sanity checks for the experiment, and 139 | then executes the command on autoscaled ray cluster. If necessary, it will 140 | also fill in more useful defaults for our workflow (i.e. for tmux and 141 | override_cluster_name). 142 | """ 143 | return launch_example_cluster(*args, **kwargs) 144 | 145 | 146 | @cli.command( 147 | name='launch_example_gce', 148 | context_settings={ 149 | 'allow_extra_args': True, 150 | 'ignore_unknown_options': True 151 | }) 152 | @add_options(launch_example_cluster_cmd.params) 153 | def launch_example_gce_cmd(*args, **kwargs): 154 | """Forwards call to `launch_example_cluster` after adding gce defaults. 155 | 156 | This optionally sets the ray autoscaler configuration file to the default 157 | gce configuration file, and then calls `launch_example_cluster` to 158 | execute the original command on autoscaled gce cluster by parsing the args. 159 | 160 | See `launch_example_cluster` for further details. 161 | """ 162 | return launch_example_gce(*args, **kwargs) 163 | 164 | 165 | @cli.command( 166 | name='launch_example_ec2', 167 | context_settings={ 168 | 'allow_extra_args': True, 169 | 'ignore_unknown_options': True 170 | }) 171 | @add_options(launch_example_cluster_cmd.params) 172 | def launch_example_ec2_cmd(*args, **kwargs): 173 | """Forwards call to `launch_example_cluster` after adding ec2 defaults. 174 | 175 | This optionally sets the ray autoscaler configuration file to the default 176 | ec2 configuration file, and then calls `launch_example_cluster` to 177 | execute the original command on autoscaled ec2 cluster by parsing the args. 178 | 179 | See `launch_example_cluster` for further details. 
180 | """ 181 | return launch_example_ec2(*args, **kwargs) 182 | 183 | cli.add_command(run_example_local_cmd) 184 | cli.add_command(run_example_dry_cmd) 185 | cli.add_command(run_example_cluster_cmd) 186 | 187 | # Alias for run_example_local 188 | cli.add_command(run_example_local_cmd, name='launch_example_local') 189 | # Alias for run_example_dry 190 | cli.add_command(run_example_dry_cmd, name='launch_example_dry') 191 | # Alias for run_example_debug 192 | cli.add_command(run_example_debug_cmd, name='launch_example_debug') 193 | cli.add_command(launch_example_cluster_cmd) 194 | cli.add_command(launch_example_gce_cmd) 195 | cli.add_command(launch_example_ec2_cmd) 196 | 197 | 198 | def main(): 199 | return cli() 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | -------------------------------------------------------------------------------- /scripts/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import glob 4 | import pickle 5 | import sys 6 | import pdb 7 | import importlib 8 | from dotmap import DotMap 9 | 10 | import tensorflow as tf 11 | import ray 12 | from ray import tune 13 | from ray.autoscaler.commands import exec_cluster 14 | 15 | from envs.utils import get_env_from_params 16 | from algorithms.utils import get_algorithm_from_params 17 | from policies.utils import get_policy_from_params 18 | from buffers.utils import get_buffer_from_params 19 | from samplers.utils import get_sampler_from_params 20 | from utilities.utils import set_seed, initialize_tf_variables 21 | from utilities.instrument import create_trial_name_creator 22 | 23 | class SimpleExperiment(tune.Trainable): 24 | def _setup(self, params): 25 | self._params = params 26 | 27 | #### set up tf session 28 | set_seed(params['run_params']['seed']) 29 | gpu_options = tf.GPUOptions(allow_growth=True) 30 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 31 | tf.keras.backend.set_session(session) 32 | 33 | self._session = tf.keras.backend.get_session() 34 | self.train_generator = None 35 | self._built = False 36 | 37 | def _stop(self): 38 | tf.reset_default_graph() 39 | tf.keras.backend.clear_session() 40 | 41 | def _build(self): 42 | """ 43 | called by tune to build algorithm 44 | """ 45 | 46 | #### set up building blocks for algorithm 47 | params = copy.deepcopy(self._params) 48 | env_params = params['environment_params'] 49 | env = self.env = ( 50 | get_env_from_params(env_params)) 51 | 52 | buffer = self.buffer = ( 53 | get_buffer_from_params(params, env)) 54 | 55 | sampler = self.sampler = get_sampler_from_params(params) 56 | 57 | policy = self.policy = get_policy_from_params( 58 | params, env, self._session) 59 | 60 | #### build algorithm 61 | self.algorithm = get_algorithm_from_params( 62 | variant=self._params, 63 | env=env, 64 | policy=policy, 65 | buffer=buffer, 66 | sampler=sampler, 67 | session=self._session) 68 | 69 | #### finalize graph 70 | initialize_tf_variables(self._session, only_uninitialized=True) 71 | tf.get_default_graph().finalize() 72 | 73 | #### set train generator function 74 | self.train_generator = self.algorithm.train() 75 | self._built = True 76 | 77 | def _train(self): 78 | if not self._built: 79 | self._build() 80 | 81 | diagnostics = next(self.train_generator) 82 | return diagnostics 83 | 84 | def main(argv=None): 85 | """ 86 | run simple ray tune experiment. 87 | 88 | Please provide config file location, e.g. 
89 | 90 | 91 | """ 92 | assert argv[0] is not None, "Please provide config file location, e.g." 93 | 94 | #### create 95 | base_module = 'configs.baseconfig' 96 | base_module = importlib.import_module(base_module) 97 | 98 | #### tune configs 99 | trial_name_template = 'seed:{trial.config[run_params][seed]}' 100 | trial_name_creator = create_trial_name_creator(trial_name_template) ## generator for trial name (determines logdir) 101 | gpus=1 ## gpus to be used 102 | trial_gpus=1 ## gpus to be used in trial 103 | mode='local' ## local or remote, currently only local supported 104 | 105 | config=str(argv[0]) ## config file location 106 | 107 | exp_config = DotMap(dict( 108 | gpus=gpus, 109 | trial_gpus=trial_gpus, 110 | mode=mode, 111 | config=config, 112 | )) 113 | 114 | ### build the experiment 115 | exp_config = base_module.get_variant_spec(exp_config) ## merge base config and config file to final config 116 | exp_id = exp_config.get('exp_name') ## name of the experiment 117 | exp_class = SimpleExperiment ## tune trainable class that runs the experiments 118 | local_dir = os.path.join(exp_config.get('log_dir'), exp_config.get('task')) ## directory for tf summaries, configs etc. 119 | 120 | ### define experiment 121 | experiment = { 122 | exp_id:{ 123 | 'run': exp_class, 124 | 'config': exp_config, 125 | 'local_dir': local_dir, 126 | 'trial_name_creator': trial_name_creator, 127 | } 128 | } 129 | 130 | ### initialize ray und run experiments 131 | ray.init( 132 | num_gpus=gpus, 133 | local_mode=True, 134 | object_store_memory=100 * 1024 * 1024, #@anyboby TODO: test the memory config 135 | ) 136 | 137 | tune.run_experiments( 138 | experiment, 139 | server_port=4321, 140 | ) 141 | 142 | if __name__ == '__main__': 143 | main(argv=sys.argv[1:]) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | setup( 5 | name='cmbpo', 6 | packages=find_packages(), 7 | version='0.0.1', 8 | description='Constrained Model-based policy optimization', 9 | long_description=open('./README.md').read(), 10 | author='Moritz Zanger', 11 | author_email='zanger.moritz@gmail.com', 12 | entry_points={ 13 | 'console_scripts': ( 14 | 'cmbpo=scripts.console_scripts:main', 15 | ) 16 | }, 17 | requires=(), 18 | zip_safe=True, 19 | license='MIT' 20 | ) 21 | -------------------------------------------------------------------------------- /utilities/logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import pdb 4 | 5 | 6 | 7 | def update_dict(dict_a, dict_b, weight_a=.5, weight_b=.5): 8 | """ 9 | creates new updated dict and adds entries according to weights. 
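    e.g. update_dict({'a': 1.}, {'a': 3.}) returns {'a': 2.0} with the default weights;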
10 | for both weights = 1 the entries are added 11 | """ 12 | dict_a_cp = dict(dict_a) 13 | dict_a_cp.update(dict_b) 14 | for k,v in dict_b.items(): 15 | if k in dict_a.keys(): 16 | dict_a_cp[k] = weight_b*dict_b[k] + weight_a*dict_a[k] 17 | return dict_a_cp 18 | 19 | class Progress: 20 | 21 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100): 22 | self.total = total 23 | self.name = name 24 | self.ncol = ncol 25 | self.max_length = max_length 26 | self.indent = indent 27 | self.line_width = line_width 28 | self._speed_update_freq = speed_update_freq 29 | 30 | self._step = 0 31 | self._prev_line = '\033[F' 32 | self._clear_line = ' ' * self.line_width 33 | 34 | self._pbar_size = self.ncol * self.max_length 35 | self._complete_pbar = '#' * self._pbar_size 36 | self._incomplete_pbar = ' ' * self._pbar_size 37 | 38 | self.lines = [''] 39 | self.fraction = '{} / {}'.format(0, self.total) 40 | 41 | self.resume() 42 | 43 | 44 | def update(self, n=1): 45 | self._step += n 46 | if self._step % self._speed_update_freq == 0: 47 | self._time0 = time.time() 48 | self._step0 = self._step 49 | 50 | def resume(self): 51 | self._skip_lines = 1 52 | print('\n', end='') 53 | self._time0 = time.time() 54 | self._step0 = self._step 55 | 56 | def pause(self): 57 | self._clear() 58 | self._skip_lines = 1 59 | 60 | def set_description(self, params=[]): 61 | 62 | ############ 63 | # Position # 64 | ############ 65 | self._clear() 66 | 67 | ########### 68 | # Percent # 69 | ########### 70 | percent, fraction = self._format_percent(self._step, self.total) 71 | self.fraction = fraction 72 | 73 | ######### 74 | # Speed # 75 | ######### 76 | speed = self._format_speed(self._step) 77 | 78 | ########## 79 | # Params # 80 | ########## 81 | num_params = len(params) 82 | nrow = math.ceil(num_params / self.ncol) 83 | params_split = self._chunk(params, self.ncol) 84 | params_string, lines = self._format(params_split) 85 | self.lines = lines 86 | 87 | 88 | description = '{} | {}{}'.format(percent, speed, params_string) 89 | print(description) 90 | self._skip_lines = nrow + 1 91 | 92 | def append_description(self, descr): 93 | self.lines.append(descr) 94 | 95 | def _clear(self): 96 | position = self._prev_line * self._skip_lines 97 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)]) 98 | print(position, end='') 99 | print(empty) 100 | print(position, end='') 101 | 102 | def _format_percent(self, n, total): 103 | if total: 104 | percent = n / float(total) 105 | 106 | complete_entries = int(percent * self._pbar_size) 107 | incomplete_entries = self._pbar_size - complete_entries 108 | 109 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries] 110 | fraction = '{} / {}'.format(n, total) 111 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100)) 112 | else: 113 | fraction = '{}'.format(n) 114 | string = '{} iterations'.format(n) 115 | return string, fraction 116 | 117 | def _format_speed(self, n): 118 | num_steps = n - self._step0 119 | t = time.time() - self._time0 120 | speed = num_steps / t 121 | string = '{:.1f} Hz'.format(speed) 122 | if num_steps > 0: 123 | self._speed = string 124 | return string 125 | 126 | def _chunk(self, l, n): 127 | return [l[i:i+n] for i in range(0, len(l), n)] 128 | 129 | def _format(self, chunks): 130 | lines = [self._format_chunk(chunk) for chunk in chunks] 131 | lines.insert(0,'') 132 | padding = '\n' + ' '*self.indent 133 | string = padding.join(lines) 134 | 
return string, lines 135 | 136 | def _format_chunk(self, chunk): 137 | line = ' | '.join([self._format_param(param) for param in chunk]) 138 | return line 139 | 140 | def _format_param(self, param): 141 | k, v = param 142 | return '{} : {}'.format(k, v)[:self.max_length] 143 | 144 | def stamp(self): 145 | if self.lines != ['']: 146 | params = ' | '.join(self.lines) 147 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed) 148 | self._clear() 149 | print(string, end='\n') 150 | self._skip_lines = 1 151 | else: 152 | self._clear() 153 | self._skip_lines = 0 154 | 155 | def close(self): 156 | self.pause() 157 | 158 | class Silent: 159 | 160 | def __init__(self, *args, **kwargs): 161 | pass 162 | 163 | def __getattr__(self, attr): 164 | return lambda *args: None 165 | 166 | 167 | if __name__ == '__main__': 168 | silent = Silent() 169 | silent.update() 170 | silent.stamp() 171 | 172 | num_steps = 1000 173 | progress = Progress(num_steps) 174 | for i in range(num_steps): 175 | progress.update() 176 | params = [ 177 | ['A', '{:06d}'.format(i)], 178 | ['B', '{:06d}'.format(i)], 179 | ['C', '{:06d}'.format(i)], 180 | ['D', '{:06d}'.format(i)], 181 | ['E', '{:06d}'.format(i)], 182 | ['F', '{:06d}'.format(i)], 183 | ['G', '{:06d}'.format(i)], 184 | ['H', '{:06d}'.format(i)], 185 | ] 186 | progress.set_description(params) 187 | time.sleep(0.01) 188 | progress.close() 189 | -------------------------------------------------------------------------------- /utilities/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from utilities.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. _`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 
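        Gradients are flattened and concatenated, summed across MPI workers with Allreduce,
        divided by the number of processes, and split back into the original per-variable shapes.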
48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /utilities/mpi_tools.py: -------------------------------------------------------------------------------- 1 | 2 | from mpi4py import MPI 3 | import os, subprocess, sys 4 | import numpy as np 5 | 6 | 7 | def mpi_fork(n, bind_to_core=False): 8 | """ 9 | Re-launches the current script with workers linked by MPI. 10 | 11 | Also, terminates the original process that launched it. 12 | 13 | Taken almost without modification from the Baselines function of the 14 | `same name`_. 15 | 16 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 17 | 18 | Args: 19 | n (int): Number of process to split into. 20 | 21 | bind_to_core (bool): Bind each MPI process to a core. 22 | """ 23 | if n<=1: 24 | return 25 | if os.getenv("IN_MPI") is None: 26 | env = os.environ.copy() 27 | env.update( 28 | MKL_NUM_THREADS="1", 29 | OMP_NUM_THREADS="1", 30 | IN_MPI="1" 31 | ) 32 | args = ["mpirun", "-np", str(n)] 33 | if bind_to_core: 34 | args += ["-bind-to", "core"] 35 | args += [sys.executable] + sys.argv 36 | subprocess.check_call(args, env=env) 37 | sys.exit() 38 | 39 | 40 | def msg(m, string=''): 41 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 42 | 43 | def proc_id(): 44 | """Get rank of calling process.""" 45 | return MPI.COMM_WORLD.Get_rank() 46 | 47 | def allreduce(*args, **kwargs): 48 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 49 | 50 | def num_procs(): 51 | """Count active MPI processes.""" 52 | return MPI.COMM_WORLD.Get_size() 53 | 54 | def broadcast(x, root=0): 55 | MPI.COMM_WORLD.Bcast(x, root=root) 56 | 57 | def mpi_op(x, op): 58 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 59 | x = np.asarray(x, dtype=np.float32) 60 | buff = np.zeros_like(x, dtype=np.float32) 61 | allreduce(x, buff, op=op) 62 | return buff[0] if scalar else buff 63 | 64 | def mpi_sum(x): 65 | return mpi_op(x, MPI.SUM) 66 | 67 | def mpi_avg(x): 68 | """Average a scalar or vector over MPI processes.""" 69 | return mpi_sum(x) / num_procs() 70 | 71 | def mpi_statistics_scalar(x, with_min_and_max=False): 72 | """ 73 | Get mean/std and optional min/max of scalar x across MPI processes. 
74 | 75 | Args: 76 | x: An array containing samples of the scalar to produce statistics 77 | for. 78 | 79 | with_min_and_max (bool): If true, return min and max of x in 80 | addition to mean and std. 81 | """ 82 | x = np.array(x, dtype=np.float32) 83 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 84 | mean = global_sum / global_n 85 | 86 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 87 | std = np.sqrt(global_sum_sq / global_n) # compute global std 88 | 89 | if with_min_and_max: 90 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 91 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 92 | return mean, std, global_min, global_max 93 | return mean, std -------------------------------------------------------------------------------- /utilities/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. """ 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | serializables = {} 10 | for k,v in obj.items(): 11 | if is_json_serializable(k) and is_json_serializable(v): 12 | serializables[convert_json(k)]=convert_json(v) 13 | 14 | return serializables 15 | 16 | elif isinstance(obj, tuple): 17 | return (convert_json(x) for x in obj) 18 | 19 | elif isinstance(obj, list): 20 | return [convert_json(x) for x in obj] 21 | 22 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 23 | return convert_json(obj.__name__) 24 | 25 | elif hasattr(obj,'__dict__') and obj.__dict__: 26 | obj_dict = {convert_json(k): convert_json(v) 27 | for k,v in obj.__dict__.items()} 28 | return {str(obj): obj_dict} 29 | 30 | return str(obj) 31 | 32 | def is_json_serializable(v): 33 | try: 34 | json.dumps(v) 35 | return True 36 | except: 37 | return False -------------------------------------------------------------------------------- /utilities/trust_region.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from utilities.utils import EPS 5 | """ 6 | Tensorflow utilities for trust region optimization 7 | """ 8 | 9 | def flat_concat(xs): 10 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 11 | 12 | def flat_grad(f, params): 13 | return flat_concat(tf.gradients(xs=params, ys=f)) 14 | 15 | def hessian_vector_product(f, params): 16 | # for H = grad**2 f, compute Hx 17 | g = flat_grad(f, params) 18 | x = tf.placeholder(tf.float32, shape=g.shape) 19 | return x, flat_grad(tf.reduce_sum(g*x), params) 20 | 21 | def assign_params_from_flat(x, params): 22 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 23 | splits = tf.split(x, [flat_size(p) for p in params]) 24 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 25 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 26 | 27 | 28 | """ 29 | Conjugate gradient 30 | """ 31 | 32 | def cg(Ax, b, cg_iters=10): 33 | x = np.zeros_like(b) 34 | r = b.copy() # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
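    # the initial residual also serves as the first search direction; subsequent
    # directions are made conjugate via the (r_dot_new / r_dot_old) update below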
35 | p = r.copy() 36 | r_dot_old = np.dot(r,r) 37 | for _ in range(cg_iters): 38 | z = Ax(p) 39 | alpha = r_dot_old / (np.dot(p, z) + EPS) 40 | x += alpha * p 41 | r -= alpha * z 42 | r_dot_new = np.dot(r,r) 43 | p = r + (r_dot_new / r_dot_old) * p 44 | r_dot_old = r_dot_new 45 | return x --------------------------------------------------------------------------------
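A minimal usage sketch for the conjugate-gradient helper above (illustrative, not part of the repository): cg solves H x = g for a symmetric positive-definite H that is available only through matrix-vector products, which is how the trust-region update supplies Hessian-vector products. The toy matrix and tolerance below are assumptions for demonstration.

import numpy as np
from utilities.trust_region import cg

# H is only ever touched through a matvec closure, mirroring hessian_vector_product
H = np.array([[4.0, 1.0],
              [1.0, 3.0]], dtype=np.float32)
g = np.array([1.0, 2.0], dtype=np.float32)

x = cg(lambda v: H @ v, g, cg_iters=10)     # approx. [0.0909, 0.6364]
assert np.allclose(H @ x, g, atol=1e-4)     # the 2x2 system is solved within a few iterations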