├── .gitignore
├── README.md
├── algorithms
│   ├── cmbpo.py
│   ├── rl_algorithm.py
│   └── utils.py
├── buffers
│   ├── cpobuffer.py
│   ├── modelbuffer.py
│   └── utils.py
├── cmbpo.yml
├── configs
│   ├── baseconfig
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── main.py
│   │   └── utils.py
│   ├── cmbpo_antsafe.py
│   ├── cmbpo_hcs.py
│   ├── cmbpo_hs.py
│   ├── cpo_hcs.py
│   └── trpo_hcs.py
├── envs
│   ├── __init__.py
│   ├── mujoco_safety_gym
│   │   ├── __init__.py
│   │   └── envs
│   │       ├── __init__.py
│   │       ├── ant.py
│   │       ├── ant_viz.py
│   │       ├── assets
│   │       │   ├── ant.xml
│   │       │   ├── ant_viz.xml
│   │       │   ├── fetch
│   │       │   │   ├── pick_and_place.xml
│   │       │   │   ├── push.xml
│   │       │   │   ├── reach.xml
│   │       │   │   ├── robot.xml
│   │       │   │   ├── shared.xml
│   │       │   │   └── slide.xml
│   │       │   ├── half_cheetah.xml
│   │       │   ├── hopper.xml
│   │       │   ├── humanoid.xml
│   │       │   └── textures
│   │       │       ├── block.png
│   │       │       └── block_hidden.png
│   │       ├── fetch
│   │       │   ├── pick_and_place.py
│   │       │   ├── push.py
│   │       │   ├── reach.py
│   │       │   └── slide.py
│   │       ├── fetch_env.py
│   │       ├── half_cheetah.py
│   │       ├── hopper.py
│   │       ├── humanoid.py
│   │       ├── mujoco_env.py
│   │       └── robot_env.py
│   ├── utils.py
│   └── wrappers
│       ├── __init__.py
│       └── normalize_action.py
├── models
│   ├── base_model.py
│   ├── fake_env.py
│   ├── pens
│   │   ├── __init__.py
│   │   ├── fc.py
│   │   ├── logger.py
│   │   ├── pe.py
│   │   ├── pe_factory.py
│   │   └── utils.py
│   └── statics.py
├── network
│   └── ac_network.py
├── policies
│   ├── base_policy.py
│   ├── cpo_policy.py
│   └── utils.py
├── requirements.txt
├── samplers
│   ├── __init__.py
│   ├── base_sampler.py
│   ├── cpo_sampler.py
│   ├── model_sampler.py
│   ├── simple_sampler.py
│   └── utils.py
├── scripts
│   ├── console_scripts.py
│   └── run.py
├── setup.py
└── utilities
    ├── instrument.py
    ├── logging.py
    ├── logx.py
    ├── mpi_tf.py
    ├── mpi_tools.py
    ├── serialization_utils.py
    ├── trust_region.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | *.stl
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 | /environment/src/
31 | /src/
32 | /softlearning/environments/rllab/
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # Environments
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 |
110 | # soft learning specific things
111 | *.swp
112 | .idea
113 | *.mp4
114 | data/
115 | vis/
116 | tmp/
117 | vendor/*
118 | .pkl
119 |
120 |
121 | .mujoco/
122 | .vscode/
123 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Constrained Model-Based Policy Optimization
2 |
3 |
4 |
5 |
6 |
7 |
8 | This repository contains code for Constrained Model-Based Policy Optimization (CMBPO), a model-based version of Constrained Policy Optimization (Achiam et al.). Installation, execution and code examples for the reproduction of the experiments described in [Safe Continuous Control with Constrained Model-Based Policy Optimization](https://arxiv.org/abs/2104.06922?context=cs) are provided below.
9 |
10 | # Prerequisites
11 |
12 | 1. The simulation experiments using [mujoco-Py](https://github.com/openai/mujoco-py) require a working install of [MuJoCo 2.0](https://www.roboti.us/license.html) and a valid license.
13 | 2. We use conda environments for installation (tested on conda 4.6 - 4.10); please refer to [Anaconda](https://docs.anaconda.com/anaconda/install/) for instructions.
14 |
15 | # Installation
16 |
17 | 1. Clone this repository
18 | ```
19 | git clone https://github.com/anyboby/Constrained-Model-Based-Policy-Optimization.git
20 | ```
21 | 2. Create a conda environment using the cmbpo yml-file
22 | ```sh
23 | cd Constrained-Model-Based-Policy-Optimization/
24 | conda env create -f cmbpo.yml
25 | conda activate cmbpo
26 | pip install -e .
27 | ```
28 | This should create a conda environment labeled 'cmbpo' with the necessary packages and modules. The number of required modules is small, so it is worth taking a look at the [cmbpo.yml](cmbpo.yml) and [requirements.txt](requirements.txt) files if you run into installation trouble.
29 |
30 | # Usage
31 | To start an experiment with cmbpo, run
32 | ```sh
33 | cmbpo run_local configs.baseconfig --config=configs.cmbpo_hcs --gpus=1 --trial-gpus=1
34 | ```
35 |
36 | `--config` specifies the configuration file for the experiment (here: CMBPO for HalfCheetahSafe)\
37 | `--gpus` specifies the number of GPUs to use
38 |
39 | A list of all available flags is provided in [baseconfig/utils](configs/baseconfig/utils.py). As of writing, only local running is supported. For further options, refer to the ray documentation.
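
For example, a run that repeats the trial three times and writes a checkpoint every 50 training iterations could be launched as follows (a sketch; all flags used here are defined in the parser linked above):
```sh
cmbpo run_local configs.baseconfig --config=configs.cmbpo_hcs --trial-cpus=4 --num-samples=3 --checkpoint-frequency=50
```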
40 |
41 | The `cmbpo` command uses the [console scripts](scripts/console_scripts.py) as an entry point for running experiments. A simple workflow of running experiments with ray-tune is illustrated in [run.py](scripts/run.py), which can be executed with
42 | ```sh
43 | python scripts/run.py configs.cmbpo_hcs
44 | ```
45 |
46 | ## Algorithms
47 | Constrained Model-Based Policy Optimization combines Constrained Policy Optimization with model-based data augmentation and reconciles constraint satisfaction with the model errors this entails.
48 |
49 | This repository can therefore also be used to run experiments with model-free versions of Constrained Policy Optimization and Trust-Region Policy Optimization by configuring the `use_model` and `constrain_cost` flags accordingly in the experiment configurations (see [CPO - HalfCheetahSafe](configs/cpo_hcs.py) and [TRPO - HalfCheetahSafe](configs/trpo_hcs.py)):
50 | ```py
51 | 'use_model': False, # set to True for model-based
52 |     'constrain_cost': False,    # set to True for cost-constrained optimization
53 | ```
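
For instance, the model-free baselines can be launched through the same entry point by pointing `--config` at these files (a sketch, analogous to the CMBPO command above):
```sh
cmbpo run_local configs.baseconfig --config=configs.cpo_hcs   # model-free CPO
cmbpo run_local configs.baseconfig --config=configs.trpo_hcs  # model-free TRPO
```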
54 |
55 | ## Adding new environments and running custom experiments
56 | Different environments can be tested by creating a config file in the [configs](configs/) directory. OpenAI Gym environments can be loaded directly with the corresponding parameters, for example:
57 | ```py
58 | 'universe': 'gym',
59 | 'task': 'HalfCheetahSafe-v2',
60 | ```
61 | Environments from other sources require an entry in the `ENVS_FUNCTIONS` dict in the [environment utils](envs/utils.py) that specifies how to create an instance of the environment. For example, the Gym environments are specified with the following entries:
62 | ```py
63 | def get_gym_env():
64 | import gym
65 |
66 | return gym.make
67 |
68 | ENVS_FUNCTIONS = {
69 | 'gym':get_gym_env()
70 | }
71 | ```
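
An entry for a different environment suite follows the same pattern: the factory has to return a callable that builds an environment from a task name. The sketch below is illustrative only; the `'my_suite'` key and `MySuiteEnv` class are hypothetical placeholders, not part of this repository:
```py
def get_my_suite_env():
    from my_suite import MySuiteEnv  # hypothetical package

    def make(task, **kwargs):
        return MySuiteEnv(task, **kwargs)

    return make

ENVS_FUNCTIONS = {
    'gym': get_gym_env(),
    'my_suite': get_my_suite_env(),
}
```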
72 |
73 | ## Model-Learning with custom environments
74 | When using a model with custom environments, the model requires a few interfaces to function with the provided code. The [base model](models/base_model.py) should be inherited by a learned (or handcrafted) model and specify whether rewards, costs, and termination functions are predicted alongside the dynamics.
75 |
76 | By default, our algorithm learns to predict rewards but assumes handcrafted cost and termination functions `c(s,a,s')` and `t(s,a,s')`. When adding a new environment, these functions should be defined (if not provided by the model) in the [statics](models/statics.py) file. For example, a default termination function that never terminates an episode looks like this:
77 | ```py
78 | def no_done(obs, act, next_obs):
79 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
80 |
81 | done = np.zeros(shape=obs.shape[:-1], dtype=np.bool) #always false
82 | done = done[...,None]
83 | return done
84 | ```
85 | The static functions should then be linked by the environment's task name, such that the [Fake Environment](models/fake_env.py) correctly discovers them:
86 | ```py
87 | TERMS_BY_TASK = {
88 | 'default':no_done,
89 | 'HalfCheetah-v2':no_done,
90 | }
91 | ```
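
A handcrafted cost function uses the same `(obs, act, next_obs)` signature and should be registered under the task name analogously to `TERMS_BY_TASK` (check [statics](models/statics.py) for the exact name of the cost lookup dict). The following is only a sketch; the observation index and threshold are illustrative placeholders:
```py
import numpy as np

def my_task_cost(obs, act, next_obs):
    assert len(obs.shape) == len(next_obs.shape) == len(act.shape)

    # illustrative: cost of 1 whenever the first observation entry
    # leaves a hypothetical safe range, 0 otherwise
    unsafe = np.abs(next_obs[..., 0]) > 1.0
    cost = unsafe.astype(np.float32)[..., None]
    return cost
```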
92 |
93 | ## Hyperparameters
94 | Hyperparameters for a new experiment can be defined in the [configs](configs/) folder. Our config files follow this general structure:
95 | ```py
96 | params = {
97 | 'universe': 'gym',
98 | 'task': 'HalfCheetahSafe-v2',
99 | 'algorithm_params': {...},
100 | 'policy_params':{...},
101 | 'buffer_params': {...},
102 | 'sampler_params': {...},
103 | 'run_params': {...},
104 | }
105 | ```
106 | Parameters specified in a config file overwrite the [base config](configs/baseconfig/base.py) file. For new algorithms or a new suite of environments, it might be practical to directly change the base config.
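
Because of this override mechanism, a new config only needs to list the entries that differ from the base config. A minimal sketch for another registered task might look like this (all omitted keys fall back to the base config):
```py
params = {
    'universe': 'gym',
    'task': 'HopperSafe-v2',
    'algorithm_params': {
        'type': 'CMBPO',
        'kwargs': {'use_model': True},
    },
    'policy_params': {'type': 'cpopolicy', 'kwargs': {}},
    'buffer_params': {},
    'sampler_params': {},
    'run_params': {},
}
```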
107 |
108 | In addition to model and policy parameters, the main parameters of concern in CMBPO define the rollout and sampling behavior of the algorithm.
109 | ```py
110 | 'n_initial_exploration_steps': int(10000), ### number of initial exploration steps for model-learning and
111 | # determining uncertainty calibration measurements
112 |     'sampling_alpha': 2,                        ### temperature for Boltzmann sampling
113 | 'rollout_mode' : 'uncertainty', ### model rollouts terminate based on per-step uncertainty
114 | 'rollout_schedule': [10, 500, 5, 30], ### if rollout_mode:'schedule' this schedule is defined as
115 | # [min_epoch, max_epoch, min_horizon, max_horizon]
116 | ## if rollout_mode:'uncertainty', 'min_horizon' is used as
117 | # the initial rollout horizon and adapted throughout
118 | # training based on per-step uncertainty estimates
119 | # (KL-Divergence).
120 | 'batch_size_policy': 50000, ### batch size per policy update
121 | 'initial_real_samples_per_epoch': 1500, ### initial number of real samples per policy update,
122 | # adapted throughout training based on average uncertainty
123 | # estimates (mean KL-Divergence).
124 | 'min_real_samples_per_epoch': 500, ### absolute minimum number of real samples per policy update
125 | ```
126 | ## Logging
127 | A range of measurements is logged automatically in TensorBoard, and the parameter configuration is saved as a JSON file. The location for summaries and checkpoints can be defined by specifying a `'log_dir'` in the configuration files. By default, this location is set to `'~/ray_cmbpo/{env-task}/defaults/{seed}'` and can be accessed with TensorBoard by
128 | ```sh
129 | tensorboard --logdir ~/ray_cmbpo/{env-task}/defaults/
130 | ```
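
If you override `'log_dir'`, point TensorBoard at that directory instead. A minimal sketch of such an override in a config file (the path is a placeholder):
```py
params = {
    # ...
    'log_dir': '~/experiments/cmbpo_runs',
}
```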
131 |
132 | # Acknowledgments
133 | Several sections of this repository contain code from other repositories, notably from [Tuomas Haarnoja](https://scholar.google.com/citations?user=VT7peyEAAAAJ&hl=en) and [Kristian Hartikainen's softlearning](https://github.com/rail-berkeley/softlearning), [Michael Janner's mbpo](https://github.com/JannerM/mbpo), [Kurtland Chua's handful-of-trials](https://github.com/kchua/handful-of-trials), and [Joshua Achiam and Alex Ray's safety-starter-agents](https://github.com/openai/safety-starter-agents) (CPO).
134 |
--------------------------------------------------------------------------------
/algorithms/rl_algorithm.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from collections import OrderedDict
3 | from itertools import count
4 | import gtimer as gt
5 | import math
6 | import os
7 | import pdb
8 |
9 | import tensorflow as tf
10 | import numpy as np
11 |
12 | from utilities.utils import save_video
13 |
14 |
15 | class RLAlgorithm(tf.contrib.checkpoint.Checkpointable):
16 | """Abstract RLAlgorithm.
17 |
18 |     Provides the shared hooks, sampling helpers, and bookkeeping used
19 |     by classes inheriting from RLAlgorithm; subclasses implement _train.
20 | """
21 |
22 | def __init__(
23 | self,
24 | sampler,
25 | n_epochs=int(10e7),
26 | n_initial_exploration_steps=0,
27 | initial_exploration_policy=None,
28 | epoch_length=1000,
29 | eval_n_episodes=10,
30 | eval_deterministic=True,
31 | eval_render_mode=None,
32 | video_save_frequency=0,
33 | session=None,
34 | ):
35 | """
36 | Args:
37 | n_epochs (`int`): Number of epochs to run the training for.
38 | n_initial_exploration_steps: Number of steps in the beginning to
39 | take using actions drawn from a separate exploration policy.
40 | initial_exploration_policy: policy to follow during initial
41 | exploration hook
42 | epoch_length (`int`): Epoch length.
43 | eval_n_episodes (`int`): Number of rollouts to evaluate.
44 | eval_deterministic (`int`): Whether or not to run the policy in
45 | deterministic mode when evaluating policy.
46 | eval_render_mode (`str`): Mode to render evaluation rollouts in.
47 | None to disable rendering.
48 | """
49 | self.sampler = sampler
50 |
51 | self._n_epochs = n_epochs
52 | self._epoch_length = epoch_length
53 | self._n_initial_exploration_steps = n_initial_exploration_steps
54 | self._initial_exploration_policy = initial_exploration_policy
55 |
56 | self._eval_n_episodes = eval_n_episodes
57 | self._eval_deterministic = eval_deterministic
58 | self._video_save_frequency = video_save_frequency
59 |
60 | if self._video_save_frequency > 0:
61 | assert eval_render_mode != 'human', (
62 | "RlAlgorithm cannot render and save videos at the same time")
63 | self._eval_render_mode = 'rgb_array'
64 | else:
65 | self._eval_render_mode = eval_render_mode
66 |
67 | self._session = session or tf.keras.backend.get_session()
68 |
69 | self._epoch = 0
70 | self._timestep = 0
71 | self._num_train_steps = 0
72 |
73 | def _initial_exploration_hook(self, env, initial_exploration_policy, pool):
74 | if self._n_initial_exploration_steps < 1: return
75 |
76 | if not initial_exploration_policy:
77 | raise ValueError(
78 | "Initial exploration policy must be provided when"
79 | " n_initial_exploration_steps > 0.")
80 |
81 | self.sampler.initialize(env, initial_exploration_policy, pool)
82 | while pool.size < self._n_initial_exploration_steps:
83 | self.sampler.sample()
84 |
85 | def _training_before_hook(self):
86 | """Method called before the actual training loops."""
87 | pass
88 |
89 | def _training_after_hook(self):
90 | """Method called after the actual training loops."""
91 | pass
92 |
93 | def _timestep_before_hook(self, *args, **kwargs):
94 | """Hook called at the beginning of each timestep."""
95 | pass
96 |
97 | def _timestep_after_hook(self, *args, **kwargs):
98 | """Hook called at the end of each timestep."""
99 | pass
100 |
101 | def _epoch_before_hook(self):
102 | """Hook called at the beginning of each epoch."""
103 | self._train_steps_this_epoch = 0
104 |
105 | def _epoch_after_hook(self, *args, **kwargs):
106 | """Hook called at the end of each epoch."""
107 | pass
108 |
109 | def _training_batch(self, batch_size=None):
110 | return self.sampler.random_batch(batch_size)
111 |
112 | def _evaluation_batch(self, *args, **kwargs):
113 | return self._training_batch(*args, **kwargs)
114 |
115 | @property
116 | def _training_started(self):
117 | return self._total_timestep > 0
118 |
119 | @property
120 | def _total_timestep(self):
121 | total_timestep = self._epoch * self._epoch_length + self._timestep
122 | return total_timestep
123 |
124 | def _train(self):
125 | """Return a generator that performs RL training.
126 | """
127 | raise NotImplementedError
128 |
129 | @abc.abstractmethod
130 | def get_diagnostics(self,):
131 | raise NotImplementedError
132 |
133 | @property
134 | def ready_to_train(self):
135 | return self.sampler.batch_ready()
136 |
137 | def _do_sampling(self, timestep):
138 | return self.sampler.sample()
139 |
140 | @property
141 | def tf_saveables(self):
142 | return {}
143 |
144 | def __getstate__(self):
145 | state = {
146 | '_epoch_length': self._epoch_length,
147 | '_epoch': (
148 | self._epoch + int(self._timestep >= self._epoch_length)),
149 | '_timestep': self._timestep % self._epoch_length,
150 | }
151 |
152 | return state
153 |
154 | def __setstate__(self, state):
155 | self.__dict__.update(state)
156 |
--------------------------------------------------------------------------------
/algorithms/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from dotmap import DotMap
3 | from collections import OrderedDict
4 |
5 | def create_CMBPO_algorithm(variant, *args, **kwargs):
6 | from algorithms.cmbpo import CMBPO
7 | algorithm = CMBPO(*args, **kwargs)
8 |
9 | return algorithm
10 |
11 |
12 | ALGORITHM_CLASSES = {
13 | 'CMBPO': create_CMBPO_algorithm,
14 | }
15 |
16 |
17 | def get_algorithm_from_params(variant,
18 | *args,
19 | **kwargs):
20 | algorithm_params = variant['algorithm_params']
21 | algorithm_type = algorithm_params['type']
22 | algorithm_kwargs = deepcopy(algorithm_params['kwargs'])
23 |     # @anyboby: workaround for local_example_debug mode; for some reason algorithm_kwargs arrives
24 |     # as a DotMap instead of an OrderedDict, which doesn't work with double-asterisk unpacking.
25 | if isinstance(algorithm_kwargs, DotMap):
26 | algorithm_kwargs = algorithm_kwargs.toDict()
27 |
28 | algorithm = ALGORITHM_CLASSES[algorithm_type](
29 | variant, *args, **algorithm_kwargs, **kwargs)
30 |
31 | return algorithm
32 |
--------------------------------------------------------------------------------
/buffers/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 |
3 | def get_cpobuffer(env, *args, **kwargs):
4 | from buffers.cpobuffer import CPOBuffer
5 |
6 |     buffer = CPOBuffer(
7 |         *args,
8 |         observation_space=env.observation_space,
9 |         action_space=env.action_space,
10 |         **kwargs
11 |     )
12 |
13 | return buffer
14 |
15 | BUFFER_FUNCTIONS = {
16 | 'CPOBuffer': get_cpobuffer,
17 | }
18 |
19 | def get_buffer_from_params(params, env, *args, **kwargs):
20 | buffer_params = params['buffer_params']
21 | buffer_type = buffer_params['type']
22 | buffer_kwargs = deepcopy(buffer_params['kwargs'])
23 |
24 | buffer = BUFFER_FUNCTIONS[buffer_type](
25 | env,
26 | *args,
27 | **buffer_kwargs,
28 | **kwargs)
29 |
30 | return buffer
31 |
--------------------------------------------------------------------------------
/cmbpo.yml:
--------------------------------------------------------------------------------
1 | name: cmbpo
2 | channels:
3 | - anaconda
4 | - defaults
5 | dependencies:
6 | - click=7.0
7 | - matplotlib=3.3.4
8 | - mpi4py=3.0.3
9 | - pip=21.0.1
10 | - python=3.6.12
11 | - requests=2.20.1
12 | - tensorflow-gpu=1.14.0
13 | - pip:
14 | - conda-env-export==0.3.2
15 | - dotmap==1.3.8
16 | - gtimer==1.0.0b5
17 | - gym==0.18.0
18 | - joblib==0.14.1
19 | - mkl-fft==1.2.0
20 | - mkl-random==1.1.0
21 | - mkl-service==2.3.0
22 | - mujoco-py==2.0.2.13
23 | - olefile==0.46
24 | - pyOpenSSL==19.1.0
25 | - PySocks==1.7.1
26 | - PyYAML==5.4.1
27 | - ray==0.6.4
28 | - sip==4.19.24
29 | - tensorflow==1.14.0
30 | - tornado==6.0.4
31 | prefix: /home/mo/anaconda3/envs/cmbpo_test2
32 |
--------------------------------------------------------------------------------
/configs/baseconfig/__init__.py:
--------------------------------------------------------------------------------
1 | """Provides functions that are utilized by the command line interface.
2 |
3 | In particular, the examples are exposed to the command line interface
4 | (defined in `scripts.console_scripts`) through the
5 | `get_trainable_class`, `get_variant_spec`, and `get_parser` functions.
6 | """
7 |
8 |
9 | def get_trainable_class(*args, **kwargs):
10 | from .main import ExperimentRunner
11 | return ExperimentRunner
12 |
13 | def get_params_from_file(filepath, params_name='params'):
14 | import importlib
15 | from dotmap import DotMap
16 | module = importlib.import_module(filepath)
17 | params = getattr(module, params_name)
18 | params = DotMap(params)
19 | return params
20 |
21 | def get_variant_spec(command_line_args, *args, **kwargs):
22 | from .base import get_variant_spec
23 | import importlib
24 | params = get_params_from_file(command_line_args.config)
25 | variant_spec = get_variant_spec(command_line_args, *args, params, **kwargs)
26 | return variant_spec
27 |
28 | def get_parser():
29 | from .utils import get_parser
30 | parser = get_parser()
31 | return parser
32 |
--------------------------------------------------------------------------------
/configs/baseconfig/base.py:
--------------------------------------------------------------------------------
1 | from ray import tune
2 | import numpy as np
3 | import pdb
4 |
5 | from utilities.utils import deep_update
6 |
7 | M = 256 #256
8 |
9 | NUM_COUPLING_LAYERS = 2
10 |
11 | DEFAULT_MAX_PATH_LENGTH = 1000
12 |
13 | CPO_POLICY_PARAMS_BASE = {
14 | 'type': 'CPOPolicy',
15 | 'kwargs': {
16 | 'a_hidden_layer_sizes': (M, M), # policy network hidden layers
17 | 'constrain_cost': True, # constrain_cost=False will perform TRPO updates
18 | 'vf_lr': 3e-4, # learn rate for value learning
19 | 'vf_hidden_layer_sizes':(M,M), # nn hidden layers for vf
20 | 'vf_epochs': 8, # number of training epochs for values
21 | 'vf_batch_size': 2048, # minibatches for value training
22 | 'vf_ensemble_size': 3, # vf ensemble size
23 | 'vf_elites': 2, # vf elites
24 | 'vf_activation': 'swish', # activation function
25 | 'vf_loss': 'MSE', # choose from 'NLL', 'MSPE' (inc. var); 'MSE' ; 'Huber'
26 | 'vf_decay': 1e-6, # decay for nn regularization
27 | 'vf_clipping': False, # clip losses for a trust-region like vf update
28 |         'vf_kl_cliprange': 0.0,             # only applicable if vf_clipping=True
29 | 'ent_reg': 0, # 5e-3 # exploration bonus for maintaining pol. entropy
30 | 'target_kl': 0.01, # trust region diameter
31 | 'cost_lim': 10, # cost limit for whole task length
32 | 'cost_lam': .5, # gae lambda
33 | 'cost_gamma': 0.97, # discounts
34 | 'lam': .95, # gae lambda
35 | 'gamma': 0.99, # discounts
36 | 'epoch_length': tune.sample_from(lambda spec: (
37 | spec.get('config', spec)
38 | ['algorithm_params']['kwargs']['epoch_length']
39 | )),
40 | 'max_path_length': tune.sample_from(lambda spec: (
41 | spec.get('config', spec)
42 | ['sampler_params']['kwargs']['max_path_length']
43 | )),
44 | 'log_dir': tune.sample_from(lambda spec: (
45 | spec.get('config', spec)
46 | ['log_dir']
47 | )),
48 | }
49 | }
50 |
51 | POLICY_PARAMS_BASE = {
52 | 'CPOPolicy' : CPO_POLICY_PARAMS_BASE,
53 | }
54 |
55 | POLICY_PARAMS_BASE.update({
56 | 'cpopolicy': POLICY_PARAMS_BASE['CPOPolicy']
57 | })
58 |
59 | ALGORITHM_PARAMS = {
60 | 'CMBPO': {
61 | 'type': 'CMBPO',
62 | 'kwargs': {
63 | 'task': tune.sample_from(lambda spec: (
64 | spec.get('config', spec)
65 | ['environment_params']['task']
66 | )),
67 | 'n_env_interacts': int(10e6),
68 | 'epoch_length': 50000,
69 | 'eval_render_mode': 'human',
70 | 'eval_n_episodes': 1,
71 | 'eval_every_n_steps': 5e3,
72 | 'eval_deterministic': False,
73 | 'n_initial_exploration_steps': int(10000), # number of initial exploration steps for model-learning and
74 | # determining uncertainty calibration measurements
75 |             #### it is crucial to choose a model that doesn't overfit when trained repeatedly on seen data
76 |             ## for finding a model architecture: 1. play around with the initial samples to find an architecture that doesn't overfit
77 |             #                                    2. m_train_freq can somewhat limit overfitting, but it only treats the symptom
78 |             #                                    3. try to balance the number of new samples against the number of
79 |             #                                       model-network updates (controlled via m_train_freq)
80 | 'use_model': True,
81 | 'm_hidden_dims':(512,512), # hidden layer size of model bnn
82 | 'm_loss_type': 'MSPE',
83 | 'm_use_scaler_in': True,
84 | 'm_use_scaler_out': True,
85 | 'm_lr': 1e-3,
86 |             'm_train_freq': 4000,                   # model is trained every m_train_freq steps (when self._timestep % self._model_train_freq == 0); training stops early once it no longer improves
87 |             'rollout_batch_size': 1.0e3,            # number of randomly chosen start states per batch of model rollouts
88 | 'm_networks': 7, # size of model network ensemble
89 | 'm_elites': 5, # best networks to select from num_networks
90 | 'max_model_t': None, # a timeout for model training (e.g. for speeding up wallclock time)
91 |             'sampling_alpha': 2,                    # temperature for Boltzmann sampling
92 | 'rollout_mode' : 'uncertainty', #
93 | 'rollout_schedule': [10, 500, 5, 30], # if rollout_mode:'schedule' this schedule is defined as
94 | #[min_epoch, max_epoch, min_horizon, max_horizon]
95 | # if rollout_mode:'uncertainty', 'min_horizon' is used as
96 | # the initial rollout horizon and adapted throughout
97 | # training based on per-step uncertainty estimates
98 | # (KL-Divergence).
99 | 'maxroll': 35, # maximum rollout horizon
100 | 'batch_size_policy': 50000, # batch size per policy update
101 | 'initial_real_samples_per_epoch': 15000, # number of real samples contained in first batch
102 | 'min_real_samples_per_epoch': 500, # absolute minimum of samples
103 | }
104 | },
105 | }
106 |
107 | BUFFER_PARAMS_PER_ALGO = {
108 | 'CMBPO': {
109 | 'type': 'CPOBuffer',
110 | 'preprocess_type': 'default',
111 | 'kwargs': {
112 | 'size': tune.sample_from(lambda spec: (
113 | spec.get('config', spec)
114 | ['algorithm_params']['kwargs']['epoch_length']
115 | )),
116 | 'archive_size': tune.sample_from(lambda spec: (
117 | {
118 | 'SimpleReplayPool': int(1e6),
119 | 'CPOBuffer':int(3e5),
120 | }.get(
121 | spec.get('config', spec)
122 | ['buffer_params']['type'],
123 | int(1e6))
124 | )),
125 | }
126 | },
127 | }
128 |
129 | SAMPLER_PARAMS_PER_ALGO = {
130 | 'default': {
131 | 'type':'CPOSampler',
132 | 'kwargs':{
133 | 'max_path_length': DEFAULT_MAX_PATH_LENGTH,
134 | 'render_mode': None,
135 | },
136 | },
137 | 'CMBPO': {
138 | 'type':'CPOSampler',
139 | 'kwargs':{
140 | 'max_path_length': DEFAULT_MAX_PATH_LENGTH,
141 | 'render_mode': None,
142 | },
143 | }
144 | }
145 |
146 | RUN_PARAMS = {
147 | 'seed': tune.sample_from(
148 | lambda spec: np.random.randint(0, 10000)),
149 | 'checkpoint_at_end': True,
150 | 'checkpoint_frequency': 50,
151 | 'checkpoint_buffer': False,
152 | }
153 |
154 | ENV_PARAMS = {
155 | 'normalize_actions':False,
156 | 'kwargs':{}
157 | }
158 |
159 | def get_variant_spec(args, params):
160 | assert hasattr(params, 'universe') and \
161 | hasattr(params, 'task') and \
162 | hasattr(params, 'algorithm') and \
163 | hasattr(params, 'policy')
164 |
165 | universe, task = params.universe, params.task
166 | ENV_PARAMS.update({
167 | 'universe': universe,
168 | 'task': task,
169 | })
170 |
171 | algorithm, policy = params.algorithm_params.type, params.policy_params.type
172 | base_spec = {
173 | 'log_dir': f'~/ray_{algorithm.lower()}',
174 | 'exp_name': 'defaults',
175 | 'environment_params': ENV_PARAMS,
176 | 'policy_params': POLICY_PARAMS_BASE[policy],
177 | 'algorithm_params': ALGORITHM_PARAMS[algorithm],
178 | 'buffer_params': BUFFER_PARAMS_PER_ALGO[algorithm],
179 | 'sampler_params': SAMPLER_PARAMS_PER_ALGO[algorithm],
180 | 'run_params': RUN_PARAMS,
181 | }
182 |
183 | variant_spec = deep_update(
184 | base_spec,
185 | params
186 | )
187 | return variant_spec
188 |
--------------------------------------------------------------------------------
/configs/baseconfig/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import glob
4 | import pickle
5 | import sys
6 | import pdb
7 |
8 | import tensorflow as tf
9 | from ray import tune
10 |
11 | from envs.utils import get_env_from_params
12 | from algorithms.utils import get_algorithm_from_params
13 | from policies.utils import get_policy_from_params
14 | from buffers.utils import get_buffer_from_params
15 | from samplers.utils import get_sampler_from_params
16 |
17 | from utilities.utils import set_seed, initialize_tf_variables
18 | from utilities.instrument import run_example_local, run_example_debug
19 |
20 | class ExperimentRunner(tune.Trainable):
21 | def _setup(self, variant):
22 | set_seed(variant['run_params']['seed'])
23 |
24 | self._variant = variant
25 | gpu_options = tf.GPUOptions(allow_growth=True)
26 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
27 | tf.keras.backend.set_session(session)
28 | self._session = tf.keras.backend.get_session()
29 |
30 | self.train_generator = None
31 | self._built = False
32 |
33 | def _stop(self):
34 | tf.reset_default_graph()
35 | tf.keras.backend.clear_session()
36 |
37 | def _build(self):
38 | """
39 | called by tune to build algorithm
40 | """
41 | variant = copy.deepcopy(self._variant)
42 |
43 | env_params = variant['environment_params']
44 | env = self.env = (
45 | get_env_from_params(env_params))
46 |
47 | buffer = self.buffer = (
48 | get_buffer_from_params(variant, env))
49 | sampler = self.sampler = get_sampler_from_params(variant)
50 | policy = self.policy = get_policy_from_params(
51 | variant, env, self._session)
52 |
53 | #### build algorithm
54 | self.algorithm = get_algorithm_from_params(
55 | variant=self._variant,
56 | env=env,
57 | policy=policy,
58 | buffer=buffer,
59 | sampler=sampler,
60 | session=self._session)
61 |
62 | initialize_tf_variables(self._session, only_uninitialized=True)
63 |
64 | # add graph since ray doesn't seem to automatically add that
65 | graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph)
66 | graph_writer.flush()
67 | graph_writer.close()
68 |
69 | #### finalize graph
70 | tf.get_default_graph().finalize()
71 | self._built = True
72 |
73 |
74 | def _train(self):
75 | if not self._built:
76 | self._build()
77 |
78 | if self.train_generator is None:
79 | self.train_generator = self.algorithm.train()
80 |
81 | diagnostics = next(self.train_generator)
82 |
83 | return diagnostics
84 |
85 | def _pickle_path(self, checkpoint_dir):
86 | return os.path.join(checkpoint_dir, 'checkpoint.pkl')
87 |
88 | def _replay_pool_pickle_path(self, checkpoint_dir):
89 | return os.path.join(checkpoint_dir, 'replay_pool.pkl')
90 |
91 | def _tf_checkpoint_prefix(self, checkpoint_dir):
92 | return os.path.join(checkpoint_dir, 'checkpoint')
93 |
94 | def _get_tf_checkpoint(self):
95 | tf_checkpoint = tf.train.Checkpoint(**self.algorithm.tf_saveables)
96 |
97 | return tf_checkpoint
98 |
99 | def _save_replay_pool(self, checkpoint_dir):
100 | replay_pool_pickle_path = self._replay_pool_pickle_path(
101 | checkpoint_dir)
102 | self.buffer.save_latest_experience(replay_pool_pickle_path)
103 |
104 | def _restore_replay_pool(self, current_checkpoint_dir):
105 | experiment_root = os.path.dirname(current_checkpoint_dir)
106 |
107 | experience_paths = [
108 | self._replay_pool_pickle_path(checkpoint_dir)
109 | for checkpoint_dir in sorted(glob.iglob(
110 | os.path.join(experiment_root, 'checkpoint_*')))
111 | ]
112 | for experience_path in experience_paths:
113 | self.buffer.load_experience(experience_path)
114 |
115 | def _save(self, checkpoint_dir):
116 | """Implements the saving logic.
117 |         @anyboby: the saving methods implemented here are very CMBPO-specific, which is not optimal,
118 |             but a general interface seems hard to implement due to all the different
119 |             frameworks involved (Keras, tf, pickling etc.)
120 | """
121 |
122 | ## only saves model atm
123 | self.policy_path = self.policy.save(checkpoint_dir) ### @anyboby: this saves all tf objects
124 | self.algorithm.save(checkpoint_dir)
125 |
126 | if self._variant['run_params'].get('checkpoint_replay_pool', False):
127 | self._save_replay_pool(checkpoint_dir)
128 |
129 | return os.path.join(checkpoint_dir, '')
130 |
131 | def _restore(self, checkpoint_dir):
132 | raise NotImplementedError
133 |
134 | def main(argv=None):
135 | """Run ExperimentRunner locally on ray.
136 | """
137 | run_example_local(__package__, argv)
138 |
139 | if __name__ == '__main__':
140 | main(argv=sys.argv[1:])
--------------------------------------------------------------------------------
/configs/baseconfig/utils.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import argparse
3 | from distutils.util import strtobool
4 | import json
5 |
6 | from ray.tune import sample_from
7 |
8 | def add_ray_init_args(parser):
9 |
10 | def init_help_string(help_string):
11 | return help_string + " Passed to `ray.init`."
12 |
13 | parser.add_argument(
14 | '--cpus',
15 | type=int,
16 | default=None,
17 | help=init_help_string("Cpus to allocate to ray process."))
18 | parser.add_argument(
19 | '--gpus',
20 | type=int,
21 | default=None,
22 | help=init_help_string("Gpus to allocate to ray process."))
23 | parser.add_argument(
24 | '--resources',
25 | type=json.loads,
26 | default=None,
27 | help=init_help_string("Resources to allocate to ray process."))
28 | parser.add_argument(
29 | '--include-webui',
30 | type=str,
31 | default=False,
32 |         help=init_help_string("Boolean flag indicating whether to start the"
33 |                               " web UI, which is a Jupyter notebook."))
34 | parser.add_argument(
35 | '--temp-dir',
36 | type=str,
37 | default=None,
38 | help=init_help_string("If provided, it will specify the root temporary"
39 | " directory for the Ray process."))
40 |
41 | return parser
42 |
43 |
44 | def add_ray_tune_args(parser):
45 |
46 | def tune_help_string(help_string):
47 | return help_string + " Passed to `tune.run_experiments`."
48 |
49 | parser.add_argument(
50 | '--resources-per-trial',
51 | type=json.loads,
52 | default={},
53 | help=tune_help_string("Resources to allocate for each trial."))
54 | parser.add_argument(
55 | '--trial-gpus',
56 | type=float,
57 | default=None,
58 | help=("Resources to allocate for each trial. Passed"
59 | " to `tune.run_experiments`."))
60 | parser.add_argument(
61 | '--trial-extra-cpus',
62 | type=int,
63 | default=None,
64 | help=("Extra CPUs to reserve in case the trials need to"
65 | " launch additional Ray actors that use CPUs."))
66 | parser.add_argument(
67 | '--trial-extra-gpus',
68 | type=float,
69 | default=None,
70 | help=("Extra GPUs to reserve in case the trials need to"
71 | " launch additional Ray actors that use GPUs."))
72 | parser.add_argument(
73 | '--num-samples',
74 | default=1,
75 | type=int,
76 | help=tune_help_string("Number of times to repeat each trial."))
77 | parser.add_argument(
78 | '--upload-dir',
79 | type=str,
80 | default='',
81 | help=tune_help_string("Optional URI to sync training results to (e.g."
82 | " s3:// or gs://)."))
83 | parser.add_argument(
84 | '--trial-name-template',
85 | type=str,
86 | default='seed:{trial.config[run_params][seed]}',
87 | help=tune_help_string(
88 | "Optional string template for trial name. For example:"
89 | " '{trial.trial_id}-seed={trial.config[run_params][seed]}'"))
90 | parser.add_argument(
91 | '--trial-cpus',
92 | type=int,
93 | default=multiprocessing.cpu_count(),
94 | help=tune_help_string("Resources to allocate for each trial."))
95 | parser.add_argument(
96 | '--checkpoint-frequency',
97 | type=int,
98 | default=None,
99 | help=tune_help_string(
100 | "How many training iterations between checkpoints."
101 | " A value of 0 (default) disables checkpointing. If set,"
102 | " takes precedence over variant['run_params']"
103 | "['checkpoint_frequency']."))
104 | parser.add_argument(
105 | '--checkpoint-at-end',
106 | type=lambda x: bool(strtobool(x)),
107 | default=None,
108 | help=tune_help_string(
109 | "Whether to checkpoint at the end of the experiment. If set,"
110 | " takes precedence over variant['run_params']"
111 | "['checkpoint_at_end']."))
112 | parser.add_argument(
113 | '--max-failures',
114 | default=3,
115 | type=int,
116 | help=tune_help_string(
117 | "Try to recover a trial from its last checkpoint at least this "
118 | "many times. Only applies if checkpointing is enabled."))
119 | parser.add_argument(
120 | '--restore',
121 | type=str,
122 | default=None,
123 | help=tune_help_string(
124 | "Path to checkpoint. Only makes sense to set if running 1 trial."
125 | " Defaults to None."))
126 | parser.add_argument(
127 | '--with-server',
128 | type=str,
129 | default=False,
130 | help=tune_help_string("Starts a background Tune server. Needed for"
131 | " using the Client API."))
132 |
133 | return parser
134 |
135 |
136 | def get_parser():
137 | parser = argparse.ArgumentParser()
138 |
139 | parser.add_argument(
140 | '--config',
141 | type=str)
142 |
143 | parser.add_argument(
144 | '--checkpoint-replay-pool',
145 | type=lambda x: bool(strtobool(x)),
146 | default=None,
147 |         help=("Whether a checkpoint should also save the replay"
148 |               " pool. If set, takes precedence over"
149 |               " variant['run_params']['checkpoint_replay_pool']."
150 |               " Note that the replay pool is saved (and"
151 |               " constructed) piece by piece so that each"
152 |               " experience is saved only once."))
153 |
154 | parser.add_argument(
155 | '--policy',
156 | type=str,
157 |         choices=('cpopolicy',),
158 | default='cpopolicy')
159 |
160 | parser.add_argument(
161 | '--mode', type=str, default='local')
162 | parser.add_argument(
163 | '--confirm-remote',
164 | type=lambda x: bool(strtobool(x)),
165 | nargs='?',
166 | const=True,
167 | default=True,
168 | help="Whether or not to query yes/no on remote run.")
169 |
170 | parser.add_argument(
171 | '--video-save-frequency',
172 | type=int,
173 | default=None,
174 | help="Save frequency for videos.")
175 |
176 | parser = add_ray_init_args(parser)
177 | parser = add_ray_tune_args(parser)
178 |
179 | return parser
180 |
181 | def variant_equals(*keys):
182 | def get_from_spec(spec):
183 | # TODO(hartikainen): This may break in some cases. ray.tune seems to
184 | # add a 'config' key at the top of the spec, whereas `generate_variants`
185 | # does not.
186 | node = spec.get('config', spec)
187 | for key in keys:
188 | node = node[key]
189 |
190 | return node
191 |
192 | return sample_from(get_from_spec)
193 |
--------------------------------------------------------------------------------
/configs/cmbpo_antsafe.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | params = {
4 | 'universe': 'gym',
5 | 'task': 'AntSafe-v2',
6 | 'environment_params': {
7 | 'normalize_actions': True,
8 | },
9 | 'algorithm_params': {
10 | 'type': 'CMBPO',
11 | 'kwargs':{
12 | 'n_env_interacts': int(10e6),
13 | 'epoch_length': 50000,
14 | 'eval_every_n_steps': 5e3,
15 | 'n_initial_exploration_steps': int(10000),
16 |             #### it is crucial to choose a model that doesn't overfit when trained repeatedly on seen data
17 |             ## for finding a model architecture: 1. play around with the initial samples to find an architecture that doesn't overfit
18 |             #                                    2. m_train_freq can somewhat limit overfitting, but it only treats the symptom
19 |             #                                    3. try to balance the number of new samples against the number of
20 |             #                                       model-network updates (controlled via m_train_freq)
21 | 'use_model': True,
22 | 'm_hidden_dims':(512,512), # hidden layer size of model bnn
23 | 'm_loss_type': 'MSPE',
24 | 'm_use_scaler_in': True,
25 | 'm_use_scaler_out': True,
26 | 'm_lr': 1e-3,
27 |             'm_train_freq': 4000,                   # model is trained every m_train_freq steps (when self._timestep % self._model_train_freq == 0); training stops early once it no longer improves
28 |             'rollout_batch_size': 1.0e3,            # number of randomly chosen start states per batch of model rollouts
29 | 'm_networks': 7, # size of model network ensemble
30 | 'm_elites': 5, # best networks to select from num_networks
31 | 'max_model_t': None, # a timeout for model training (e.g. for speeding up wallclock time)
32 | 'sampling_alpha': 2,
33 | 'rollout_mode' : 'uncertainty', #### choose from 'schedule', or 'uncertainty'
34 | 'rollout_schedule': [10, 500, 5, 30], #[15, 100, 1, 15], # min_epoch, max_epoch, min_length, max_length = self._rollout_schedule
35 | # increases rollout length from min_length to max_length over
36 | # range of (min_epoch, max_epoch)
37 | ### Only applies if rollout_mode=='schedule'
38 | 'maxroll': 35, # maximum rollout horizon
39 |             'batch_size_policy': 50000,             # batch size per policy update
40 | 'initial_real_samples_per_epoch': 20000, # number of real samples contained in first batch
41 | 'min_real_samples_per_epoch': 500, # absolute minimum of samples
42 | }
43 | },
44 | 'policy_params':{
45 | 'type':'cpopolicy',
46 | 'kwargs':{
47 | 'constrain_cost': True, # constrain_cost=False will perform TRPO updates
48 | 'a_hidden_layer_sizes': (128, 128), # policy network hidden layers
49 | 'vf_lr': 3e-4, # learn rate for value learning
50 | 'vf_hidden_layer_sizes':(128,128), # nn hidden layers for vf
51 | 'vf_epochs': 8, # number of training epochs for values
52 | 'vf_batch_size': 2048, # minibatches for value training
53 | 'vf_ensemble_size': 3, # vf ensemble size
54 | 'vf_elites': 2, # vf elites
55 | 'vf_activation': 'swish', # activation function
56 | 'vf_loss': 'MSE', # choose from 'NLL', 'MSPE' (inc. var); 'MSE' ; 'Huber'
57 | 'vf_decay': 1e-6, # decay for nn regularization
58 | 'vf_clipping': False, # clip losses for a trust-region like vf update
59 |             'vf_kl_cliprange': 0.0,         # only applicable if vf_clipping=True
60 | 'ent_reg': 0, # 5e-3 # exploration bonus for maintaining pol. entropy
61 | 'target_kl': 0.01, # trust region diameter
62 | 'cost_lim': 10,
63 | 'cost_lam': .5, # gae lambda
64 | 'cost_gamma': 0.97, # discounts
65 | 'lam': .95, # gae lambda
66 | 'gamma': 0.99, # discounts
67 | }
68 | },
69 | 'buffer_params': {},
70 | 'sampler_params': {
71 | 'kwargs':{
72 | 'render_mode':None, #'human'
73 | }
74 | },
75 | 'run_params': {},
76 | }
--------------------------------------------------------------------------------
/configs/cmbpo_hcs.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | params = {
4 | 'universe': 'gym',
5 | 'task': 'HalfCheetahSafe-v2',
6 | 'environment_params': {
7 | 'normalize_actions': True,
8 | },
9 | 'algorithm_params': {
10 | 'type': 'CMBPO',
11 | 'kwargs':{
12 | 'n_env_interacts': int(10e6),
13 | 'epoch_length': 50000,
14 | 'eval_every_n_steps': 5e3,
15 | 'n_initial_exploration_steps': int(10000),
16 | 'use_model': True,
17 | 'm_hidden_dims':(512,512),
18 | 'm_loss_type': 'MSPE',
19 | 'm_use_scaler_in': True,
20 | 'm_use_scaler_out': True,
21 | 'm_lr': 1e-3,
22 | 'm_train_freq': 4000,
23 | 'rollout_batch_size': 1.0e3,
24 | 'm_networks': 7,
25 | 'm_elites': 5,
26 | 'max_model_t': None,
27 | 'sampling_alpha': 2,
28 | 'rollout_mode' : 'uncertainty',
29 | 'rollout_schedule': [10, 500, 5, 30],
30 | 'maxroll': 35,
31 | 'batch_size_policy': 50000,
32 | 'initial_real_samples_per_epoch': 15000,
33 | 'min_real_samples_per_epoch': 500,
34 | }
35 | },
36 | 'policy_params':{
37 | 'type':'cpopolicy',
38 | 'kwargs':{
39 | 'constrain_cost': False,
40 | 'a_hidden_layer_sizes': (128, 128),
41 | 'vf_lr': 3e-4,
42 | 'vf_hidden_layer_sizes':(128,128),
43 | 'vf_epochs': 8,
44 | 'vf_batch_size': 2048,
45 | 'vf_ensemble_size': 3,
46 | 'vf_elites': 2,
47 | 'vf_activation': 'swish',
48 | 'vf_loss': 'MSE',
49 | 'vf_decay': 1e-6,
50 | 'vf_clipping': False,
51 | 'vf_kl_cliprange': 0.0,
52 | 'ent_reg': 0, # 5e-3
53 | 'target_kl': 0.01,
54 | 'lam': .95,
55 | 'gamma': 0.99,
56 | }
57 | },
58 | 'buffer_params': {},
59 | 'sampler_params': {
60 | 'kwargs':{
61 | 'render_mode':'human',
62 | }
63 | },
64 | 'run_params': {},
65 | }
--------------------------------------------------------------------------------
/configs/cmbpo_hs.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | params = {
4 | 'universe': 'gym',
5 | 'task': 'HumanoidSafe-v2',
6 | 'environment_params': {
7 | 'normalize_actions': True,
8 | },
9 | 'algorithm_params': {
10 | 'type': 'CMBPO',
11 | 'kwargs':{
12 | 'n_env_interacts': int(10e6),
13 | 'epoch_length': 50000,
14 | 'eval_every_n_steps': 5e3,
15 | 'n_initial_exploration_steps': int(10000),
16 | 'use_model': True,
17 | 'm_hidden_dims':(512,512),
18 | 'm_loss_type': 'MSPE',
19 | 'm_use_scaler_in': True,
20 | 'm_use_scaler_out': True,
21 | 'm_lr': 1e-3,
22 | 'm_train_freq': 4000,
23 | 'rollout_batch_size': 1.0e3,
24 | 'm_networks': 7,
25 | 'm_elites': 5,
26 | 'max_model_t': None,
27 | 'sampling_alpha': 2,
28 | 'rollout_mode' : 'uncertainty',
29 | 'rollout_schedule': [10, 500, 5, 30],
30 | 'maxroll': 35,
31 | 'batch_size_policy': 50000,
32 | 'initial_real_samples_per_epoch': 15000,
33 | 'min_real_samples_per_epoch': 500,
34 | }
35 | },
36 | 'policy_params':{
37 | 'type':'cpopolicy',
38 | 'kwargs':{
39 | 'constrain_cost': False,
40 | 'a_hidden_layer_sizes': (128, 128),
41 | 'vf_lr': 3e-4,
42 | 'vf_hidden_layer_sizes':(128,128),
43 | 'vf_epochs': 8,
44 | 'vf_batch_size': 2048,
45 | 'vf_ensemble_size': 3,
46 | 'vf_elites': 2,
47 | 'vf_activation': 'swish',
48 | 'vf_loss': 'MSE',
49 | 'vf_decay': 1e-6,
50 | 'vf_clipping': False,
51 | 'vf_kl_cliprange': 0.0,
52 | 'ent_reg': 0, # 5e-3
53 | 'target_kl': 0.01,
54 | 'lam': .95,
55 | 'gamma': 0.99,
56 | }
57 | },
58 | 'buffer_params': {},
59 | 'sampler_params': {
60 | 'kwargs':{
61 | 'render_mode':'human',
62 | }
63 | },
64 | 'run_params': {},
65 | }
--------------------------------------------------------------------------------
/configs/cpo_hcs.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | params = {
4 | 'universe': 'gym',
5 | 'task': 'HalfCheetahSafe-v2',
6 | 'environment_params': {
7 | 'normalize_actions': True,
8 | },
9 | 'algorithm_params': {
10 | 'type': 'CMBPO',
11 | 'kwargs':{
12 | 'n_env_interacts': int(10e6),
13 | 'epoch_length': 50000,
14 | 'eval_every_n_steps': 5e3,
15 | 'n_initial_exploration_steps': int(0),
16 | 'use_model': False,
17 | 'batch_size_policy': 35000,
18 | }
19 | },
20 | 'policy_params':{
21 | 'type':'cpopolicy',
22 | 'kwargs':{
23 | 'constrain_cost': True,
24 | 'a_hidden_layer_sizes': (128, 128),
25 | 'vf_lr': 3e-4,
26 | 'vf_hidden_layer_sizes':(128,128),
27 | 'vf_epochs': 8,
28 | 'vf_batch_size': 2048,
29 | 'vf_ensemble_size': 3,
30 | 'vf_elites': 2,
31 | 'vf_activation': 'swish',
32 | 'vf_loss': 'MSE',
33 | 'vf_decay': 1e-6,
34 | 'vf_clipping': False,
35 | 'vf_kl_cliprange': 0.0,
36 | 'ent_reg': 0, # 5e-3
37 | 'target_kl': 0.01,
38 | 'lam': .95,
39 | 'gamma': 0.99,
40 | }
41 | },
42 | 'buffer_params': {},
43 | 'sampler_params': {
44 | 'kwargs':{
45 | 'render_mode':None,
46 | }
47 | },
48 | 'run_params': {},
49 | }
--------------------------------------------------------------------------------
/configs/trpo_hcs.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | params = {
4 | 'universe': 'gym',
5 | 'task': 'HalfCheetahSafe-v2',
6 | 'environment_params': {
7 | 'normalize_actions': True,
8 | },
9 | 'algorithm_params': {
10 | 'type': 'CMBPO',
11 | 'kwargs':{
12 | 'n_env_interacts': int(10e6),
13 | 'epoch_length': 50000,
14 | 'eval_render_mode': 'human',
15 | 'eval_n_episodes': 1,
16 | 'eval_every_n_steps': 5e3,
17 | 'eval_deterministic': False,
18 | 'n_initial_exploration_steps': int(0),
19 | 'use_model': False,
20 | 'batch_size_policy': 25000,
21 | }
22 | },
23 | 'policy_params':{
24 | 'type':'cpopolicy',
25 | 'kwargs':{
26 | 'constrain_cost': False,
27 | 'a_hidden_layer_sizes': (128, 128),
28 | 'vf_lr': 3e-4,
29 | 'vf_hidden_layer_sizes':(128,128),
30 | 'vf_epochs': 8,
31 | 'vf_batch_size': 2048,
32 | 'vf_ensemble_size': 3,
33 | 'vf_elites': 2,
34 | 'vf_activation': 'swish',
35 | 'vf_loss': 'MSE',
36 | 'vf_decay': 1e-6,
37 | 'vf_clipping': False,
38 | 'vf_kl_cliprange': 0.0,
39 | 'ent_reg': 0,
40 | 'target_kl': 0.01,
41 | 'cost_lim': 10,
42 | 'cost_lam': .5,
43 | 'cost_gamma': 0.97,
44 | 'lam': .95,
45 | 'gamma': 0.99,
46 | }
47 | },
48 | 'buffer_params': {},
49 | 'sampler_params': {
50 | 'kwargs':{
51 | 'render_mode':None,
52 | }
53 | },
54 | 'run_params': {},
55 | }
--------------------------------------------------------------------------------
/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/__init__.py
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 | import gym
3 |
4 | import os
5 | import sys
6 | dirpath = os.path.dirname(os.path.dirname(__file__))
7 | sys.path.append(dirpath)
8 |
9 | env_specs = gym.envs.registry.env_specs
10 |
11 | if 'HumanoidSafe-v2' not in env_specs:
12 | register(
13 | id='HumanoidSafe-v2',
14 | entry_point='mujoco_safety_gym.envs:HumanoidEnv',
15 | max_episode_steps=1000,
16 | )
17 | if 'AntSafe-v2' not in env_specs:
18 | register(
19 | id='AntSafe-v2',
20 | entry_point='mujoco_safety_gym.envs:AntEnv',
21 | max_episode_steps=1000,
22 | )
23 | if 'AntSafeVisualize-v2' not in env_specs:
24 | register(
25 | id='AntSafeVisualize-v2',
26 | entry_point='mujoco_safety_gym.envs:AntEnvVisualize',
27 | max_episode_steps=1000,
28 | )
29 | if 'HopperSafe-v2' not in env_specs:
30 | register(
31 | id='HopperSafe-v2',
32 | entry_point='mujoco_safety_gym.envs:HopperEnv',
33 | max_episode_steps=1000,
34 | )
35 | if 'HalfCheetahSafe-v2' not in env_specs:
36 | register(
37 | id='HalfCheetahSafe-v2',
38 | entry_point='mujoco_safety_gym.envs:HalfCheetahEnv',
39 | max_episode_steps=1000,
40 | )
41 | if 'FetchPushSafety-v0' not in env_specs:
42 | register(
43 | id='FetchPushSafety-v0',
44 | entry_point='mujoco_safety_gym.envs:FetchPushEnv',
45 | max_episode_steps=1000,
46 | )
47 | if 'FetchReachSafety-v0' not in env_specs:
48 | register(
49 | id='FetchReachSafety-v0',
50 | entry_point='mujoco_safety_gym.envs:FetchReachEnv',
51 | max_episode_steps=1000,
52 | )
53 | if 'FetchSlideSafety-v0' not in env_specs:
54 | register(
55 | id='FetchSlideSafety-v0',
56 | entry_point='mujoco_safety_gym.envs:FetchSlideEnv',
57 | max_episode_steps=1000,
58 | )
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from mujoco_safety_gym.envs.humanoid import HumanoidEnv
2 | from mujoco_safety_gym.envs.ant import AntEnv
3 | from mujoco_safety_gym.envs.ant_viz import AntEnvVisualize
4 | from mujoco_safety_gym.envs.half_cheetah import HalfCheetahEnv
5 | from mujoco_safety_gym.envs.hopper import HopperEnv
6 | from mujoco_safety_gym.envs.fetch.push import FetchPushEnv
7 | from mujoco_safety_gym.envs.fetch.reach import FetchReachEnv
8 | from mujoco_safety_gym.envs.fetch.slide import FetchSlideEnv
9 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/ant.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mujoco_safety_gym.envs import mujoco_env
3 | from gym import utils
4 | import mujoco_py as mjp
5 |
6 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 | def __init__(self):
8 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5)
9 | utils.EzPickle.__init__(self)
10 |
11 | def step(self, a):
12 | xposbefore = self.get_body_com("torso")[0]
13 | self.do_simulation(a, self.frame_skip)
14 |         mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data)  #### compute contact forces explicitly; works around a mujoco-py version mismatch with MuJoCo 2.0
15 | xposafter = self.get_body_com("torso")[0]
16 | forward_reward = (xposafter - xposbefore)/self.dt
17 | ctrl_cost = .5 * np.square(a).sum()
18 | contact_cost = 0.5 * 1e-3 * np.sum(
19 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
20 | survive_reward = 1.0
21 |
22 | ### safety stuff
23 | yposafter = self.get_body_com("torso")[1]
24 | ywall = np.array([-5,5])
25 | if xposafter<20:
26 | y_walldist = yposafter - xposafter*np.tan(30/360*2*np.pi)+ywall
27 | elif xposafter>20 and xposafter<60:
28 | y_walldist = yposafter + (xposafter-40)*np.tan(30/360*2*np.pi) - ywall
29 | elif xposafter>60 and xposafter<100:
30 | y_walldist = yposafter - (xposafter-80)*np.tan(30/360*2*np.pi) + ywall
31 | else:
32 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall
33 |
34 | obj_cost = (abs(y_walldist)<1.8).any()*1.0
35 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward
36 |
37 | body_quat = self.data.get_body_xquat('torso')
38 | z_rot = 1-2*(body_quat[1]**2+body_quat[2]**2) ### normally xx-rotation, not sure what axes mujoco uses
39 | state = self.state_vector()
40 | notdone = np.isfinite(state).all() \
41 | and state[2] >= 0.2 and state[2] <= 1.0 \
42 | and z_rot >= -0.7
43 | done = not notdone
44 | done_cost = done*1.0
45 | cost = np.clip(obj_cost+done_cost, 0, 1)
46 | ob = self._get_obs()
47 | return ob, reward, done, dict(
48 | reward_forward=forward_reward,
49 | reward_ctrl=-ctrl_cost,
50 | reward_contact=-contact_cost,
51 | reward_survive=survive_reward,
52 | cost_obj = obj_cost,
53 | cost_done = done_cost,
54 | cost = cost,
55 | )
56 |
57 | def _get_obs(self):
58 | x = self.sim.data.qpos.flat[0]
59 | y = self.sim.data.qpos.flat[1]
60 | if x<20:
61 | y_off = y - x*np.tan(30/360*2*np.pi)
62 | elif x>20 and x<60:
63 | y_off = y + (x-40)*np.tan(30/360*2*np.pi)
64 | elif x>60 and x<100:
65 | y_off = y - (x-80)*np.tan(30/360*2*np.pi)
66 | else:
67 | y_off = y - 20*np.tan(30/360*2*np.pi)
68 |
69 | return np.concatenate([
70 | self.sim.data.qpos.flat[2:-42],
71 | self.sim.data.qvel.flat[:-36],
72 | [x/5],
73 | [y_off],
74 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat,
75 | ])
76 |
77 | def reset_model(self):
78 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
79 | qpos[-42:] = self.init_qpos[-42:]
80 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
81 | qvel[-36:] = self.init_qvel[-36:]
82 | self.set_state(qpos, qvel)
83 | return self._get_obs()
84 |
85 |
86 | def viewer_setup(self):
87 | self.viewer.cam.distance = self.model.stat.extent * 0.5
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/ant_viz.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mujoco_safety_gym.envs import mujoco_env
3 | from gym import utils
4 | import mujoco_py as mjp
5 |
6 | class AntEnvVisualize(mujoco_env.MujocoEnv, utils.EzPickle):
7 | def __init__(self):
8 | mujoco_env.MujocoEnv.__init__(self, 'ant_viz.xml', 5)
9 | utils.EzPickle.__init__(self)
10 |
11 | def step(self, a):
12 | xposbefore = self.get_body_com("torso")[0]
13 | self.do_simulation(a, self.frame_skip)
14 |         mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data)  #### compute contact forces explicitly; works around a mujoco-py version mismatch with MuJoCo 2.0
15 | xposafter = self.get_body_com("torso")[0]
16 | forward_reward = (xposafter - xposbefore)/self.dt
17 | ctrl_cost = .5 * np.square(a).sum()
18 | contact_cost = 0.5 * 1e-3 * np.sum(
19 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
20 | survive_reward = 1.0
21 |
22 | ### safety stuff
23 | yposafter = self.get_body_com("torso")[1]
24 | ywall = np.array([-5,5])
25 | if xposafter<20:
26 | y_walldist = yposafter - xposafter*np.tan(30/360*2*np.pi)+ywall
27 | elif xposafter>20 and xposafter<60:
28 | y_walldist = yposafter + (xposafter-40)*np.tan(30/360*2*np.pi) - ywall
29 | elif xposafter>60 and xposafter<100:
30 | y_walldist = yposafter - (xposafter-80)*np.tan(30/360*2*np.pi) + ywall
31 | else:
32 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall
33 |
34 | obj_cost = (abs(y_walldist)<1.8).any()*1.0
35 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward
36 |
37 | body_quat = self.data.get_body_xquat('torso')
38 | z_rot = 1-2*(body_quat[1]**2+body_quat[2]**2) ### normally xx-rotation, not sure what axes mujoco uses
39 | state = self.state_vector()
40 | notdone = np.isfinite(state).all() \
41 | and state[2] >= 0.2 and state[2] <= 1.0 \
42 | and z_rot >= -0.7
43 | done = not notdone
44 | done_cost = done*1.0
45 | cost = np.clip(obj_cost+done_cost, 0, 1)
46 | ob = self._get_obs()
47 | return ob, reward, done, dict(
48 | reward_forward=forward_reward,
49 | reward_ctrl=-ctrl_cost,
50 | reward_contact=-contact_cost,
51 | reward_survive=survive_reward,
52 |             cost_obj=obj_cost,
53 |             cost_done=done_cost,
54 |             cost=cost,
55 | )
56 |
57 | def _get_obs(self):
58 | x = self.sim.data.qpos.flat[0]
59 | y = self.sim.data.qpos.flat[1]
60 |
61 | x2 = self.sim.data.qpos.flat[15]
62 | y2 = self.sim.data.qpos.flat[16]
63 |
64 | x3 = self.sim.data.qpos.flat[30]
65 | y3 = self.sim.data.qpos.flat[31]
66 |
67 | if x<20:
68 | y_off = y - x*np.tan(30/360*2*np.pi)
69 | elif x>20 and x<60:
70 | y_off = y + (x-40)*np.tan(30/360*2*np.pi)
71 | elif x>60 and x<100:
72 | y_off = y - (x-80)*np.tan(30/360*2*np.pi)
73 | else:
74 | y_off = y - 20*np.tan(30/360*2*np.pi)
75 |
76 | qpos1 = self.sim.data.qpos.flat[2:15]
77 | qvel1 = self.sim.data.qvel.flat[:14]
78 |
79 |         if x2<20:
80 |             y_off2 = y2 - x2*np.tan(30/360*2*np.pi)
81 |         elif x2>20 and x2<60:
82 |             y_off2 = y2 + (x2-40)*np.tan(30/360*2*np.pi)
83 |         elif x2>60 and x2<100:
84 |             y_off2 = y2 - (x2-80)*np.tan(30/360*2*np.pi)
85 |         else:
86 |             y_off2 = y2 - 20*np.tan(30/360*2*np.pi)
87 | 
88 |         qpos2 = self.sim.data.qpos.flat[17:30]
89 |         qvel2 = self.sim.data.qvel.flat[14:28]
90 | 
91 |         if x3<20:
92 |             y_off3 = y3 - x3*np.tan(30/360*2*np.pi)
93 |         elif x3>20 and x3<60:
94 |             y_off3 = y3 + (x3-40)*np.tan(30/360*2*np.pi)
95 |         elif x3>60 and x3<100:
96 |             y_off3 = y3 - (x3-80)*np.tan(30/360*2*np.pi)
97 |         else:
98 |             y_off3 = y3 - 20*np.tan(30/360*2*np.pi)
99 |
100 | qpos3 = self.sim.data.qpos.flat[32:45]
101 | qvel3 = self.sim.data.qvel.flat[28:42]
102 |
103 | return np.concatenate([
104 | qpos1,
105 | qvel1,
106 | [x/5],
107 | [y_off],
108 | qpos2,
109 | qvel2,
110 | [x2/5],
111 | [y_off2],
112 | qpos3,
113 | qvel3,
114 | [x3/5],
115 | [y_off3],
116 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat,
117 | ])
118 |
119 | def reset_model(self):
120 | # qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
121 | # qpos[-42:] = self.init_qpos[-42:]
122 | # qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
123 | # qvel[-36:] = self.init_qvel[-36:]
124 | qpos = self.init_qpos
125 | qvel = self.init_qvel
126 | self.set_state(qpos, qvel)
127 | return self._get_obs()
128 |
129 |
130 | def viewer_setup(self):
131 | self.viewer.cam.distance = self.model.stat.extent * 0.5
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/ant.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/pick_and_place.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/push.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/reach.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/robot.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/shared.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/fetch/slide.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/half_cheetah.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/hopper.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/textures/block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/mujoco_safety_gym/envs/assets/textures/block.png
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/assets/textures/block_hidden.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/envs/mujoco_safety_gym/envs/assets/textures/block_hidden.png
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/fetch/pick_and_place.py:
--------------------------------------------------------------------------------
1 | import os
2 | from gym import utils
3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew
4 |
5 |
6 | # Ensure we get the path separator correct on windows
7 | MODEL_XML_PATH = os.path.join('fetch', 'pick_and_place.xml')
8 |
9 |
10 | class FetchPickAndPlaceEnv(FetchEnvNew, utils.EzPickle):
11 | def __init__(self, reward_type='sparse'):
12 | initial_qpos = {
13 | 'robot0:slide0': 0.405,
14 | 'robot0:slide1': 0.48,
15 | 'robot0:slide2': 0.0,
16 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.],
17 | }
18 | FetchEnvNew.__init__(
19 | self, MODEL_XML_PATH, has_object=True, block_gripper=False, n_substeps=20,
20 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0,
21 | obj_range=0.15, target_range=0.15, distance_threshold=0.05, additional_objects=False,
22 | number_of_objects = 0, initial_qpos=initial_qpos, reward_type=reward_type)
23 | utils.EzPickle.__init__(self)
24 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/fetch/push.py:
--------------------------------------------------------------------------------
1 | import os
2 | from gym import utils
3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew
4 |
5 |
6 | # Ensure we get the path separator correct on windows
7 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml')
8 |
9 |
10 | class FetchPushEnv(FetchEnvNew, utils.EzPickle):
11 | def __init__(self, reward_type='sparse', additional_objects=False, number_of_objects=5):
12 | initial_qpos = {
13 | 'robot0:slide0': 0.405,
14 | 'robot0:slide1': 0.48,
15 | 'robot0:slide2': 0.0,
16 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.]
17 | }
18 | FetchEnvNew.__init__(
19 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20,
20 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0,
21 | obj_range=0.10, target_range=0.15, distance_threshold=0.05, additional_objects=additional_objects,
22 | number_of_objects = number_of_objects, initial_qpos=initial_qpos, reward_type=reward_type)
23 | utils.EzPickle.__init__(self)
24 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/fetch/reach.py:
--------------------------------------------------------------------------------
1 | import os
2 | from gym import utils
3 | from mujoco_safety_gym.envs.fetch_env import FetchEnvNew
4 |
5 |
6 | # Ensure we get the path separator correct on windows
7 | MODEL_XML_PATH = os.path.join('fetch', 'reach.xml')
8 |
9 |
10 | class FetchReachEnv(FetchEnvNew, utils.EzPickle):
11 | def __init__(self, reward_type='sparse', additional_objects=False, number_of_objects=5):
12 | initial_qpos = {
13 | 'robot0:slide0': 0.405,
14 | 'robot0:slide1': 0.48,
15 | 'robot0:slide2': 0.0,
16 | }
17 | FetchEnvNew.__init__(
18 | self, MODEL_XML_PATH, has_object=False, block_gripper=True, n_substeps=20,
19 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0,
20 | obj_range=0.1, target_range=0.2, distance_threshold=0.05, additional_objects=additional_objects,
21 | number_of_objects = number_of_objects, initial_qpos=initial_qpos, reward_type=reward_type)
22 | utils.EzPickle.__init__(self)
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/fetch/slide.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | from gym import utils
5 | from mujoco_safety_gym.envs import fetch_env
6 |
7 |
8 | # Ensure we get the path separator correct on windows
9 | MODEL_XML_PATH = os.path.join('fetch', 'slide.xml')
10 |
11 |
12 | class FetchSlideEnv(fetch_env.FetchEnvNew, utils.EzPickle):
13 | def __init__(self, reward_type='sparse'):
14 | initial_qpos = {
15 | 'robot0:slide0': 0.05,
16 | 'robot0:slide1': 0.48,
17 | 'robot0:slide2': 0.0,
18 | 'object0:joint': [1.7, 1.1, 0.41, 1., 0., 0., 0.],
19 | }
20 | fetch_env.FetchEnvNew.__init__(
21 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20,
22 | gripper_extra_height=-0.02, target_in_the_air=False, target_offset=np.array([0.4, 0.0, 0.0]),
23 | obj_range=0.1, target_range=0.3, distance_threshold=0.05, additional_objects=False,
24 | number_of_objects = 0, initial_qpos=initial_qpos, reward_type=reward_type)
25 | utils.EzPickle.__init__(self)
26 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/half_cheetah.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mujoco_safety_gym.envs import mujoco_env
4 | import mujoco_py as mjp
5 | from gym import error, spaces
6 |
7 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5)
10 | utils.EzPickle.__init__(self)
11 |
12 | def step(self, action):
13 | xposbefore = self.sim.data.qpos[1]
14 |
15 | t = self.data.time
16 | wall_act = .02*np.sin(t/3)**2 - .004
17 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200
18 | action_p_wall = np.concatenate((np.squeeze(action), [wall_act]))
19 |
20 | self.do_simulation(action_p_wall, self.frame_skip)
21 | xposafter = self.sim.data.qpos[1]
22 |
23 | wallpos = self.data.get_geom_xpos("obj_geom")[0]
24 | wallvel = self.data.get_body_xvelp("obj1")[0]
25 | xdist = wallpos-xposafter
26 | obj_cost = int(np.abs(xdist)<2)
27 | if obj_cost>0:
28 | self.model.geom_rgba[9] = [1.0, 0, 0, 1.0]
29 | else:
30 | self.model.geom_rgba[9] = [1.0, 0.5, 0.5, .8]
31 | ob = self._get_obs()
32 | reward_ctrl = - 0.1 * np.square(action).sum()
33 | reward_run = (xposafter - xposbefore)/self.dt
34 | reward = reward_ctrl + reward_run
35 | done = False
36 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl, cost=obj_cost)
37 |
38 | def _get_obs(self):
39 | wallvel = self.data.get_body_xvelp("obj1")[0]
40 | wall_f = .02*np.sin(self.data.time/3)**2 - .004
41 | xdist = (self.data.get_geom_xpos("obj_geom")[0]-self.sim.data.qpos[1])/10
42 |
43 | return np.concatenate([
44 | self.sim.data.qpos.flat[2:],
45 | self.sim.data.qvel.flat[1:],
46 | [wallvel],
47 | [wall_f],
48 | np.clip([xdist], -5, 5),
49 | ])
50 |
51 | def reset_model(self):
52 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
53 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
54 | self.set_state(qpos, qvel)
55 | return self._get_obs()
56 |
57 | def viewer_setup(self):
58 | self.viewer.cam.distance = self.model.stat.extent * 0.5
59 |
60 |
61 | def _set_action_space(self):
62 | bounds = self.model.actuator_ctrlrange.copy().astype(np.float32)
63 | low, high = bounds.T
64 | low, high = low[:-1], high[:-1]
65 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
66 | return self.action_space
67 |
--------------------------------------------------------------------------------
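In this variant the wall is an extra actuator that `step()` drives with a sinusoidal force, while `_set_action_space()` hides it from the agent, so a policy only controls the cheetah's joints. A hedged usage sketch, not part of the repository, assuming mujoco-py is installed and the `mujoco_safety_gym` package is importable (e.g. `envs/` on PYTHONPATH):

from mujoco_safety_gym.envs.half_cheetah import HalfCheetahEnv

env = HalfCheetahEnv()
obs = env.reset()
total_cost = 0.0
for _ in range(100):
    act = env.action_space.sample()      # cheetah joints only; the wall actuator is env-controlled
    obs, rew, done, info = env.step(act)
    total_cost += info['cost']           # 1 whenever the cheetah is within 2 units of the wall
print('costs over 100 steps:', total_cost)
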
/envs/mujoco_safety_gym/envs/hopper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mujoco_safety_gym.envs import mujoco_env
3 | from gym import utils
4 | import mujoco_py as mjp
5 |
6 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 | def __init__(self):
8 | mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4)
9 | utils.EzPickle.__init__(self)
10 | self.last_mocx = 5 #### vel readings are super noisy for mocap weld
11 |
12 | def step(self, a):
13 | posbefore = self.sim.data.qpos[3]
14 | t = self.data.time
15 | pos = (t + np.sin(t)) + 3
16 | self.data.set_mocap_pos('mocap1', [pos, 0, 0.5])
17 |
18 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200
19 | self.do_simulation(a, self.frame_skip)
20 | posafter, height, ang = self.sim.data.qpos[3:6]
21 | alive_bonus = 1.0
22 |
23 | mocapx = self.sim.data.qpos[0]
24 | xdist = mocapx-posafter
25 | cost = int(np.abs(xdist)<1)
26 |
27 | reward = (posafter - posbefore) / self.dt
28 | reward += alive_bonus
29 | reward -= 1e-3 * np.square(a).sum()
30 | s = self.state_vector()
31 | done = not (np.isfinite(s).all() and (np.abs(s[5:]) < 100).all() and
32 | (height > .7) and (abs(ang) < .2))
33 | ob = self._get_obs()
34 | return ob, reward, done, {'cost':cost}
35 |
36 | def _get_obs(self):
37 | x = self.sim.data.qpos[3]
38 | mocapx = self.sim.data.qpos[0]
39 | mocvel = 1 + np.cos(self.data.time)
40 | mocacc = -np.sin(self.data.time)
41 | return np.concatenate([
42 | self.sim.data.qpos.flat[4:],
43 | np.clip(self.sim.data.qvel[3:].flat, -10, 10),
44 | [mocvel],
45 | [mocacc],
46 | [mocapx-x],
47 | ])
48 |
49 | def reset_model(self):
50 | qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq)
51 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
52 | self.set_state(qpos, qvel)
53 | return self._get_obs()
54 |
55 | def last_mocap_x(self):
56 |
57 | return self.last_mocx
58 |
59 | def viewer_setup(self):
60 | self.viewer.cam.trackbodyid = 2
61 | self.viewer.cam.distance = self.model.stat.extent * 0.75
62 | self.viewer.cam.lookat[2] = 1.15
63 | self.viewer.cam.elevation = -20
64 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/humanoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mujoco_safety_gym.envs import mujoco_env
3 | from gym import utils
4 | import mujoco_py as mjp
5 |
6 | def mass_center(model, sim):
7 | mass = np.expand_dims(model.body_mass, 1)
8 | xpos = sim.data.xipos
9 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0]
10 |
11 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
12 | def __init__(self):
13 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5)
14 | utils.EzPickle.__init__(self)
15 |
16 | def _get_obs(self):
17 | data = self.sim.data
18 | x = data.qpos.flat[0]
19 | y = data.qpos.flat[1]
20 | if x<20:
21 | y_off = y - x*np.tan(30/360*2*np.pi)
22 | elif x>20 and x<60:
23 | y_off = y + (x-40)*np.tan(30/360*2*np.pi)
24 | elif x>60 and x<100:
25 | y_off = y - (x-80)*np.tan(30/360*2*np.pi)
26 | else:
27 | y_off = y - 20*np.tan(30/360*2*np.pi)
28 |
29 | return np.concatenate([data.qpos.flat[2:-42],
30 | data.qvel.flat[:-36],
31 | [x/5],
32 | [y_off]])
33 |
34 | # return np.concatenate([data.qpos.flat[2:],
35 | # data.qvel.flat,
36 | # data.cinert.flat,
37 | # data.cvel.flat,
38 | # data.qfrc_actuator.flat,
39 | # data.cfrc_ext.flat])
40 |
41 | def step(self, a):
42 | pos_before = mass_center(self.model, self.sim)
43 | self.do_simulation(a, self.frame_skip)
44 | mjp.functions.mj_rnePostConstraint(self.sim.model, self.sim.data) #### calc contacts, this is a mujoco py version mismatch issue with mujoco200
45 | pos_after = mass_center(self.model, self.sim)
46 | alive_bonus = 5.0
47 | data = self.sim.data
48 | lin_vel_cost = 1.25 * (pos_after - pos_before) / self.dt
49 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
50 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum()
51 | quad_impact_cost = min(quad_impact_cost, 10)
52 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
53 |
54 | yposafter = self.get_body_com("torso")[1]
55 | ywall = np.array([-5,5])
56 | if pos_after<20:
57 | y_walldist = yposafter - pos_after*np.tan(30/360*2*np.pi)+ywall
58 | elif pos_after>20 and pos_after<60:
59 | y_walldist = yposafter + (pos_after-40)*np.tan(30/360*2*np.pi) - ywall
60 | elif pos_after>60 and pos_after<100:
61 | y_walldist = yposafter - (pos_after-80)*np.tan(30/360*2*np.pi) + ywall
62 | else:
63 | y_walldist = yposafter - 20*np.tan(30/360*2*np.pi) + ywall
64 | obj_cost = (abs(y_walldist)<2).any()*1.0
65 |
66 |
67 | qpos = self.sim.data.qpos
68 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
69 |
70 | done_cost = done*1.0
71 | cost = np.clip(obj_cost+done_cost, 0, 1)
72 |
73 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost,
74 | reward_quadctrl=-quad_ctrl_cost,
75 | reward_alive=alive_bonus,
76 | reward_impact=-quad_impact_cost,
77 |                                            cost_obj=obj_cost,
78 |                                            cost_done=done_cost,
79 |                                            cost=cost,
80 | )
81 |
82 | def reset_model(self):
83 | c = 0.01
84 | # self.set_state(
85 | # self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
86 | # self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
87 | # )
88 | # return self._get_obs()
89 | qpos = self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq)
90 | qpos[-42:] = self.init_qpos[-42:]
91 | qvel = self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
92 | qvel[-36:] = self.init_qvel[-36:]
93 | self.set_state(qpos, qvel)
94 | return self._get_obs()
95 |
96 |
97 | def viewer_setup(self):
98 | self.viewer.cam.trackbodyid = 1
99 | self.viewer.cam.distance = self.model.stat.extent * 1.0
100 | self.viewer.cam.lookat[2] = 2.0
101 | self.viewer.cam.elevation = -20
102 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/mujoco_env.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import os
3 |
4 |
5 | from gym import error, spaces
6 | from gym.utils import seeding
7 | import numpy as np
8 | from os import path
9 | import gym
10 |
11 | try:
12 | import mujoco_py
13 | except ImportError as e:
14 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e))
15 |
16 | DEFAULT_SIZE = 500
17 |
18 |
19 | def convert_observation_to_space(observation):
20 | if isinstance(observation, dict):
21 | space = spaces.Dict(OrderedDict([
22 | (key, convert_observation_to_space(value))
23 | for key, value in observation.items()
24 | ]))
25 | elif isinstance(observation, np.ndarray):
26 | low = np.full(observation.shape, -float('inf'), dtype=np.float32)
27 | high = np.full(observation.shape, float('inf'), dtype=np.float32)
28 | space = spaces.Box(low, high, dtype=observation.dtype)
29 | else:
30 | raise NotImplementedError(type(observation), observation)
31 |
32 | return space
33 |
34 |
35 | class MujocoEnv(gym.Env):
36 | """Superclass for all MuJoCo environments.
37 | """
38 |
39 | def __init__(self, model_path, frame_skip):
40 | if model_path.startswith("/"):
41 | fullpath = model_path
42 | else:
43 | fullpath = os.path.join(os.path.dirname(__file__), "./assets", model_path)
44 | if not path.exists(fullpath):
45 | raise IOError("File %s does not exist" % fullpath)
46 | self.frame_skip = frame_skip
47 | self.model = mujoco_py.load_model_from_path(fullpath)
48 | self.sim = mujoco_py.MjSim(self.model)
49 | self.data = self.sim.data
50 | self.viewer = None
51 | self._viewers = {}
52 |
53 | self.metadata = {
54 | 'render.modes': ['human', 'rgb_array', 'depth_array'],
55 | 'video.frames_per_second': int(np.round(1.0 / self.dt))
56 | }
57 |
58 | self.init_qpos = self.sim.data.qpos.ravel().copy()
59 | self.init_qvel = self.sim.data.qvel.ravel().copy()
60 |
61 | self._set_action_space()
62 |
63 | action = self.action_space.sample()
64 | observation, _reward, done, _info = self.step(action)
65 | # assert not done
66 |
67 | self._set_observation_space(observation)
68 |
69 | self.seed()
70 |
71 | def _set_action_space(self):
72 | bounds = self.model.actuator_ctrlrange.copy().astype(np.float32)
73 | low, high = bounds.T
74 | self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
75 | return self.action_space
76 |
77 | def _set_observation_space(self, observation):
78 | self.observation_space = convert_observation_to_space(observation)
79 | return self.observation_space
80 |
81 | def seed(self, seed=None):
82 | self.np_random, seed = seeding.np_random(seed)
83 | return [seed]
84 |
85 | # methods to override:
86 | # ----------------------------
87 |
88 | def reset_model(self):
89 | """
90 | Reset the robot degrees of freedom (qpos and qvel).
91 | Implement this in each subclass.
92 | """
93 | raise NotImplementedError
94 |
95 | def viewer_setup(self):
96 | """
97 | This method is called when the viewer is initialized.
98 | Optionally implement this method, if you need to tinker with camera position
99 | and so forth.
100 | """
101 | pass
102 |
103 | # -----------------------------
104 |
105 | def reset(self):
106 | self.sim.reset()
107 | ob = self.reset_model()
108 | return ob
109 |
110 | def set_state(self, qpos, qvel):
111 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
112 | old_state = self.sim.get_state()
113 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel,
114 | old_state.act, old_state.udd_state)
115 | self.sim.set_state(new_state)
116 | self.sim.forward()
117 |
118 | @property
119 | def dt(self):
120 | return self.model.opt.timestep * self.frame_skip
121 |
122 | def do_simulation(self, ctrl, n_frames):
123 | self.sim.data.ctrl[:] = ctrl
124 | for _ in range(n_frames):
125 | self.sim.step()
126 |
127 | def render(self,
128 | mode='human',
129 | width=DEFAULT_SIZE,
130 | height=DEFAULT_SIZE,
131 | camera_id=None,
132 | camera_name=None):
133 | if mode == 'rgb_array':
134 | if camera_id is not None and camera_name is not None:
135 | raise ValueError("Both `camera_id` and `camera_name` cannot be"
136 | " specified at the same time.")
137 |
138 | no_camera_specified = camera_name is None and camera_id is None
139 | if no_camera_specified:
140 | camera_name = 'track'
141 |
142 | if camera_id is None and camera_name in self.model._camera_name2id:
143 | camera_id = self.model.camera_name2id(camera_name)
144 |
145 | self._get_viewer(mode).render(width, height, camera_id=camera_id)
146 | # window size used for old mujoco-py:
147 | data = self._get_viewer(mode).read_pixels(width, height, depth=False)
148 | # original image is upside-down, so flip it
149 | return data[::-1, :, :]
150 | elif mode == 'depth_array':
151 | self._get_viewer(mode).render(width, height)
152 | # window size used for old mujoco-py:
153 | # Extract depth part of the read_pixels() tuple
154 | data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
155 | # original image is upside-down, so flip it
156 | return data[::-1, :]
157 | elif mode == 'human':
158 | self._get_viewer(mode).render()
159 |
160 | def close(self):
161 | if self.viewer is not None:
162 | # self.viewer.finish()
163 | self.viewer = None
164 | self._viewers = {}
165 |
166 | def _get_viewer(self, mode):
167 | self.viewer = self._viewers.get(mode)
168 | if self.viewer is None:
169 | if mode == 'human':
170 | self.viewer = mujoco_py.MjViewer(self.sim)
171 | elif mode == 'rgb_array' or mode == 'depth_array':
172 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1)
173 |
174 | self.viewer_setup()
175 | self._viewers[mode] = self.viewer
176 | return self.viewer
177 |
178 | def get_body_com(self, body_name):
179 | return self.data.get_body_xpos(body_name)
180 |
181 | def state_vector(self):
182 | return np.concatenate([
183 | self.sim.data.qpos.flat,
184 | self.sim.data.qvel.flat
185 | ])
186 |
187 | def place_random_objects(self):
188 | for i in range(9):
189 | random_color_array = np.append(np.random.uniform(0, 1, size=3), 1)
190 | random_pos_array = np.append(np.random.uniform(-10., 10., size=2), 0.5)
191 | site_id = self.sim.model.geom_name2id('obj' + str(i))
192 | self.sim.model.geom_rgba[site_id] = random_color_array
193 | self.sim.model.geom_pos[site_id] = random_pos_array
194 |
--------------------------------------------------------------------------------
/envs/mujoco_safety_gym/envs/robot_env.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import numpy as np
4 |
5 | import gym
6 | from gym import error, spaces
7 | from gym.utils import seeding
8 |
9 | try:
10 | import mujoco_py
11 | except ImportError as e:
12 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e))
13 |
14 | DEFAULT_SIZE = 500
15 |
16 | class RobotEnv(gym.GoalEnv):
17 | def __init__(self, model_path, initial_qpos, n_actions, n_substeps):
18 | if model_path.startswith('/'):
19 | fullpath = model_path
20 | else:
21 | fullpath = os.path.join(os.path.dirname(__file__), 'assets', model_path)
22 | if not os.path.exists(fullpath):
23 | raise IOError('File {} does not exist'.format(fullpath))
24 |
25 | model = mujoco_py.load_model_from_path(fullpath)
26 | self.sim = mujoco_py.MjSim(model, nsubsteps=n_substeps)
27 | self.viewer = None
28 | self._viewers = {}
29 |
30 | self.metadata = {
31 | 'render.modes': ['human', 'rgb_array'],
32 | 'video.frames_per_second': int(np.round(1.0 / self.dt))
33 | }
34 |
35 | self.seed()
36 | self._env_setup(initial_qpos=initial_qpos)
37 | self.initial_state = copy.deepcopy(self.sim.get_state())
38 |
39 | self.goal = self._sample_goal()
40 | obs = self._get_obs()
41 | self.action_space = spaces.Box(-1., 1., shape=(n_actions,), dtype='float32')
42 | self.observation_space = spaces.Dict(dict(
43 | desired_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'),
44 | achieved_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'),
45 | observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32'),
46 | ))
47 |
48 | @property
49 | def dt(self):
50 | return self.sim.model.opt.timestep * self.sim.nsubsteps
51 |
52 | # Env methods
53 | # ----------------------------
54 |
55 | def seed(self, seed=None):
56 | self.np_random, seed = seeding.np_random(seed)
57 | return [seed]
58 |
59 | def step(self, action):
60 | # if (action.shape < (4,) and np.ndim(action.shape) == 1):
61 | # action = np.append(action, np.zeros(4 - action.shape[0]))
62 | action = np.clip(action, self.action_space.low, self.action_space.high)
63 | self._set_action(action)
64 | self.sim.step()
65 | self._step_callback()
66 | obs = self._get_obs()
67 |
68 | done = False
69 | info = {
70 | 'is_success': self._is_success(obs['achieved_goal'], self.goal),
71 | 'cost': self._compute_costs(obs),
72 | }
73 | reward = self.compute_reward(obs['achieved_goal'], self.goal, info)
74 | return obs, reward, done, info
75 |
76 | def reset(self, **kwargs):
77 | # Attempt to reset the simulator. Since we randomize initial conditions, it
78 | # is possible to get into a state with numerical issues (e.g. due to penetration or
79 |         # Gimbal lock) or we may not achieve an initial condition (e.g. an object is within the hand).
80 | # In this case, we just keep randomizing until we eventually achieve a valid initial
81 | # configuration.
82 | super(RobotEnv, self).reset()
83 | did_reset_sim = False
84 | self.goal = self._sample_goal().copy()
85 | while not did_reset_sim:
86 | did_reset_sim = self._reset_sim(**kwargs)
87 | obs = self._get_obs()
88 | return obs
89 |
90 | def close(self):
91 | if self.viewer is not None:
92 | # self.viewer.finish()
93 | self.viewer = None
94 | self._viewers = {}
95 |
96 | def render(self, mode='human', width=DEFAULT_SIZE, height=DEFAULT_SIZE):
97 | self._render_callback()
98 | if mode == 'rgb_array':
99 | self._get_viewer(mode).render(width, height)
100 | # window size used for old mujoco-py:
101 | data = self._get_viewer(mode, True).read_pixels(width, height, depth=False)
102 | # original image is upside-down, so flip it
103 | return data[::-1, :, :]
104 | elif mode == 'human':
105 | self._get_viewer(mode).render()
106 |
107 | def _get_viewer(self, mode, cam_fixed=False):
108 | self.viewer = self._viewers.get(mode)
109 | if self.viewer is None:
110 | if mode == 'human':
111 | self.viewer = mujoco_py.MjViewer(self.sim)
112 | elif mode == 'rgb_array':
113 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, 0)
114 | cam_fixed = True
115 | self._viewer_setup(cam_fixed)
116 | self._viewers[mode] = self.viewer
117 | return self.viewer
118 |
119 | # Extension methods
120 | # ----------------------------
121 |
122 | def _reset_sim(self, **kwargs):
123 | """Resets a simulation and indicates whether or not it was successful.
124 | If a reset was unsuccessful (e.g. if a randomized state caused an error in the
125 | simulation), this method should indicate such a failure by returning False.
126 |         In such a case, this method will be called again to attempt the reset.
127 | """
128 | self.sim.set_state(self.initial_state)
129 | self.sim.forward()
130 | return True
131 |
132 | def _get_obs(self):
133 | """Returns the observation.
134 | """
135 | raise NotImplementedError()
136 |
137 | def _set_action(self, action):
138 | """Applies the given action to the simulation.
139 | """
140 | raise NotImplementedError()
141 |
142 | def _is_success(self, achieved_goal, desired_goal):
143 | """Indicates whether or not the achieved goal successfully achieved the desired goal.
144 | """
145 | raise NotImplementedError()
146 |
147 | def _sample_goal(self):
148 | """Samples a new goal and returns it.
149 | """
150 | raise NotImplementedError()
151 |
152 | def _env_setup(self, initial_qpos):
153 | """Initial configuration of the environment. Can be used to configure initial state
154 | and extract information from the simulation.
155 | """
156 | pass
157 |
158 | def _viewer_setup(self, cam_fixed=False):
159 | """Initial configuration of the viewer. Can be used to set the camera position,
160 | for example.
161 | """
162 | pass
163 |
164 | def _render_callback(self):
165 | """A custom callback that is called before rendering. Can be used
166 | to implement custom visualizations.
167 | """
168 | pass
169 |
170 | def _step_callback(self):
171 | """A custom callback that is called after stepping the simulation. Can be used
172 | to enforce additional constraints on the simulation state.
173 | """
174 | pass
175 |
176 | def _compute_costs(self, obs):
177 | """Calculate the costs for the given observation
178 | """
179 | pass
180 |
--------------------------------------------------------------------------------
/envs/utils.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import envs.mujoco_safety_gym
3 | from wrappers import NormalizeActionWrapper
4 |
5 | def get_gym_env():
6 | import gym
7 | import envs.mujoco_safety_gym
8 |
9 | return gym.make
10 |
11 | def get_safety_gym():
12 | import safety_gym
13 |
14 | return gym.make
15 |
16 | ENVS_FUNCTIONS = {
17 | 'gym':get_gym_env()
18 | }
19 |
20 | def get_environment(universe, task, environment_kwargs):
21 | env = ENVS_FUNCTIONS[universe](task, **environment_kwargs)
22 | return env
23 |
24 | def get_env_from_params(env_params):
25 | universe = env_params['universe']
26 | task = env_params['task']
27 | environment_kwargs = env_params.get('kwargs', {}).copy()
28 |
29 | env = get_environment(universe, task, environment_kwargs)
30 |
31 | #### @anyboby maybe write something nicer for wrappers
32 | if env_params.get('normalize_actions', False):
33 | env = NormalizeActionWrapper(env)
34 |
35 | return env
36 |
--------------------------------------------------------------------------------
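`get_env_from_params` expects a params dict with `universe`, `task`, and optional `kwargs` / `normalize_actions`. A minimal sketch, not part of the repository, assuming the repo's import paths and dependencies are set up; the task id below is a hypothetical placeholder:

from envs.utils import get_env_from_params

env_params = {
    'universe': 'gym',                 # looked up in ENVS_FUNCTIONS -> gym.make
    'task': 'SomeSafeEnv-v0',          # hypothetical id; use one registered by envs/mujoco_safety_gym
    'kwargs': {},                      # forwarded to gym.make
    'normalize_actions': True,         # wraps the env in NormalizeActionWrapper
}
env = get_env_from_params(env_params)
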
/envs/wrappers/__init__.py:
--------------------------------------------------------------------------------
1 | from .normalize_action import NormalizeActionWrapper
--------------------------------------------------------------------------------
/envs/wrappers/normalize_action.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from gym import spaces
3 | import numpy as np
4 |
5 |
6 | __all__ = ['NormalizeActionWrapper']
7 |
8 | class NormalizeActionWrapper(gym.ActionWrapper):
9 | """Rescale the action space of the environment."""
10 |
11 | def action(self, action):
12 | if not isinstance(self.env.action_space, spaces.Box):
13 | return action
14 |
15 | # rescale the action
16 | low, high = self.env.action_space.low, self.env.action_space.high
17 | scaled_action = low + (action + 1.0) * (high - low) / 2.0
18 | scaled_action = np.clip(scaled_action, low, high)
19 |
20 | return scaled_action
21 |
22 | def reverse_action(self, action):
23 | raise NotImplementedError
24 |
25 | normalize = NormalizeActionWrapper
26 |
--------------------------------------------------------------------------------
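The wrapper maps actions from [-1, 1] onto the wrapped environment's `Box` bounds. A small self-contained check, not part of the repository (the dummy environment is purely illustrative):

import numpy as np
import gym
from gym import spaces
from envs.wrappers import NormalizeActionWrapper

class _DummyEnv(gym.Env):
    """Stand-in env with an asymmetric Box action space (illustrative only)."""
    action_space = spaces.Box(low=np.array([0.0, -2.0]), high=np.array([1.0, 2.0]), dtype=np.float32)
    observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)
    def step(self, action):
        return np.zeros(1, dtype=np.float32), 0.0, False, {'applied_action': action}
    def reset(self):
        return np.zeros(1, dtype=np.float32)

wrapped = NormalizeActionWrapper(_DummyEnv())
print(wrapped.action(np.array([-1.0, 0.0])))   # -1 maps to low, 0 to the midpoint: ~[0. 0.]
print(wrapped.action(np.array([ 1.0, 1.0])))   # +1 maps to high:                  ~[1. 2.]
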
/models/base_model.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 | class BaseModel(abc.ABC):
4 |
5 |     @abc.abstractmethod
6 |     def predict(self, x):
7 |         """ Make predictions, should return (mean, var) if model is probabilistic, otherwise just the mean """
8 |         raise NotImplementedError
9 | 
10 |     @abc.abstractmethod
11 |     def train(self, x, y):
12 |         """ Train the model on inputs x and targets y """
13 |         raise NotImplementedError
14 |
15 | @abc.abstractproperty
16 | def is_probabilistic(self):
17 | """ indicates whether model predictions are probabilistic or deterministic """
18 | raise NotImplementedError
19 |
20 | @abc.abstractproperty
21 | def is_ensemble(self):
22 | """ indicates whether model is an ensemble """
23 | raise NotImplementedError
24 |
25 | @abc.abstractproperty
26 | def in_dim(self):
27 | """ dimension of inputs """
28 | raise NotImplementedError
29 |
30 | @abc.abstractproperty
31 | def out_dim(self):
32 | """ dimension of outputs """
33 | raise NotImplementedError
34 |
35 | class EnsembleModel(BaseModel):
36 | @abc.abstractmethod
37 | def predict_ensemble(self, x):
38 | """ Make predictions of whole ensemble, output shape should be (ensemble, batch_size, y_shape)"""
39 | raise NotImplementedError
40 |
41 | @abc.abstractmethod
42 | def elite_inds(self,):
43 | """ Returns indices of the elite models"""
44 | raise NotImplementedError
--------------------------------------------------------------------------------
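A minimal sketch of a concrete subclass, not part of the repository, illustrating which members `BaseModel` requires (assumes the repository root is on PYTHONPATH):

import numpy as np
from models.base_model import BaseModel

class ConstantModel(BaseModel):
    """Toy deterministic model that always predicts the mean of its training targets."""

    def __init__(self, in_dim, out_dim):
        self._in_dim, self._out_dim = in_dim, out_dim
        self._mean = np.zeros(out_dim)

    def predict(self, x):
        # deterministic model: return only the mean prediction
        return np.tile(self._mean, (len(x), 1))

    def train(self, x, y):
        self._mean = np.mean(y, axis=0)

    @property
    def is_probabilistic(self):
        return False

    @property
    def is_ensemble(self):
        return False

    @property
    def in_dim(self):
        return self._in_dim

    @property
    def out_dim(self):
        return self._out_dim
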
/models/fake_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import pdb
4 |
5 | from models.pens.pe_factory import build_PE, format_samples_for_dyn, format_samples_for_cost
6 | from models.pens.utils import average_dkl, median_dkl
7 | from models.statics import (REWS_BY_TASK, COST_BY_TASK, TERMS_BY_TASK)
8 |
9 | from itertools import count
10 | import warnings
11 | import time
12 |
13 | EPS = 1e-8
14 |
15 | class FakeEnv:
16 |
17 | def __init__(self,
18 | true_environment,
19 | task,
20 | model,
21 | predicts_delta,
22 | predicts_rew,
23 | predicts_cost,
24 | ):
25 | """
26 | Creates a fake environment that emulates common RL env methodology:
27 | Args:
28 | true_environment(`env`): true environment, used for shapes
29 | task(`str`): name of the task, used to locate static fallback functions for r, c, or term
30 | model(`BaseModel`): dynamics model, should inherit from BaseModel and implement the corresponding
31 | methods,
32 | inputs dim should be (obs_dim + act_dim,)
33 | predicts_delta(`bool`): Does the model predict state-changes or absolute next-states?
34 | predicts_rew(`bool`): Does the model predict rewards?
35 | If yes: rewards should be included in outputs after dynamics,
36 | i.e.: dim(outputs) = (..., (next_obs, r))
37 | predicts_cost(`bool`): Does the model predict costs?
38 | If yes: costs should be included in outputs after dynamics and rewards (if applicable),
39 | i.e.: dim(outputs) = (..., (next_obs, r, c))
40 | """
41 | self.env = true_environment
42 | self.obs_dim = np.prod(self.observation_space.shape)
43 | self.act_dim = np.prod(self.action_space.shape)
44 | self._task = task
45 |
46 | self._model = model
47 | self._uses_ensemble = self._model.is_ensemble
48 | self._is_probabilistic = self._model.is_probabilistic
49 |
50 | self._predicts_delta = predicts_delta
51 | self._predicts_rew = predicts_rew
52 | self._predicts_cost = predicts_cost
53 |
54 | #### create fake env from model
55 | self.input_dim = self._model.in_dim
56 | self.output_dim = self._model.out_dim
57 |
58 | @property
59 | def observation_space(self):
60 | return self.env.observation_space
61 |
62 | @property
63 | def action_space(self):
64 | return self.env.action_space
65 |
66 | def step(self, obs, act, deterministic=True):
67 | assert len(obs.shape) == len(act.shape)
68 | assert obs.shape[-1]==self.obs_dim and act.shape[-1]==self.act_dim
69 |
70 | ### check dimensionality of obs
71 | obs_depth = len(obs.shape)
72 | if obs_depth == 1:
73 | obs = obs[None]
74 | act = act[None]
75 | return_single=True
76 | else:
77 | return_single = False
78 |
79 |
80 | ### create model inputs
81 | inputs = np.concatenate((obs, act), axis=-1)
82 |
83 | ### if 3D-inputs, we shuffle so different models predict at every step
84 | if obs_depth==3:
85 | inputs, shuffle_indxs = self.forward_shuffle(inputs)
86 |
87 | ### predict
88 | if self._uses_ensemble:
89 |             pred = self._model.predict_ensemble(inputs) #### ensemble spread provides the epistemic variance used below
90 | else:
91 | pred = self._model.predict(inputs)
92 |
93 | ### split predictions if probabilistic
94 | if self._is_probabilistic:
95 | pred_mean, pred_var = pred
96 | else:
97 | pred_mean, pred_var = pred, np.zeros_like(pred)
98 |
99 | ### shuffle back
100 | if obs_depth==3:
101 | pred_mean, pred_var = self.inverse_shuffle(pred_mean, shuffle_indxs), self.inverse_shuffle(pred_var, shuffle_indxs)
102 |
103 |         #### sample probabilistic transitions if a variance is predicted and deterministic=False
104 |         pred_std = np.sqrt(pred_var)
105 |         if not deterministic:
106 |             next_obs = pred_mean[...,:self.obs_dim] + np.random.normal(size=pred_std[...,:self.obs_dim].shape)*pred_std[...,:self.obs_dim]
107 |         else:
108 |             next_obs = pred_mean[...,:self.obs_dim]
109 |
110 | #### extract uncertainty measures
111 | if self._uses_ensemble:
112 | ens_ep_var = np.var(next_obs, axis=0)
113 | ens_dkl_path = np.mean(average_dkl(next_obs, pred_std[...,:self.obs_dim]), axis=-1) ##@anyboby gives ugly numbers if var=0
114 | ens_dkl_mean = np.mean(ens_dkl_path)
115 | else:
116 | ens_ep_var = 0
117 | ens_dkl_path = np.zeros(shape=obs.shape[1])
118 | ens_dkl_mean = 0
119 |
120 | #### choose one model from ensemble randomly, if ensemble and not 3d inputs
121 | if self._uses_ensemble and obs_depth<3:
122 | _, batch_size, _ = next_obs.shape
123 | model_inds = self.random_inds(batch_size) ## only elites
124 | batch_inds = np.arange(0, batch_size)
125 | next_obs = next_obs[model_inds, batch_inds]
126 | else:
127 | next_obs = next_obs
128 |
129 | #### add to obs if delta predictions
130 | if self._predicts_delta:
131 | next_obs += obs
132 |
133 | #### extract rew, cost, or call fallback functions for terms, rews and costs
134 | if TERMS_BY_TASK.get(self._task, None):
135 | terms = TERMS_BY_TASK[self._task](obs, act, next_obs)
136 | else:
137 | terms = TERMS_BY_TASK['default'](obs, act, next_obs)
138 |
139 | if self._predicts_cost:
140 | c = pred_mean[...,-1:]
141 | c = c[model_inds, batch_inds]
142 | pred_mean = pred_mean[...,:-1]
143 | elif COST_BY_TASK.get(self._task, None):
144 | c = COST_BY_TASK[self._task](obs, act, next_obs)
145 | else:
146 | c = np.zeros_like(terms)
147 |
148 | if self._predicts_rew:
149 | r = pred_mean[...,-1:]
150 | r = r[model_inds, batch_inds]
151 | pred_mean = pred_mean[...,:-1]
152 | elif REWS_BY_TASK.get(self._task, None):
153 | r = REWS_BY_TASK[self._task](obs, act, next_obs)
154 |
155 | assert r is not None, \
156 | "Please provide either static functions or predictions for rewards, costs and terms"
157 |
158 | if return_single:
159 | next_obs = next_obs[0]
160 | r = r[0]
161 | c = c[0]
162 | terms = terms[0]
163 |
164 | info = {
165 | 'ensemble_dkl_mean' : ens_dkl_mean,
166 | 'ensemble_dkl_path' : ens_dkl_path,
167 | 'ensemble_ep_var' : ens_ep_var,
168 | 'rew':r,
169 | 'cost':c,
170 | }
171 |
172 | return next_obs, r, terms, info
173 |
174 | def random_inds(self, size):
175 | if self._model.is_ensemble:
176 | return np.random.choice(self._model.elite_inds, (size))
177 | else:
178 | return np.random.choice([0], (size))
179 |
180 | def forward_shuffle(self, ndarray):
181 | """
182 | shuffles ndarray forward along axis 0 with random elite indices,
183 | Returns shuffled copy of ndarray and indices with which was shuffled
184 | """
185 | idxs = np.random.permutation(ndarray.shape[0])
186 | shuffled = ndarray[idxs]
187 | return shuffled, idxs
188 |
189 |     def inverse_shuffle(self, ndarray, idxs):
190 |         """
191 |         inverts a shuffle of ndarray along axis 0, given the indices used for shuffling.
192 |         Returns unshuffled copy of ndarray
193 |         """
194 |         unshuffled = ndarray[np.argsort(idxs)]   ### argsort yields the inverse permutation
195 |         return unshuffled
196 |
197 | def close(self):
198 | pass
199 |
--------------------------------------------------------------------------------
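`forward_shuffle`/`inverse_shuffle` let `FakeEnv` assign a random ensemble member to each slice of a 3D (ensemble, batch, dim) input and then restore the original order. A quick round-trip check, not part of the repository, consistent with the argsort-based inverse above:

import numpy as np

arr = np.arange(5*2*3).reshape(5, 2, 3)       # (ensemble, batch, dim)
idxs = np.random.permutation(arr.shape[0])

shuffled = arr[idxs]                          # what forward_shuffle does
restored = shuffled[np.argsort(idxs)]         # what inverse_shuffle does; argsort inverts the permutation
assert np.array_equal(restored, arr)
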
/models/pens/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/models/pens/__init__.py
--------------------------------------------------------------------------------
/models/pens/logger.py:
--------------------------------------------------------------------------------
1 | import time
2 | import math
3 | import pdb
4 |
5 |
6 |
7 | def update_dict(dict_a, dict_b, weight_a=.5, weight_b=.5):
8 |     """
9 |     creates a new dict, updated with dict_b; entries shared by both dicts are combined according to the weights.
10 |     with both weights = 1 the shared entries are simply added
11 |     """
12 | dict_a_cp = dict(dict_a)
13 | dict_a_cp.update(dict_b)
14 | for k,v in dict_b.items():
15 | if k in dict_a.keys():
16 | dict_a_cp[k] = weight_b*dict_b[k] + weight_a*dict_a[k]
17 | return dict_a_cp
18 |
19 | class Progress:
20 |
21 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100):
22 | self.total = total
23 | self.name = name
24 | self.ncol = ncol
25 | self.max_length = max_length
26 | self.indent = indent
27 | self.line_width = line_width
28 | self._speed_update_freq = speed_update_freq
29 |
30 | self._step = 0
31 | self._prev_line = '\033[F'
32 | self._clear_line = ' ' * self.line_width
33 |
34 | self._pbar_size = self.ncol * self.max_length
35 | self._complete_pbar = '#' * self._pbar_size
36 | self._incomplete_pbar = ' ' * self._pbar_size
37 |
38 | self.lines = ['']
39 | self.fraction = '{} / {}'.format(0, self.total)
40 |
41 | self.resume()
42 |
43 |
44 | def update(self, n=1):
45 | self._step += n
46 | if self._step % self._speed_update_freq == 0:
47 | self._time0 = time.time()
48 | self._step0 = self._step
49 |
50 | def resume(self):
51 | self._skip_lines = 1
52 | print('\n', end='')
53 | self._time0 = time.time()
54 | self._step0 = self._step
55 |
56 | def pause(self):
57 | self._clear()
58 | self._skip_lines = 1
59 |
60 | def set_description(self, params=[]):
61 |
62 | ############
63 | # Position #
64 | ############
65 | self._clear()
66 |
67 | ###########
68 | # Percent #
69 | ###########
70 | percent, fraction = self._format_percent(self._step, self.total)
71 | self.fraction = fraction
72 |
73 | #########
74 | # Speed #
75 | #########
76 | speed = self._format_speed(self._step)
77 |
78 | ##########
79 | # Params #
80 | ##########
81 | num_params = len(params)
82 | nrow = math.ceil(num_params / self.ncol)
83 | params_split = self._chunk(params, self.ncol)
84 | params_string, lines = self._format(params_split)
85 | self.lines = lines
86 |
87 |
88 | description = '{} | {}{}'.format(percent, speed, params_string)
89 | print(description)
90 | self._skip_lines = nrow + 1
91 |
92 | def append_description(self, descr):
93 | self.lines.append(descr)
94 |
95 | def _clear(self):
96 | position = self._prev_line * self._skip_lines
97 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)])
98 | print(position, end='')
99 | print(empty)
100 | print(position, end='')
101 |
102 | def _format_percent(self, n, total):
103 | if total:
104 | percent = n / float(total)
105 |
106 | complete_entries = int(percent * self._pbar_size)
107 | incomplete_entries = self._pbar_size - complete_entries
108 |
109 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries]
110 | fraction = '{} / {}'.format(n, total)
111 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100))
112 | else:
113 | fraction = '{}'.format(n)
114 | string = '{} iterations'.format(n)
115 | return string, fraction
116 |
117 | def _format_speed(self, n):
118 | num_steps = n - self._step0
119 | t = time.time() - self._time0
120 | speed = num_steps / t
121 | string = '{:.1f} Hz'.format(speed)
122 | if num_steps > 0:
123 | self._speed = string
124 | return string
125 |
126 | def _chunk(self, l, n):
127 | return [l[i:i+n] for i in range(0, len(l), n)]
128 |
129 | def _format(self, chunks):
130 | lines = [self._format_chunk(chunk) for chunk in chunks]
131 | lines.insert(0,'')
132 | padding = '\n' + ' '*self.indent
133 | string = padding.join(lines)
134 | return string, lines
135 |
136 | def _format_chunk(self, chunk):
137 | line = ' | '.join([self._format_param(param) for param in chunk])
138 | return line
139 |
140 | def _format_param(self, param):
141 | k, v = param
142 | return '{} : {}'.format(k, v)[:self.max_length]
143 |
144 | def stamp(self):
145 | if self.lines != ['']:
146 | params = ' | '.join(self.lines)
147 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed)
148 | self._clear()
149 | print(string, end='\n')
150 | self._skip_lines = 1
151 | else:
152 | self._clear()
153 | self._skip_lines = 0
154 |
155 | def close(self):
156 | self.pause()
157 |
158 | class Silent:
159 |
160 | def __init__(self, *args, **kwargs):
161 | pass
162 |
163 | def __getattr__(self, attr):
164 | return lambda *args: None
165 |
166 |
167 | if __name__ == '__main__':
168 | silent = Silent()
169 | silent.update()
170 | silent.stamp()
171 |
172 | num_steps = 1000
173 | progress = Progress(num_steps)
174 | for i in range(num_steps):
175 | progress.update()
176 | params = [
177 | ['A', '{:06d}'.format(i)],
178 | ['B', '{:06d}'.format(i)],
179 | ['C', '{:06d}'.format(i)],
180 | ['D', '{:06d}'.format(i)],
181 | ['E', '{:06d}'.format(i)],
182 | ['F', '{:06d}'.format(i)],
183 | ['G', '{:06d}'.format(i)],
184 | ['H', '{:06d}'.format(i)],
185 | ]
186 | progress.set_description(params)
187 | time.sleep(0.01)
188 | progress.close()
189 |
--------------------------------------------------------------------------------
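`update_dict` blends two metric dictionaries: shared keys are combined with the given weights, keys only present in `dict_b` are copied over. A small worked example, not part of the repository:

from models.pens.logger import update_dict

a = {'loss': 1.0, 'epochs': 10}
b = {'loss': 3.0, 'val_loss': 2.0}

print(update_dict(a, b))                              # {'loss': 2.0, 'epochs': 10, 'val_loss': 2.0}
print(update_dict(a, b, weight_a=1., weight_b=1.))    # {'loss': 4.0, 'epochs': 10, 'val_loss': 2.0}
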
/models/pens/pe_factory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy.ma as ma
3 | import tensorflow as tf
4 |
5 | import copy
6 | from .fc import FC
7 | from .pe import PE
8 |
9 | def build_PE(in_dim,
10 | out_dim,
11 | name='BNN',
12 | hidden_dims=(200, 200, 200),
13 | num_networks=7,
14 | num_elites=5,
15 | loss = 'MSPE',
16 | activation = 'swish',
17 | output_activation = None,
18 | decay=1e-4,
19 | lr = 1e-3,
20 | lr_decay = None,
21 | decay_steps=None,
22 | use_scaler_in = False,
23 | use_scaler_out = False,
24 | clip_loss = False,
25 | kl_cliprange = 0.1,
26 | max_logvar = .5,
27 | min_logvar = -6,
28 | session=None):
29 | """
30 | Constructs a tf probabilistic ensemble model.
31 | Args:
32 | loss: Choose from 'MSPE', 'NLL', 'MSE', 'Huber', or 'CE'.
33 | choosing MSPE or NLL will construct a model with variance output
34 | """
35 | print('[PE] dim in / out: {} / {} | Hidden dim: {}'.format(in_dim, out_dim, hidden_dims))
36 | #print('[ BNN ] Input Layer dim: {} | Output Layer dim: {} '.format(obs_dim_in+act_dim+prior_dim, obs_dim_out+rew_dim))
37 | params = {'name': name,
38 | 'loss':loss,
39 | 'num_networks': num_networks,
40 | 'num_elites': num_elites,
41 | 'sess': session,
42 | 'use_scaler_in': use_scaler_in,
43 | 'use_scaler_out': use_scaler_out,
44 | 'clip_loss': clip_loss,
45 | 'kl_cliprange':kl_cliprange,
46 | 'max_logvar':max_logvar,
47 | 'min_logvar':min_logvar,
48 | }
49 | model = PE(params)
50 | model.add(FC(hidden_dims[0], input_dim=in_dim, activation=activation, weight_decay=decay/4)) # def dec: 0.000025))
51 |
52 | for hidden_dim in hidden_dims[1:]:
53 | model.add(FC(hidden_dim, activation=activation, weight_decay=decay/2)) # def dec: 0.00005))
54 |
55 | model.add(FC(out_dim, activation=output_activation, weight_decay=decay)) # def dec: 0.0001
56 |
57 | opt_params = {"learning_rate":lr} if lr_decay is None else {"learning_rate":lr,
58 | "learning_rate_decay":lr_decay,
59 | "decay_steps":decay_steps}
60 | model.finalize(tf.train.AdamOptimizer, opt_params, lr_decay=lr_decay)
61 |
62 | total_parameters = 0
63 | for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name):
64 | # shape is an array of tf.Dimension
65 | shape = variable.get_shape()
66 | variable_parameters = 1
67 | for dim in shape:
68 | variable_parameters *= dim.value
69 | total_parameters += variable_parameters
70 |     print('[ Probabilistic Ensemble ] Total trainable Parameters: {} '.format(total_parameters))
71 |
72 | return model
73 |
74 | def format_samples_for_dyn(samples, append_r=True, append_c=False, noise=None):
75 |     """
76 |     formats samples to fit dynamics-model training, specifically returns:
77 |     inputs, outputs:
78 | 
79 |     inputs  = np.concatenate((observations, actions), axis=-1)
80 |     outputs = np.concatenate((delta_observations, rewards, costs), axis=-1)
81 | 
82 |     where rewards and costs are optional
83 |     """
84 | obs = samples['observations']
85 | act = samples['actions']
86 | next_obs = samples['next_observations']
87 | terms = np.squeeze(samples['terminals'])[..., None]
88 |
89 | delta_obs = next_obs - obs
90 |
91 | #### ----END preprocess samples for model training in safety gym -----####
92 | inputs = np.concatenate((obs, act), axis=-1)
93 |
94 | outputs = delta_obs
95 |
96 | if append_r:
97 | rew = np.squeeze(samples['rewards'])[..., None]
98 | outputs = np.concatenate((outputs, rew), axis=-1)
99 |
100 | if append_c:
101 | costs = np.squeeze(samples['costs'])[..., None]
102 | outputs = np.concatenate((outputs, costs), axis=-1)
103 |
104 | # add noise
105 | if noise:
106 | inputs = _add_noise(inputs, noise) ### noise helps (sometimes)
107 |
108 | return inputs, outputs
109 |
110 |
111 | ### @anyboby, try to include this in the model rather than separately
112 | def format_samples_for_cost(samples, oversampling=False, one_hot=True, num_classes=2, noise=None):
113 |     """
114 |     formats samples to fit cost-model training, specifically returns:
115 |     inputs, outputs  with  inputs = np.concatenate((obs, act, next_obs), axis=-1)
116 | 
117 |     Args:
118 |         one_hot: determines whether targets are structured as classification or regression
119 |             one_hot=True will output targets with shape [batch_size, num_classes]
120 |             one_hot=False will output targets with shape [batch_size,] and scalar targets
121 |     """
122 | next_obs = samples['next_observations']
123 | obs = samples['observations']
124 | cost = samples['costs']
125 | act = samples['actions']
126 |
127 | if one_hot:
128 | cost_one_hot = np.zeros(shape=(len(cost), num_classes))
129 | batch_indcs = np.arange(0, len(cost))
130 | costs = cost.astype(int)
131 | cost_one_hot[(batch_indcs, costs)] = 1
132 | outputs = cost_one_hot
133 | else:
134 | outputs = cost[:, None]
135 |
136 | inputs = np.concatenate((obs, act, next_obs), axis=-1)
137 | ## ________________________________ ##
138 | ## oversample cost classes ##
139 | ## ________________________________ ##
140 | if oversampling:
141 | if len(outputs[np.where(costs>0)[0]])>0:
142 | imbalance_ratio = len(outputs[np.where(costs==0)[0]])//len(outputs[np.where(costs>0)[0]])
143 | extra_outputs = np.tile(outputs[np.where(costs>0)[0]], (1+imbalance_ratio//3,1)) ## don't need to overdo it
144 | outputs = np.concatenate((outputs, extra_outputs), axis=0)
145 | extra_inputs = np.tile(inputs[np.where(costs>0)[0]], (1+imbalance_ratio//3,1))
146 | extra_inputs = _add_noise(extra_inputs, 0.0001)
147 | inputs = np.concatenate((inputs, extra_inputs), axis=0)
148 |
149 | ### ______ add noise _____ ###
150 | if noise:
151 | inputs = _add_noise(inputs, noise) ### noise helps
152 |
153 | return inputs, outputs
154 |
155 | def _add_noise(data_inp, noiseToSignal):
156 | data= copy.deepcopy(data_inp)
157 | mean_data = np.mean(data, axis = 0)
158 | std_of_noise = mean_data*noiseToSignal
159 | for j in range(mean_data.shape[0]):
160 | if(std_of_noise[j]>0):
161 | data[:,j] = np.copy(data[:,j]+np.random.normal(0, np.absolute(std_of_noise[j]), (data.shape[0],)))
162 | return data
163 |
164 | def reset_model(model):
165 |     model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name)
166 |     model.sess.run(tf.variables_initializer(model_vars))
167 |
--------------------------------------------------------------------------------
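`format_samples_for_dyn` turns a replay-buffer style dict into `(inputs, outputs)` for the dynamics ensemble. A shape check with toy data, not part of the repository (importing `pe_factory` assumes TensorFlow 1.x and the repo's `fc`/`pe` modules are available):

import numpy as np
from models.pens.pe_factory import format_samples_for_dyn

batch, obs_dim, act_dim = 8, 4, 2
samples = {
    'observations':      np.random.randn(batch, obs_dim),
    'next_observations': np.random.randn(batch, obs_dim),
    'actions':           np.random.randn(batch, act_dim),
    'rewards':           np.random.randn(batch),
    'costs':             np.random.rand(batch),
    'terminals':         np.zeros(batch, dtype=bool),
}

inputs, outputs = format_samples_for_dyn(samples, append_r=True, append_c=True)
print(inputs.shape)    # (8, 6): obs ++ act
print(outputs.shape)   # (8, 6): delta_obs ++ reward ++ cost
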
/models/pens/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from __future__ import absolute_import
4 |
5 | import tensorflow as tf
6 | import numpy as np
7 | EPS = 1e-10
8 |
9 | def get_required_argument(dotmap, key, message, default=None):
10 | val = dotmap.get(key, default)
11 | if val is default:
12 | raise ValueError(message)
13 | return val
14 |
15 | def gaussian_kl_np(mu0, log_std0, mu1, log_std1):
16 |     """interprets each entry in mu_i and log_std_i as independent,
17 |     preserves shape.
18 |     output clipped to [0, 1e10] for numerical stability
19 | """
20 | var0, var1 = np.exp(2 * log_std0), np.exp(2 * log_std1)
21 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1+EPS) - 1) + log_std1 - log_std0
22 | all_kls = pre_sum
23 | #all_kls = np.mean(all_kls)
24 | all_kls = np.clip(all_kls, 0, 1/EPS) ### for stability
25 | return all_kls
26 |
27 | def gaussian_jsd_np(mu0, log_std0, mu1, log_std1):
28 | pass
29 |
30 | def average_dkl(mu, std):
31 | """
32 |     Calculates the average pairwise Kullback-Leibler divergence of multiple univariate Gaussian distributions:
33 | 
34 |     K(P1, ..., Pk) = 1/(k(k-1)) ∑_{i≠j} DKL(Pi || Pj)
35 | 
36 |     (Andrea Sgarro, Informational divergence and the dissimilarity of probability distributions.)
37 | 
38 |     Expects the distributions along axis 0 and samples along axis 1.
39 |     The output is reduced along axis 0.
40 |
41 | Args:
42 | mu: array-like means
43 | std: array-like stds
44 | """
45 | ## clip log
46 | log_std = np.log(std)
47 | log_std = np.clip(log_std, -100, 1e8)
48 | assert len(mu.shape)>=2 and len(log_std.shape)>=2
49 | num_models = len(mu)
50 | d_kl = None
51 | for i in range(num_models):
52 | for j in range(num_models):
53 | if d_kl is None:
54 | d_kl = gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j])
55 | else: d_kl+= gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j])
56 | d_kl = d_kl/(num_models*(num_models-1)+EPS)
57 | return d_kl
58 |
59 | def median_dkl(mu, std):
60 | """
61 |     Calculates the median pairwise Kullback-Leibler divergence of multiple univariate Gaussian distributions:
62 | 
63 |     K(P1, ..., Pk) = median_{i≠j} DKL(Pi || Pj)
64 | 
65 |     (Andrea Sgarro, Informational divergence and the dissimilarity of probability distributions.)
66 | 
67 |     Expects the distributions along axis 0 and samples along axis 1.
68 |     The output is reduced along axis 0.
69 |
70 | Args:
71 | mu: array-like means
72 | std: array-like stds
73 | """
74 | ## clip log
75 | log_std = np.log(std)
76 | log_std = np.clip(log_std, -100, 1e8)
77 | assert len(mu.shape)>=2 and len(log_std.shape)>=2
78 | num_models = len(mu)
79 | d_kl = np.zeros(shape=(num_models*(num_models-1),) + mu.shape[1:])
80 | n = 0
81 | for i in range(num_models):
82 | for j in range(num_models):
83 | if i != j:
84 | d_kl[n] = gaussian_kl_np(mu[i], log_std[i], mu[j], log_std[j])
85 | n += 1
86 | d_kl_med = np.median(d_kl, axis=0)
87 | return d_kl_med
88 |
89 |
90 | class TensorStandardScaler:
91 | """Helper class for automatically normalizing inputs into the network.
92 | """
93 | def __init__(self, x_dim, name='Scaler'):
94 | """Initializes a scaler.
95 |
96 | Arguments:
97 | x_dim (int): The dimensionality of the inputs into the scaler.
98 |
99 | Returns: None.
100 | """
101 | self.fitted = False
102 | with tf.variable_scope(name):
103 | self.count = tf.get_variable(
104 | name=name+'_count', shape=(), initializer=tf.constant_initializer(0),
105 | trainable=False
106 | )
107 |
108 | self.mu = tf.get_variable(
109 | name=name+'_mu', shape=[1, x_dim], initializer=tf.constant_initializer(0.0),
110 | trainable=False
111 | )
112 | self.var = tf.get_variable(
113 | name=name+'_std', shape=[1, x_dim], initializer=tf.constant_initializer(1.0),
114 | trainable=False
115 | )
116 |
117 | self.cached_count, self.cached_mu, self.cached_var = 0, np.zeros([1, x_dim]), np.ones([1, x_dim])
118 |
119 | def fit(self, data):
120 | """Runs two ops, one for assigning the mean of the data to the internal mean, and
121 | another for assigning the standard deviation of the data to the internal standard deviation.
122 |         This function must be called with the owning session set as default (inside a 'with sess.as_default()' block).
123 |
124 | Arguments:
125 | data (np.ndarray): A numpy array containing the input
126 |
127 | Returns: None.
128 | """
129 | batch_count = data.shape[0]
130 | batch_mu = np.mean(data, axis=0, keepdims=True)
131 | batch_var = np.var(data, axis=0, keepdims=True)
132 | new_mean, new_var, new_count = self.running_mean_var_from_batch(batch_mu, batch_var, batch_count)
133 | #sigma[sigma < 1e-8] = 1.0
134 | self.mu.load(new_mean)
135 | self.var.load(new_var)
136 | self.count.load(new_count)
137 | self.fitted = True
138 | self.cache()
139 |
140 | def transform(self, data):
141 | """Transforms the input matrix data using the parameters of this scaler.
142 |
143 | can be adjusted to scale with a factor, to control sensitivity to ood data:
144 | d = (d-mu)/sigma = d + (d-mu)/sigma - d = d + (d(1-sigma)-mu)/sigma
145 | and the version with scaling factor thus becomes
146 | d = d + sc_factor*(d(1-sigma)-mu)/sigma
147 |
148 | Arguments:
149 | data (np.array): A numpy array containing the points to be transformed.
150 |             sc_factor (not an argument of this method): factor controlling to what degree the data is transformed, see the note above.
151 |
152 | Returns: (np.array) The transformed dataset.
153 |
154 |
155 | """
156 | scaled_transform = (data-self.mu)/(tf.maximum(tf.sqrt(self.var), 1e-2))
157 | return scaled_transform
158 |
159 | def inverse_transform(self, data):
160 | """Undoes the transformation performed by this scaler.
161 |
162 | Arguments:
163 | data (np.array): A numpy array containing the points to be transformed.
164 |
165 | Returns: (np.array) The transformed dataset.
166 | """
167 | return (tf.maximum(tf.sqrt(self.var), 1e-2)) * data + self.mu
168 |
169 | def inverse_transform_var(self, data):
170 | """Undoes the transformation performed by this scaler for variances.
171 |
172 | Arguments:
173 | data (np.array): A numpy array containing the points to be transformed.
174 |
175 | Returns: (np.array) The transformed dataset.
176 | """
177 | return tf.square(tf.maximum(tf.sqrt(self.var), 1e-2)) * data
178 |
179 | def inverse_transform_logvar(self, data):
180 | """Undoes the transformation performed by this scaler for variances.
181 |
182 | Arguments:
183 | data (np.array): A numpy array containing the points to be transformed.
184 |
185 | Returns: (np.array) The transformed dataset.
186 | """
187 | return 2*tf.log(tf.maximum(tf.sqrt(self.var), 1e-2)) + data
188 |
189 | def get_vars(self):
190 | """Returns a list of variables managed by this object.
191 |
192 | Returns: (list) The list of variables.
193 | """
194 | return [self.mu, self.var]
195 |
196 | def get_mu(self):
197 | return self.mu
198 |
199 | def get_var(self):
200 | return self.var
201 |
202 | def cache(self):
203 | """Caches current values of this scaler.
204 |
205 | Returns: None.
206 | """
207 | self.cached_mu = self.mu.eval()
208 | self.cached_var = self.var.eval()
209 | self.cached_count = self.count.eval()
210 |
211 | def load_cache(self):
212 | """Loads values from the cache
213 |
214 | Returns: None.
215 | """
216 | self.mu.load(self.cached_mu)
217 | self.var.load(self.cached_var)
218 | self.count.load(self.cached_count)
219 |
220 | def running_mean_var_from_batch(self, batch_mean, batch_var, batch_count):
221 | delta = batch_mean - self.cached_mu
222 | tot_count = self.cached_count + batch_count
223 |
224 | new_mean = self.cached_mu + delta * batch_count / tot_count
225 | m_a = self.cached_var * self.cached_count
226 | m_b = batch_var * batch_count
227 | M2 = m_a + m_b + np.square(delta) * self.cached_count * batch_count / tot_count
228 | new_var = M2 / tot_count
229 | new_count = tot_count
230 |
231 | return new_mean, new_var, new_count
232 |
--------------------------------------------------------------------------------
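A quick numerical sanity check of gaussian_kl_np and average_dkl (assumes the package is importable, e.g. after an editable install via setup.py): KL(N(0,1) || N(1,1)) = 0.5, and identical Gaussians give 0.

import numpy as np
from models.pens.utils import gaussian_kl_np, average_dkl

mu = np.array([[0.0, 0.0],    # model 0: means for two samples
               [1.0, 0.0]])   # model 1
std = np.ones_like(mu)        # unit std everywhere

print(gaussian_kl_np(0.0, 0.0, 1.0, 0.0))   # 0.5
print(average_dkl(mu, std))                 # ~[0.5, 0.0]: per-sample average pairwise KL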
/models/statics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def no_done(obs, act, next_obs):
4 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
5 |
6 |     done = np.zeros(shape=obs.shape[:-1], dtype=bool) #np.array([False]).repeat(len(obs))
7 | done = done[...,None]
8 | return done
9 |
10 | def hcs_cost_f(obs, act, next_obs):
11 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
12 |
13 | xdist = next_obs[...,-1]*10
14 | obj_cost = np.array((np.abs(xdist)<2.0), dtype=np.float32)[..., None]
15 | return obj_cost
16 |
17 | def antsafe_term_fn(obs, act, next_obs):
18 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
19 |
20 | z = next_obs[..., 0]
21 | body_quat = next_obs[...,1:5]
22 | z_rot = 1-2*(body_quat[...,1]**2+body_quat[...,2]**2)
23 |
24 | notdone = np.isfinite(next_obs).all(axis=-1) \
25 | * (z >= 0.2) \
26 | * (z <= 1.0) \
27 |                 * (z_rot >= -0.7)
28 |
29 | done = ~notdone
30 | done = done[...,None]
31 | return done
32 |
33 | def antsafe_c_fn(obs, act, next_obs):
34 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape)
35 |
36 | z = next_obs[..., 0]
37 | body_quat = next_obs[...,1:5]
38 | z_rot = 1-2*(body_quat[...,1]**2+body_quat[...,2]**2)
39 | y_dist = next_obs[..., -1:]
40 |
41 | obj_cost = np.any(abs(y_dist)>3.2, axis=-1)[...,None]*1.0
42 |
43 | notdone = np.isfinite(next_obs).all(axis=-1) \
44 | * (z >= 0.2) \
45 | * (z <= 1.0) \
46 |                 * (z_rot >= -0.7)
47 |
48 | done = ~notdone
49 | done = done[...,None]
50 |
51 | done_cost = done*1.0
52 | cost = np.clip(done_cost+obj_cost, 0, 1)
53 | return cost
54 |
55 |
56 | TERMS_BY_TASK = {
57 | 'default':no_done,
58 | 'HalfCheetah-v2':no_done,
59 | 'HalfCheetahSafe-v2':no_done,
60 | 'AntSafe-v2':antsafe_term_fn,
61 | }
62 |
63 | REWS_BY_TASK = {
64 |
65 | }
66 |
67 | COST_BY_TASK = {
68 | 'HalfCheetahSafe-v2':hcs_cost_f,
69 | 'AntSafe-v2':antsafe_c_fn,
70 | }
--------------------------------------------------------------------------------
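These dictionaries are presumably what the model rollouts (e.g. the fake env) use to look up termination and cost functions by task name; a short usage sketch with illustrative shapes (the real observation/action dimensions come from the environments):

import numpy as np
from models.statics import TERMS_BY_TASK, COST_BY_TASK

obs = np.zeros((5, 18)); act = np.zeros((5, 6)); next_obs = np.zeros((5, 18))

term_fn = TERMS_BY_TASK.get('HalfCheetahSafe-v2', TERMS_BY_TASK['default'])
cost_fn = COST_BY_TASK['HalfCheetahSafe-v2']

dones = term_fn(obs, act, next_obs)   # (5, 1) bools, always False for HalfCheetah
costs = cost_fn(obs, act, next_obs)   # (5, 1) floats, 1.0 where |10 * next_obs[..., -1]| < 2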
/network/ac_network.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from gym.spaces import Box, Discrete
4 | from utilities.utils import combined_shape, EPS
5 |
6 | """
7 | Network utils
8 | """
9 |
10 | def placeholder(dim=None):
11 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim))
12 |
13 | def placeholders(*args):
14 | return [placeholder(dim) for dim in args]
15 |
16 | def placeholder_from_space(space):
17 | if isinstance(space, Box):
18 | return placeholder(space.shape)
19 | elif isinstance(space, Discrete):
20 | return tf.placeholder(dtype=tf.int32, shape=(None,))
21 | raise NotImplementedError('bad space {}'.format(space))
22 |
23 | def placeholders_from_spaces(*args):
24 | return [placeholder_from_space(space) for space in args]
25 |
26 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, ensemble_size = 1):
27 | for h in hidden_sizes[:-1]:
28 | if ensemble_size==1:
29 | x = tf.layers.dense(x, units=h, activation=activation)
30 | else:
31 |             x = tf.layers.dense(x, units=(ensemble_size,)+(h,), activation=activation)  # NOTE: not functional as written (tf.layers.dense expects an int for units); only reached when ensemble_size > 1
32 | x = tf.transpose(x)
33 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)
34 |
35 | def get_vars(scope=''):
36 | return [x for x in tf.trainable_variables() if '/'+scope+'/' in x.name]
37 |
38 | def count_vars(scope=''):
39 | v = get_vars(scope)
40 | return sum([np.prod(var.shape.as_list()) for var in v])
41 |
42 | """
43 | Gaussian distributions
44 | """
45 |
46 | def gaussian_likelihood(x, mu, log_std):
47 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi))
48 | return tf.reduce_sum(pre_sum, axis=1)
49 |
50 | def gaussian_kl(mu0, log_std0, mu1, log_std1):
51 | """Returns average kl divergence between two batches of dists"""
52 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1)
53 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0
54 | all_kls = tf.reduce_sum(pre_sum, axis=1)
55 | return tf.reduce_mean(all_kls)
56 |
57 | def gaussian_entropy(log_std):
58 | """Returns average entropy over a batch of dists"""
59 | pre_sum = log_std + 0.5 * np.log(2*np.pi*np.e)
60 | all_ents = tf.reduce_sum(pre_sum, axis=-1)
61 | return tf.reduce_mean(all_ents)
62 |
63 | """
64 | Categorical distributions
65 | """
66 |
67 | def categorical_kl(logp0, logp1):
68 | """Returns average kl divergence between two batches of dists"""
69 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1)
70 | return tf.reduce_mean(all_kls)
71 |
72 | def categorical_entropy(logp):
73 | """Returns average entropy over a batch of dists"""
74 | all_ents = -tf.reduce_sum(logp * tf.exp(logp), axis=1)
75 | return tf.reduce_mean(all_ents)
76 |
77 |
78 | """
79 | Policies
80 | """
81 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space):
82 | act_dim = action_space.n
83 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None)
84 | logp_all = tf.nn.log_softmax(logits)
85 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1)
86 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1)
87 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1)
88 |
89 | old_logp_all = placeholder(act_dim)
90 | d_kl = categorical_kl(logp_all, old_logp_all)
91 | ent = categorical_entropy(logp_all)
92 |
93 | pi_info = {'logp_all': logp_all}
94 | pi_info_phs = {'logp_all': old_logp_all}
95 |
96 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent
97 |
98 |
99 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
100 | act_dim = a.shape.as_list()[-1]
101 |
102 |
103 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
104 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32))
105 | ### @anyboby testing: higher starting std, ppo1 uses log_std=0 at the beginning
106 | # log_std = tf.get_variable(name='log_std', shape=act_dim ,initializer=tf.zeros_initializer(), dtype=tf.float32)
107 | std = tf.exp(log_std)
108 |
109 | pi = mu + tf.random_normal(tf.shape(mu)) * std
110 | logp = gaussian_likelihood(a, mu, log_std)
111 | logp_pi = gaussian_likelihood(pi, mu, log_std)
112 |
113 | old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim)
114 | d_kl = gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph)
115 | ent = gaussian_entropy(log_std)
116 |
117 | # adjust log_std to input dim, even though it doesn't depend on it
118 | # @anyboby lol this is so bad
119 | log_std_info = tf.tensordot(tf.ones(tf.shape(x)[0]), log_std, axes=0)
120 | pi_info = {'mu': mu, 'log_std': log_std_info}
121 | pi_info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph}
122 |
123 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent
124 |
125 |
126 | LOG_STD_MAX = 2
127 | LOG_STD_MIN = -20
128 |
129 | def mlp_squashed_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
130 | """
131 | Experimental code for squashed gaussian policies, not yet tested
132 | """
133 | act_dim = a.shape.as_list()[-1]
134 | net = mlp(x, list(hidden_sizes), activation, activation)
135 | mu = tf.layers.dense(net, act_dim, activation=output_activation)
136 | log_std = tf.layers.dense(net, act_dim, activation=None)
137 | log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
138 |
139 | std = tf.exp(log_std)
140 | u = mu + tf.random_normal(tf.shape(mu)) * std
141 | pi = tf.tanh(u)
142 |
143 | old_mu_ph, old_log_std_ph, u_ph = placeholders(act_dim, act_dim, act_dim)
144 | d_kl = gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) # kl is invariant to squashing transform
145 |
146 | def apply_squashing_func(log_prob, raw_action):
147 | # Adjustment to log prob
148 | act = tf.tanh(raw_action)
149 | log_prob -= tf.reduce_sum(2*(np.log(2) - act - tf.nn.softplus(-2*act)), axis=1)
150 | return log_prob
151 |
152 | # Base log probs
153 | logp = gaussian_likelihood(u_ph, mu, log_std)
154 | logp_pi = gaussian_likelihood(u, mu, log_std)
155 |
156 | # Squashed log probs
157 | logp = apply_squashing_func(logp, u_ph)
158 | logp_pi = apply_squashing_func(logp_pi, u)
159 |
160 | # Approximate entropy
161 | ent = -tf.reduce_mean(logp_pi) # approximate! hacky!
162 |
163 | pi_info = {'mu': mu, 'log_std': log_std, 'raw_action': u}
164 | pi_info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph, 'raw_action': u_ph}
165 |
166 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent
167 |
168 |
169 |
170 | """
171 | Actor-Critics
172 | """
173 | def mlp_actor_critic(x, a, hidden_sizes_a=(64,64), hidden_sizes_c=(64,64), critic_ensemble_size=1, activation=tf.tanh,
174 | output_activation=None, policy=None, action_space=None):
175 |
176 | # default policy builder depends on action space
177 | if policy is None and isinstance(action_space, Box):
178 | policy = mlp_gaussian_policy
179 | elif policy is None and isinstance(action_space, Discrete):
180 | policy = mlp_categorical_policy
181 |
182 | with tf.variable_scope('pi'):
183 | policy_outs = policy(x, a, hidden_sizes_a, activation, output_activation, action_space)
184 | pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent = policy_outs
185 |
186 | with tf.variable_scope('vf'):
187 | v = tf.squeeze(mlp(x, list(hidden_sizes_c)+[1], activation, None, ensemble_size=critic_ensemble_size))
188 |
189 | with tf.variable_scope('vc'):
190 | vc = tf.squeeze(mlp(x, list(hidden_sizes_c)+[1], activation, None, ensemble_size=critic_ensemble_size))
191 |
192 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc
193 |
194 | def mlp_actor(x, a, hidden_sizes=(64,64), activation=tf.tanh,
195 | output_activation=None, policy=None, action_space=None):
196 |
197 | # default policy builder depends on action space
198 | if policy is None and isinstance(action_space, Box):
199 | policy = mlp_gaussian_policy
200 | elif policy is None and isinstance(action_space, Discrete):
201 | policy = mlp_categorical_policy
202 |
203 | with tf.variable_scope('pi'):
204 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space)
205 | pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent = policy_outs
206 | return pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent
207 |
208 | def mlp_critic (x, hidden_sizes=(64,64), activation=tf.tanh,
209 | output_activation=None, policy=None, action_space=None, name='V'):
210 | with tf.variable_scope(name+'f'):
211 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None))
212 |
213 | with tf.variable_scope(name+'c'):
214 | vc = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None))
215 |
216 | return v, vc
217 |
--------------------------------------------------------------------------------
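A minimal TF1-style construction sketch for the actor-critic above, using hypothetical observation/action dimensions; only the default Gaussian-policy path is exercised:

import numpy as np
import tensorflow as tf
from gym.spaces import Box
from network.ac_network import placeholders_from_spaces, mlp_actor_critic

obs_space = Box(low=-np.inf, high=np.inf, shape=(17,), dtype=np.float32)
act_space = Box(low=-1.0, high=1.0, shape=(6,), dtype=np.float32)

x_ph, a_ph = placeholders_from_spaces(obs_space, act_space)
pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc = mlp_actor_critic(
    x_ph, a_ph, action_space=act_space)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    a, value, cost_value = sess.run(
        [pi, v, vc], feed_dict={x_ph: np.zeros((1, 17), np.float32)})
    print(a.shape, value, cost_value)   # sampled action plus reward- and cost-value estimates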
/policies/base_policy.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from collections import OrderedDict
3 |
4 | import numpy as np
5 |
6 | class BasePolicy:
7 | def __init__(self):
8 | self._deterministic = False
9 |
10 | def reset(self):
11 | """Reset and clean the policy."""
12 | raise NotImplementedError
13 |
14 | def actions(self, conditions):
15 | """Compute (symbolic) actions given conditions (observations)"""
16 | raise NotImplementedError
17 |
18 | def log_pis(self, conditions, actions):
19 | """Compute (symbolic) log probs for given observations and actions."""
20 | raise NotImplementedError
21 |
22 | def actions_np(self, conditions):
23 | """Compute (numeric) actions given conditions (observations)"""
24 | raise NotImplementedError
25 |
26 | def log_pis_np(self, conditions, actions):
27 | """Compute (numeric) log probs for given observations and actions."""
28 | raise NotImplementedError
29 |
30 | def get_diagnostics(self, conditions):
31 | """Return diagnostic information of the policy.
32 |
33 | Arguments:
34 | conditions: Observations to run the diagnostics for.
35 | Returns:
36 | diagnostics: OrderedDict of diagnostic information.
37 | """
38 | diagnostics = OrderedDict({})
39 | return diagnostics
--------------------------------------------------------------------------------
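For illustration only, a hypothetical minimal subclass that satisfies the numeric part of this interface (a uniform-random policy; CPOPolicy is the implementation actually used in this repo):

import numpy as np
from policies.base_policy import BasePolicy

class RandomPolicy(BasePolicy):
    def __init__(self, act_space):
        super().__init__()
        self._act_space = act_space

    def reset(self):
        pass  # nothing to clean up

    def actions_np(self, conditions):
        # one uniformly sampled action per observation in the batch
        return np.stack([self._act_space.sample() for _ in range(len(conditions))])

    def log_pis_np(self, conditions, actions):
        # uniform density over the action box
        vol = np.prod(self._act_space.high - self._act_space.low)
        return np.full(len(actions), -np.log(vol))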
/policies/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 |
3 | def get_cpo_policy(env, session, *args, **kwargs):
4 | from policies.cpo_policy import CPOPolicy
5 | policy = CPOPolicy(
6 | obs_space=env.observation_space,
7 | act_space=env.action_space,
8 | session=session,
9 | *args,
10 | **kwargs)
11 | return policy
12 |
13 | POLICY_FUNCTIONS = {
14 | 'cpopolicy': get_cpo_policy,
15 | }
16 |
17 |
18 | def get_policy(policy_type, *args, **kwargs):
19 | return POLICY_FUNCTIONS[policy_type](*args, **kwargs)
20 |
21 | def get_policy_from_params(params, env, *args, **kwargs):
22 | policy_params = params['policy_params']
23 | policy_type = policy_params['type']
24 | policy_kwargs = deepcopy(policy_params['kwargs'])
25 |
26 | policy = POLICY_FUNCTIONS[policy_type](
27 | env,
28 | *args,
29 | **policy_kwargs,
30 | **kwargs)
31 |
32 | return policy
33 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | conda-env-export==0.3.2
2 | dotmap==1.3.8
3 | gtimer==1.0.0b5
4 | gym==0.18.0
5 | joblib==0.14.1
6 | mkl-fft==1.2.0
7 | mkl-random==1.1.0
8 | mkl-service==2.3.0
9 | mujoco-py==2.0.2.13
10 | olefile==0.46
11 | pyOpenSSL==19.1.0
12 | PySocks==1.7.1
13 | PyYAML==5.4.1
14 | ray==0.6.4
15 | sip==4.19.24
16 | tensorflow==1.14.0
17 | tornado==6.0.4
18 |
--------------------------------------------------------------------------------
/samplers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anyboby/Constrained-Model-Based-Policy-Optimization/7ec7529c2ece9caa13e15abef145f6a0e1d22a63/samplers/__init__.py
--------------------------------------------------------------------------------
/samplers/base_sampler.py:
--------------------------------------------------------------------------------
1 | from collections import deque, OrderedDict
2 | from itertools import islice
3 |
4 |
5 | class BaseSampler(object):
6 | def __init__(self,
7 | max_path_length,
8 | min_pool_size,
9 | batch_size,
10 | store_last_n_paths=10,
11 | preprocess_type='default'):
12 | self._max_path_length = max_path_length
13 | self._min_pool_size = min_pool_size
14 | self._batch_size = batch_size
15 | self._store_last_n_paths = store_last_n_paths
16 | self._last_n_paths = deque(maxlen=store_last_n_paths)
17 | self._obs_process_type = preprocess_type
18 | self.env = None
19 | self.policy = None
20 | self.pool = None
21 |
22 | def initialize(self, env, policy, pool):
23 | self.env = env
24 | self.policy = policy
25 | self.pool = pool
26 |
27 | def set_policy(self, policy):
28 | self.policy = policy
29 |
30 | def clear_last_n_paths(self):
31 | self._last_n_paths.clear()
32 |
33 | def get_last_n_paths(self, n=None):
34 | if n is None:
35 | n = self._store_last_n_paths
36 |
37 | last_n_paths = tuple(islice(self._last_n_paths, None, n))
38 |
39 | return last_n_paths
40 |
41 | def sample(self):
42 | raise NotImplementedError
43 |
44 | def batch_ready(self):
45 | enough_samples = self.pool.size >= self._min_pool_size
46 | return enough_samples
47 |
48 | def random_batch(self, batch_size=None, **kwargs):
49 | batch_size = batch_size or self._batch_size
50 | return self.pool.random_batch(batch_size, **kwargs)
51 |
52 | def terminate(self):
53 | self.env.close()
54 |
55 | def get_diagnostics(self):
56 | diagnostics = OrderedDict({'pool-size': self.pool.size})
57 | return diagnostics
58 |
59 | def __getstate__(self):
60 | state = {
61 | key: value for key, value in self.__dict__.items()
62 | if key not in ('env', 'policy', 'pool')
63 | }
64 |
65 | return state
66 |
67 | def __setstate__(self, state):
68 | self.__dict__.update(state)
69 |
70 | self.env = None
71 | self.policy = None
72 | self.pool = None
73 |
--------------------------------------------------------------------------------
/samplers/simple_sampler.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 |
4 |
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 |
8 | from samplers.base_sampler import BaseSampler
9 |
10 | class SimpleSampler(BaseSampler):
11 | def __init__(self, **kwargs):
12 | super(SimpleSampler, self).__init__(**kwargs)
13 |
14 | self._path_length = 0
15 | self._path_return = 0
16 | self._current_path = defaultdict(list)
17 | self._last_path_return = 0
18 | self._max_path_return = -np.inf
19 | self._n_episodes = 0
20 | self._current_observation = None
21 | self._total_samples = 0
22 | self._last_action = None
23 |
24 | def _process_observations(self,
25 | observation,
26 | action,
27 | reward,
28 | cost,
29 | terminal,
30 | next_observation,
31 | info):
32 |
33 | processed_observation = {
34 | 'observations': observation,
35 | 'actions': action,
36 | 'rewards': [reward],
37 | 'cost' : [cost],
38 | 'terminals': [terminal],
39 | 'next_observations': next_observation,
40 | 'infos': info,
41 | }
42 |
43 | return processed_observation
44 |
45 | def sample(self):
46 | if self._current_observation is None:
47 | self._current_observation = np.squeeze(self.env.reset())
48 | self._last_action = np.zeros(shape=self.env.action_space.shape)
49 |
50 | action = self.policy.actions_np(
51 | self.env.convert_to_active_observation(
52 | self._current_observation)[None]
53 | )[0]
54 |
55 | next_observation, reward, terminal, info = self.env.step(action)
56 | next_observation = np.squeeze(next_observation)
57 | reward = np.squeeze(reward)
58 | terminal = np.squeeze(terminal)
59 | cost = info.get('cost', 0)
60 |
61 | self._path_length += 1
62 | self._path_return += reward
63 | self._total_samples += 1
64 |
65 | processed_sample = self._process_observations(
66 | observation=self._current_observation,
67 | action=action,
68 | reward=reward,
69 | cost=cost,
70 | terminal=terminal,
71 | next_observation=next_observation,
72 | info=info,
73 | )
74 |
75 | for key, value in processed_sample.items():
76 | self._current_path[key].append(value)
77 |
78 |         #### add the path to the pool only once it terminates or reaches max_path_length
79 | if terminal or self._path_length >= self._max_path_length:
80 | last_path = {
81 | field_name: np.array(values)
82 | for field_name, values in self._current_path.items()
83 | }
84 |
85 | self.pool.add_path(last_path)
86 | self._last_n_paths.appendleft(last_path)
87 |
88 | self._max_path_return = max(self._max_path_return,
89 | self._path_return)
90 | self._last_path_return = self._path_return
91 |
92 | self.policy.reset()
93 | self._current_observation = None
94 | self._path_length = 0
95 | self._path_return = 0
96 | self._current_path = defaultdict(list)
97 | self._last_action = np.zeros(shape=self.env.action_space.shape)
98 | self._n_episodes += 1
99 | else:
100 | self._current_observation = next_observation
101 | self._last_action = action
102 |
103 | return next_observation, reward, terminal, info
104 |
105 | def random_batch(self, batch_size=None, **kwargs):
106 | batch_size = batch_size or self._batch_size
107 | observation_keys = getattr(self.env, 'observation_keys', None)
108 |
109 | return self.pool.random_batch(
110 | batch_size, observation_keys=observation_keys, **kwargs)
111 |
112 | def get_diagnostics(self):
113 | diagnostics = super(SimpleSampler, self).get_diagnostics()
114 | diagnostics.update({
115 | 'max-path-return': self._max_path_return,
116 | 'last-path-return': self._last_path_return,
117 | 'episodes': self._n_episodes,
118 | 'total-samples': self._total_samples,
119 | })
120 |
121 | return diagnostics
122 |
123 |
--------------------------------------------------------------------------------
/samplers/utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 |
3 | import numpy as np
4 |
5 | def get_cposampler(*args, **kwargs):
6 | from samplers.cpo_sampler import CpoSampler
7 | sampler = CpoSampler(
8 | *args,
9 | **kwargs)
10 |
11 | return sampler
12 |
13 | SAMPLERS_FUNCTIONS = {
14 | 'CPOSampler' : get_cposampler,
15 | }
16 |
17 |
18 | def get_sampler_from_params(params, *args, **kwargs):
19 |
20 | sampler_params = params['sampler_params']
21 | sampler_type = sampler_params['type']
22 |
23 | sampler_args = deepcopy(sampler_params.get('args', ()))
24 | sampler_kwargs = deepcopy(sampler_params.get('kwargs', {}))
25 |
26 | sampler = SAMPLERS_FUNCTIONS[sampler_type](
27 | *sampler_args, *args, **sampler_kwargs, **kwargs)
28 |
29 | return sampler
--------------------------------------------------------------------------------
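A sketch of the dispatch pattern these factories implement. To stay self-contained it registers SimpleSampler under a new key (purely illustrative; the configs in this repo use 'CPOSampler', whose kwargs may differ), and the kwargs shown mirror BaseSampler's signature. It also assumes matplotlib is installed, since simple_sampler imports it:

from samplers.utils import SAMPLERS_FUNCTIONS, get_sampler_from_params
from samplers.simple_sampler import SimpleSampler

# register an extra factory just for this example
SAMPLERS_FUNCTIONS['SimpleSampler'] = lambda *args, **kwargs: SimpleSampler(*args, **kwargs)

params = {'sampler_params': {
    'type': 'SimpleSampler',
    'kwargs': dict(max_path_length=1000, min_pool_size=1000, batch_size=256),
}}
sampler = get_sampler_from_params(params)   # still needs sampler.initialize(env, policy, pool)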
/scripts/console_scripts.py:
--------------------------------------------------------------------------------
1 | """A command line interface that exposes the softlearning examples to the user.
2 |
3 | This package exposes the functions in examples.instrument module to the user
4 | through a cli, which allows seamless runs of examples in different modes (e.g.
5 | locally, in google compute engine, or ec2).
6 |
7 |
8 | There are two types of cli commands in this file (each has a corresponding
9 | function in examples.instrument):
10 | 1. run_example_* methods, which run the experiments by invoking
11 | `tune.run_experiments` function.
12 |     2. launch_example_* methods, which are helper functions to submit an
13 | example to be run in the cloud. In practice, these launch a cluster,
14 | and then run the `run_example_cluster` method with the provided
15 | arguments and options.
16 | """
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import logging
23 |
24 | import click
25 |
26 | from utilities.instrument import (
27 | run_example_dry,
28 | run_example_local,
29 | run_example_debug,
30 | run_example_cluster,
31 | launch_example_cluster,
32 | launch_example_gce,
33 | launch_example_ec2)
34 |
35 |
36 | logging.basicConfig(level=logging.INFO)
37 | logger = logging.getLogger(__name__)
38 | logger.setLevel(logging.INFO)
39 |
40 | def add_options(options):
41 | def decorator(f):
42 | for option in options[::-1]:
43 | click.decorators._param_memo(f, option)
44 | return f
45 | return decorator
46 |
47 |
48 | @click.group()
49 | def cli():
50 | pass
51 |
52 |
53 | @cli.command(
54 | name='run_example_dry',
55 | context_settings={'ignore_unknown_options': True})
56 | @click.argument("example_module_name", required=True, type=str)
57 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED)
58 | def run_example_dry_cmd(example_module_name, example_argv):
59 | """Print the variant spec and related information of an example."""
60 | return run_example_dry(example_module_name, example_argv)
61 |
62 |
63 | @cli.command(
64 | name='run_local',
65 | context_settings={'ignore_unknown_options': True})
66 | @click.argument("example_module_name", required=True, type=str)
67 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED)
68 | def run_example_local_cmd(example_module_name, example_argv):
69 | """Run example locally, potentially parallelizing across cpus/gpus."""
70 | return run_example_local(example_module_name, example_argv)
71 |
72 |
73 | @cli.command(
74 | name='run_example_debug',
75 | context_settings={'ignore_unknown_options': True})
76 | @click.argument("example_module_name", required=True, type=str)
77 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED)
78 | def run_example_debug_cmd(example_module_name, example_argv):
79 | """The debug mode limits tune trial runs to enable use of debugger."""
80 | return run_example_debug(example_module_name, example_argv)
81 |
82 | @cli.command(
83 | name='run_example_cluster',
84 | context_settings={'ignore_unknown_options': True})
85 |
86 | @click.argument("example_module_name", required=True, type=str)
87 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED)
88 | def run_example_cluster_cmd(example_module_name, example_argv):
89 | """Run example on cluster mode.
90 |
91 |     This function is very similar to the local mode, except that it
92 | correctly sets the redis address to make ray/tune work on a cluster.
93 | """
94 | run_example_cluster(example_module_name, example_argv)
95 |
96 | @cli.command(
97 | name='launch_example_cluster',
98 | context_settings={
99 | 'allow_extra_args': True,
100 | 'ignore_unknown_options': True
101 | })
102 | @click.argument("example_module_name", required=True, type=str)
103 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED)
104 | @click.option(
105 | "--config_file",
106 | required=False,
107 | type=str)
108 | @click.option(
109 | "--stop/--no-stop",
110 | is_flag=True,
111 | default=True,
112 | help="Stop the cluster after the command finishes running.")
113 | @click.option(
114 | "--start/--no-start",
115 | is_flag=True,
116 | default=True,
117 | help="Start the cluster if needed.")
118 | @click.option(
119 | "--screen/--no-screen",
120 | is_flag=True,
121 | default=False,
122 | help="Run the command in a screen.")
123 | @click.option(
124 | "--tmux/--no-tmux",
125 | is_flag=True,
126 | default=True,
127 | help="Run the command in tmux.")
128 | @click.option(
129 | "--override-cluster-name",
130 | required=False,
131 | type=str,
132 | help="Override the configured cluster name.")
133 | @click.option(
134 | "--port-forward", required=False, type=int, help="Port to forward.")
135 | def launch_example_cluster_cmd(*args, **kwargs):
136 | """Launches the example on autoscaled ray cluster through ray exec_cmd.
137 |
138 | This handles basic validation and sanity checks for the experiment, and
139 | then executes the command on autoscaled ray cluster. If necessary, it will
140 | also fill in more useful defaults for our workflow (i.e. for tmux and
141 | override_cluster_name).
142 | """
143 | return launch_example_cluster(*args, **kwargs)
144 |
145 |
146 | @cli.command(
147 | name='launch_example_gce',
148 | context_settings={
149 | 'allow_extra_args': True,
150 | 'ignore_unknown_options': True
151 | })
152 | @add_options(launch_example_cluster_cmd.params)
153 | def launch_example_gce_cmd(*args, **kwargs):
154 | """Forwards call to `launch_example_cluster` after adding gce defaults.
155 |
156 | This optionally sets the ray autoscaler configuration file to the default
157 | gce configuration file, and then calls `launch_example_cluster` to
158 | execute the original command on autoscaled gce cluster by parsing the args.
159 |
160 | See `launch_example_cluster` for further details.
161 | """
162 | return launch_example_gce(*args, **kwargs)
163 |
164 |
165 | @cli.command(
166 | name='launch_example_ec2',
167 | context_settings={
168 | 'allow_extra_args': True,
169 | 'ignore_unknown_options': True
170 | })
171 | @add_options(launch_example_cluster_cmd.params)
172 | def launch_example_ec2_cmd(*args, **kwargs):
173 | """Forwards call to `launch_example_cluster` after adding ec2 defaults.
174 |
175 | This optionally sets the ray autoscaler configuration file to the default
176 | ec2 configuration file, and then calls `launch_example_cluster` to
177 | execute the original command on autoscaled ec2 cluster by parsing the args.
178 |
179 | See `launch_example_cluster` for further details.
180 | """
181 | return launch_example_ec2(*args, **kwargs)
182 |
183 | cli.add_command(run_example_local_cmd)
184 | cli.add_command(run_example_dry_cmd)
185 | cli.add_command(run_example_cluster_cmd)
186 |
187 | # Alias for run_example_local
188 | cli.add_command(run_example_local_cmd, name='launch_example_local')
189 | # Alias for run_example_dry
190 | cli.add_command(run_example_dry_cmd, name='launch_example_dry')
191 | # Alias for run_example_debug
192 | cli.add_command(run_example_debug_cmd, name='launch_example_debug')
193 | cli.add_command(launch_example_cluster_cmd)
194 | cli.add_command(launch_example_gce_cmd)
195 | cli.add_command(launch_example_ec2_cmd)
196 |
197 |
198 | def main():
199 | return cli()
200 |
201 |
202 | if __name__ == "__main__":
203 | main()
204 |
--------------------------------------------------------------------------------
/scripts/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import glob
4 | import pickle
5 | import sys
6 | import pdb
7 | import importlib
8 | from dotmap import DotMap
9 |
10 | import tensorflow as tf
11 | import ray
12 | from ray import tune
13 | from ray.autoscaler.commands import exec_cluster
14 |
15 | from envs.utils import get_env_from_params
16 | from algorithms.utils import get_algorithm_from_params
17 | from policies.utils import get_policy_from_params
18 | from buffers.utils import get_buffer_from_params
19 | from samplers.utils import get_sampler_from_params
20 | from utilities.utils import set_seed, initialize_tf_variables
21 | from utilities.instrument import create_trial_name_creator
22 |
23 | class SimpleExperiment(tune.Trainable):
24 | def _setup(self, params):
25 | self._params = params
26 |
27 | #### set up tf session
28 | set_seed(params['run_params']['seed'])
29 | gpu_options = tf.GPUOptions(allow_growth=True)
30 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
31 | tf.keras.backend.set_session(session)
32 |
33 | self._session = tf.keras.backend.get_session()
34 | self.train_generator = None
35 | self._built = False
36 |
37 | def _stop(self):
38 | tf.reset_default_graph()
39 | tf.keras.backend.clear_session()
40 |
41 | def _build(self):
42 | """
43 | called by tune to build algorithm
44 | """
45 |
46 | #### set up building blocks for algorithm
47 | params = copy.deepcopy(self._params)
48 | env_params = params['environment_params']
49 | env = self.env = (
50 | get_env_from_params(env_params))
51 |
52 | buffer = self.buffer = (
53 | get_buffer_from_params(params, env))
54 |
55 | sampler = self.sampler = get_sampler_from_params(params)
56 |
57 | policy = self.policy = get_policy_from_params(
58 | params, env, self._session)
59 |
60 | #### build algorithm
61 | self.algorithm = get_algorithm_from_params(
62 | variant=self._params,
63 | env=env,
64 | policy=policy,
65 | buffer=buffer,
66 | sampler=sampler,
67 | session=self._session)
68 |
69 | #### finalize graph
70 | initialize_tf_variables(self._session, only_uninitialized=True)
71 | tf.get_default_graph().finalize()
72 |
73 | #### set train generator function
74 | self.train_generator = self.algorithm.train()
75 | self._built = True
76 |
77 | def _train(self):
78 | if not self._built:
79 | self._build()
80 |
81 | diagnostics = next(self.train_generator)
82 | return diagnostics
83 |
84 | def main(argv=None):
85 | """
86 |     Run a simple ray tune experiment.
87 | 
88 |     Expects the path to a config file as the first command line argument.
89 | 
90 | 
91 |     """
92 |     assert argv, "Please provide a config file location as the first argument."
93 |
94 | #### create
95 | base_module = 'configs.baseconfig'
96 | base_module = importlib.import_module(base_module)
97 |
98 | #### tune configs
99 | trial_name_template = 'seed:{trial.config[run_params][seed]}'
100 | trial_name_creator = create_trial_name_creator(trial_name_template) ## generator for trial name (determines logdir)
101 | gpus=1 ## gpus to be used
102 | trial_gpus=1 ## gpus to be used in trial
103 | mode='local' ## local or remote, currently only local supported
104 |
105 | config=str(argv[0]) ## config file location
106 |
107 | exp_config = DotMap(dict(
108 | gpus=gpus,
109 | trial_gpus=trial_gpus,
110 | mode=mode,
111 | config=config,
112 | ))
113 |
114 | ### build the experiment
115 | exp_config = base_module.get_variant_spec(exp_config) ## merge base config and config file to final config
116 | exp_id = exp_config.get('exp_name') ## name of the experiment
117 | exp_class = SimpleExperiment ## tune trainable class that runs the experiments
118 | local_dir = os.path.join(exp_config.get('log_dir'), exp_config.get('task')) ## directory for tf summaries, configs etc.
119 |
120 | ### define experiment
121 | experiment = {
122 | exp_id:{
123 | 'run': exp_class,
124 | 'config': exp_config,
125 | 'local_dir': local_dir,
126 | 'trial_name_creator': trial_name_creator,
127 | }
128 | }
129 |
130 |     ### initialize ray and run experiments
131 | ray.init(
132 | num_gpus=gpus,
133 | local_mode=True,
134 | object_store_memory=100 * 1024 * 1024, #@anyboby TODO: test the memory config
135 | )
136 |
137 | tune.run_experiments(
138 | experiment,
139 | server_port=4321,
140 | )
141 |
142 | if __name__ == '__main__':
143 | main(argv=sys.argv[1:])
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from setuptools import find_packages
3 |
4 | setup(
5 | name='cmbpo',
6 | packages=find_packages(),
7 | version='0.0.1',
8 | description='Constrained Model-based policy optimization',
9 | long_description=open('./README.md').read(),
10 | author='Moritz Zanger',
11 | author_email='zanger.moritz@gmail.com',
12 | entry_points={
13 | 'console_scripts': (
14 | 'cmbpo=scripts.console_scripts:main',
15 | )
16 | },
17 | requires=(),
18 | zip_safe=True,
19 | license='MIT'
20 | )
21 |
--------------------------------------------------------------------------------
/utilities/logging.py:
--------------------------------------------------------------------------------
1 | import time
2 | import math
3 | import pdb
4 |
5 |
6 |
7 | def update_dict(dict_a, dict_b, weight_a=.5, weight_b=.5):
8 | """
9 |     Creates a new dict that merges dict_b into dict_a; entries present in both
10 |     are combined as weight_a*dict_a[k] + weight_b*dict_b[k] (with both weights equal to 1 they are summed).
11 | """
12 | dict_a_cp = dict(dict_a)
13 | dict_a_cp.update(dict_b)
14 | for k,v in dict_b.items():
15 | if k in dict_a.keys():
16 | dict_a_cp[k] = weight_b*dict_b[k] + weight_a*dict_a[k]
17 | return dict_a_cp
18 |
19 | class Progress:
20 |
21 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100):
22 | self.total = total
23 | self.name = name
24 | self.ncol = ncol
25 | self.max_length = max_length
26 | self.indent = indent
27 | self.line_width = line_width
28 | self._speed_update_freq = speed_update_freq
29 |
30 | self._step = 0
31 | self._prev_line = '\033[F'
32 | self._clear_line = ' ' * self.line_width
33 |
34 | self._pbar_size = self.ncol * self.max_length
35 | self._complete_pbar = '#' * self._pbar_size
36 | self._incomplete_pbar = ' ' * self._pbar_size
37 |
38 | self.lines = ['']
39 | self.fraction = '{} / {}'.format(0, self.total)
40 |
41 | self.resume()
42 |
43 |
44 | def update(self, n=1):
45 | self._step += n
46 | if self._step % self._speed_update_freq == 0:
47 | self._time0 = time.time()
48 | self._step0 = self._step
49 |
50 | def resume(self):
51 | self._skip_lines = 1
52 | print('\n', end='')
53 | self._time0 = time.time()
54 | self._step0 = self._step
55 |
56 | def pause(self):
57 | self._clear()
58 | self._skip_lines = 1
59 |
60 | def set_description(self, params=[]):
61 |
62 | ############
63 | # Position #
64 | ############
65 | self._clear()
66 |
67 | ###########
68 | # Percent #
69 | ###########
70 | percent, fraction = self._format_percent(self._step, self.total)
71 | self.fraction = fraction
72 |
73 | #########
74 | # Speed #
75 | #########
76 | speed = self._format_speed(self._step)
77 |
78 | ##########
79 | # Params #
80 | ##########
81 | num_params = len(params)
82 | nrow = math.ceil(num_params / self.ncol)
83 | params_split = self._chunk(params, self.ncol)
84 | params_string, lines = self._format(params_split)
85 | self.lines = lines
86 |
87 |
88 | description = '{} | {}{}'.format(percent, speed, params_string)
89 | print(description)
90 | self._skip_lines = nrow + 1
91 |
92 | def append_description(self, descr):
93 | self.lines.append(descr)
94 |
95 | def _clear(self):
96 | position = self._prev_line * self._skip_lines
97 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)])
98 | print(position, end='')
99 | print(empty)
100 | print(position, end='')
101 |
102 | def _format_percent(self, n, total):
103 | if total:
104 | percent = n / float(total)
105 |
106 | complete_entries = int(percent * self._pbar_size)
107 | incomplete_entries = self._pbar_size - complete_entries
108 |
109 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries]
110 | fraction = '{} / {}'.format(n, total)
111 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100))
112 | else:
113 | fraction = '{}'.format(n)
114 | string = '{} iterations'.format(n)
115 | return string, fraction
116 |
117 | def _format_speed(self, n):
118 | num_steps = n - self._step0
119 | t = time.time() - self._time0
120 | speed = num_steps / t
121 | string = '{:.1f} Hz'.format(speed)
122 | if num_steps > 0:
123 | self._speed = string
124 | return string
125 |
126 | def _chunk(self, l, n):
127 | return [l[i:i+n] for i in range(0, len(l), n)]
128 |
129 | def _format(self, chunks):
130 | lines = [self._format_chunk(chunk) for chunk in chunks]
131 | lines.insert(0,'')
132 | padding = '\n' + ' '*self.indent
133 | string = padding.join(lines)
134 | return string, lines
135 |
136 | def _format_chunk(self, chunk):
137 | line = ' | '.join([self._format_param(param) for param in chunk])
138 | return line
139 |
140 | def _format_param(self, param):
141 | k, v = param
142 | return '{} : {}'.format(k, v)[:self.max_length]
143 |
144 | def stamp(self):
145 | if self.lines != ['']:
146 | params = ' | '.join(self.lines)
147 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed)
148 | self._clear()
149 | print(string, end='\n')
150 | self._skip_lines = 1
151 | else:
152 | self._clear()
153 | self._skip_lines = 0
154 |
155 | def close(self):
156 | self.pause()
157 |
158 | class Silent:
159 |
160 | def __init__(self, *args, **kwargs):
161 | pass
162 |
163 | def __getattr__(self, attr):
164 | return lambda *args: None
165 |
166 |
167 | if __name__ == '__main__':
168 | silent = Silent()
169 | silent.update()
170 | silent.stamp()
171 |
172 | num_steps = 1000
173 | progress = Progress(num_steps)
174 | for i in range(num_steps):
175 | progress.update()
176 | params = [
177 | ['A', '{:06d}'.format(i)],
178 | ['B', '{:06d}'.format(i)],
179 | ['C', '{:06d}'.format(i)],
180 | ['D', '{:06d}'.format(i)],
181 | ['E', '{:06d}'.format(i)],
182 | ['F', '{:06d}'.format(i)],
183 | ['G', '{:06d}'.format(i)],
184 | ['H', '{:06d}'.format(i)],
185 | ]
186 | progress.set_description(params)
187 | time.sleep(0.01)
188 | progress.close()
189 |
--------------------------------------------------------------------------------
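A tiny worked example of update_dict's weighted merge (illustrative values, assuming the repo is importable):

from utilities.logging import update_dict

a = {'loss': 2.0, 'steps': 100}
b = {'loss': 1.0, 'kl': 0.01}

print(update_dict(a, b))                               # {'loss': 1.5, 'steps': 100, 'kl': 0.01}
print(update_dict(a, b, weight_a=1.0, weight_b=1.0))   # shared keys are summed: loss -> 3.0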
/utilities/mpi_tf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from mpi4py import MPI
4 | from utilities.mpi_tools import broadcast
5 |
6 |
7 | def flat_concat(xs):
8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0)
9 |
10 | def assign_params_from_flat(x, params):
11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars
12 | splits = tf.split(x, [flat_size(p) for p in params])
13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)]
14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)])
15 |
16 | def sync_params(params):
17 | get_params = flat_concat(params)
18 | def _broadcast(x):
19 | broadcast(x)
20 | return x
21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32)
22 | return assign_params_from_flat(synced_params, params)
23 |
24 | def sync_all_params():
25 | """Sync all tf variables across MPI processes."""
26 | return sync_params(tf.global_variables())
27 |
28 |
29 | class MpiAdamOptimizer(tf.train.AdamOptimizer):
30 | """
31 | Adam optimizer that averages gradients across MPI processes.
32 |
33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_.
34 | For documentation on method arguments, see the Tensorflow docs page for
35 | the base `AdamOptimizer`_.
36 |
37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py
38 | .. _`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
39 | """
40 |
41 | def __init__(self, **kwargs):
42 | self.comm = MPI.COMM_WORLD
43 | tf.train.AdamOptimizer.__init__(self, **kwargs)
44 |
45 | def compute_gradients(self, loss, var_list, **kwargs):
46 | """
47 | Same as normal compute_gradients, except average grads over processes.
48 | """
49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs)
50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
51 | flat_grad = flat_concat([g for g, v in grads_and_vars])
52 | shapes = [v.shape.as_list() for g, v in grads_and_vars]
53 | sizes = [int(np.prod(s)) for s in shapes]
54 |
55 | num_tasks = self.comm.Get_size()
56 | buf = np.zeros(flat_grad.shape, np.float32)
57 |
58 | def _collect_grads(flat_grad):
59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
60 | np.divide(buf, float(num_tasks), out=buf)
61 | return buf
62 |
63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
64 | avg_flat_grad.set_shape(flat_grad.shape)
65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
67 | for g, (_, v) in zip(avg_grads, grads_and_vars)]
68 |
69 | return avg_grads_and_vars
70 |
71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
72 | """
73 | Same as normal apply_gradients, except sync params after update.
74 | """
75 | opt = super().apply_gradients(grads_and_vars, global_step, name)
76 | with tf.control_dependencies([opt]):
77 | sync = sync_params([v for g,v in grads_and_vars])
78 | return tf.group([opt, sync])
--------------------------------------------------------------------------------
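A minimal graph-mode sketch of how these helpers are meant to be wired together (hypothetical loss; it also runs as a single process, where the MPI reductions simply act on one rank):

import tensorflow as tf
from utilities.mpi_tf import MpiAdamOptimizer, sync_all_params

x = tf.placeholder(tf.float32, [None, 4])
w = tf.get_variable('w', [4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

# gradients are averaged across MPI ranks; parameters stay in sync after each update
train_op = MpiAdamOptimizer(learning_rate=1e-3).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(sync_all_params())   # start every rank from identical weights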
/utilities/mpi_tools.py:
--------------------------------------------------------------------------------
1 |
2 | from mpi4py import MPI
3 | import os, subprocess, sys
4 | import numpy as np
5 |
6 |
7 | def mpi_fork(n, bind_to_core=False):
8 | """
9 | Re-launches the current script with workers linked by MPI.
10 |
11 | Also, terminates the original process that launched it.
12 |
13 | Taken almost without modification from the Baselines function of the
14 | `same name`_.
15 |
16 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py
17 |
18 | Args:
19 | n (int): Number of process to split into.
20 |
21 | bind_to_core (bool): Bind each MPI process to a core.
22 | """
23 | if n<=1:
24 | return
25 | if os.getenv("IN_MPI") is None:
26 | env = os.environ.copy()
27 | env.update(
28 | MKL_NUM_THREADS="1",
29 | OMP_NUM_THREADS="1",
30 | IN_MPI="1"
31 | )
32 | args = ["mpirun", "-np", str(n)]
33 | if bind_to_core:
34 | args += ["-bind-to", "core"]
35 | args += [sys.executable] + sys.argv
36 | subprocess.check_call(args, env=env)
37 | sys.exit()
38 |
39 |
40 | def msg(m, string=''):
41 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m))
42 |
43 | def proc_id():
44 | """Get rank of calling process."""
45 | return MPI.COMM_WORLD.Get_rank()
46 |
47 | def allreduce(*args, **kwargs):
48 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
49 |
50 | def num_procs():
51 | """Count active MPI processes."""
52 | return MPI.COMM_WORLD.Get_size()
53 |
54 | def broadcast(x, root=0):
55 | MPI.COMM_WORLD.Bcast(x, root=root)
56 |
57 | def mpi_op(x, op):
58 | x, scalar = ([x], True) if np.isscalar(x) else (x, False)
59 | x = np.asarray(x, dtype=np.float32)
60 | buff = np.zeros_like(x, dtype=np.float32)
61 | allreduce(x, buff, op=op)
62 | return buff[0] if scalar else buff
63 |
64 | def mpi_sum(x):
65 | return mpi_op(x, MPI.SUM)
66 |
67 | def mpi_avg(x):
68 | """Average a scalar or vector over MPI processes."""
69 | return mpi_sum(x) / num_procs()
70 |
71 | def mpi_statistics_scalar(x, with_min_and_max=False):
72 | """
73 | Get mean/std and optional min/max of scalar x across MPI processes.
74 |
75 | Args:
76 | x: An array containing samples of the scalar to produce statistics
77 | for.
78 |
79 | with_min_and_max (bool): If true, return min and max of x in
80 | addition to mean and std.
81 | """
82 | x = np.array(x, dtype=np.float32)
83 | global_sum, global_n = mpi_sum([np.sum(x), len(x)])
84 | mean = global_sum / global_n
85 |
86 | global_sum_sq = mpi_sum(np.sum((x - mean)**2))
87 | std = np.sqrt(global_sum_sq / global_n) # compute global std
88 |
89 | if with_min_and_max:
90 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
91 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
92 | return mean, std, global_min, global_max
93 | return mean, std
--------------------------------------------------------------------------------
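A short usage sketch (assumes mpi4py and an mpirun binary are available; mpi_fork re-launches the current script under MPI and exits the original process):

import numpy as np
from utilities.mpi_tools import mpi_fork, mpi_statistics_scalar, proc_id, num_procs

mpi_fork(2)                              # relaunch this script with 2 MPI workers

x = np.arange(5) + 10 * proc_id()        # each rank holds different samples
mean, std, lo, hi = mpi_statistics_scalar(x, with_min_and_max=True)
if proc_id() == 0:
    print(num_procs(), mean, std, lo, hi)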
/utilities/serialization_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | def convert_json(obj):
4 | """ Convert obj to a version which can be serialized with JSON. """
5 | if is_json_serializable(obj):
6 | return obj
7 | else:
8 | if isinstance(obj, dict):
9 | serializables = {}
10 | for k,v in obj.items():
11 | if is_json_serializable(k) and is_json_serializable(v):
12 | serializables[convert_json(k)]=convert_json(v)
13 |
14 | return serializables
15 |
16 | elif isinstance(obj, tuple):
17 |             return tuple(convert_json(x) for x in obj)
18 |
19 | elif isinstance(obj, list):
20 | return [convert_json(x) for x in obj]
21 |
22 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__):
23 | return convert_json(obj.__name__)
24 |
25 | elif hasattr(obj,'__dict__') and obj.__dict__:
26 | obj_dict = {convert_json(k): convert_json(v)
27 | for k,v in obj.__dict__.items()}
28 | return {str(obj): obj_dict}
29 |
30 | return str(obj)
31 |
32 | def is_json_serializable(v):
33 | try:
34 | json.dumps(v)
35 | return True
36 |     except Exception:
37 | return False
--------------------------------------------------------------------------------
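A small example of what convert_json does with objects that json cannot serialize directly (callables are replaced by their __name__, anything else falls back to str):

from utilities.serialization_utils import convert_json

print(convert_json(print))                    # 'print'
print(convert_json({'lr': 3e-4, 'seed': 1}))  # already serializable, returned unchanged
print(convert_json(object()))                 # falls back to str(obj)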
/utilities/trust_region.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from utilities.utils import EPS
5 | """
6 | Tensorflow utilities for trust region optimization
7 | """
8 |
9 | def flat_concat(xs):
10 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0)
11 |
12 | def flat_grad(f, params):
13 | return flat_concat(tf.gradients(xs=params, ys=f))
14 |
15 | def hessian_vector_product(f, params):
16 | # for H = grad**2 f, compute Hx
17 | g = flat_grad(f, params)
18 | x = tf.placeholder(tf.float32, shape=g.shape)
19 | return x, flat_grad(tf.reduce_sum(g*x), params)
20 |
21 | def assign_params_from_flat(x, params):
22 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars
23 | splits = tf.split(x, [flat_size(p) for p in params])
24 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)]
25 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)])
26 |
27 |
28 | """
29 | Conjugate gradient
30 | """
31 |
32 | def cg(Ax, b, cg_iters=10):
33 | x = np.zeros_like(b)
34 | r = b.copy() # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
35 | p = r.copy()
36 | r_dot_old = np.dot(r,r)
37 | for _ in range(cg_iters):
38 | z = Ax(p)
39 | alpha = r_dot_old / (np.dot(p, z) + EPS)
40 | x += alpha * p
41 | r -= alpha * z
42 | r_dot_new = np.dot(r,r)
43 | p = r + (r_dot_new / r_dot_old) * p
44 | r_dot_old = r_dot_new
45 | return x
--------------------------------------------------------------------------------
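A quick numerical check of the conjugate gradient solver on a small symmetric positive-definite system; the callable plays the role that the Hessian-vector product plays in the trust-region step:

import numpy as np
from utilities.trust_region import cg

A = np.array([[4.0, 1.0],
              [1.0, 3.0]])
b = np.array([1.0, 2.0])

x = cg(lambda v: A @ v, b, cg_iters=10)
print(x)          # ~[0.0909, 0.6364]
print(A @ x - b)  # ~[0, 0]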