├── examples ├── .gitignore ├── media │ ├── maze_average.png │ ├── maze_individual.png │ ├── random_pos_average.png │ ├── random_force_average.png │ ├── random_force_individual.png │ ├── random_pos_individual.png │ └── random_exploration │ │ ├── maze_constrained.gif │ │ ├── maze_unconstrained.gif │ │ ├── random_pos_unconstrained.gif │ │ └── random_force_unconstrained.gif ├── data │ └── Nav2dFixedMaze-500T.npz ├── README.md ├── 01-constraints-from-demonstrations.md └── 02-constraints-from-scratch.md ├── ceres ├── scripts │ ├── .gitignore │ ├── train_ceres.py │ ├── plot_rewards.py │ └── play_policy.py ├── tools │ ├── __init__.py │ ├── plot │ │ ├── __init__.py │ │ ├── plot_config.py │ │ └── plot_logs.py │ ├── io │ │ ├── __init__.py │ │ ├── h5_helper.py │ │ └── extra_args.py │ └── math │ │ ├── __init__.py │ │ ├── qpsolver_quadprog.py │ │ ├── spherical_coordinates.py │ │ └── qpsolver.py ├── __init__.py ├── networks │ ├── __init__.py │ ├── network_saver_mlp.py │ └── network_saver.py ├── envs │ ├── resetter │ │ ├── __init__.py │ │ ├── resetter_env.py │ │ └── resetter_env_ceres.py │ ├── __init__.py │ ├── constrained │ │ ├── __init__.py │ │ ├── constrained_env_fixed.py │ │ ├── constrained_env_network.py │ │ └── constrained_env.py │ ├── nav2d │ │ ├── __init__.py │ │ ├── nav2d_force.py │ │ ├── nav2d_ceres.py │ │ └── obstacles.py │ └── ceres_env.py ├── constraints │ ├── __init__.py │ ├── constraint_config.py │ ├── constraint_network_mlp.py │ ├── constraint_loss.py │ └── constraint_network.py └── baselines │ ├── ceres │ ├── mlp_policy_saver.py │ ├── run_continuous.py │ ├── pposgd_ceres_helper.py │ └── pposgd_ceres.py │ ├── common │ ├── mpi_moments_select.py │ ├── mpi_select.py │ ├── mpi_adam_select.py │ └── plot_logs_baselines.py │ └── ppo1 │ ├── pposgd_simple.py │ └── pposgd_simple_helper.py ├── MAINTAINERS.txt ├── HEADER ├── setup.py ├── LICENSE ├── DCO1.1.txt ├── README.md ├── CONTRIBUTING.md └── CONDUCT.md /examples/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | -------------------------------------------------------------------------------- /ceres/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | demonstrations 2 | cnet 3 | cact 4 | -------------------------------------------------------------------------------- /MAINTAINERS.txt: -------------------------------------------------------------------------------- 1 | Maintainers 2 | 3 | Tu-Hoa Pham ph4m pham@jp.ibm.com 4 | -------------------------------------------------------------------------------- /examples/media/maze_average.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/maze_average.png -------------------------------------------------------------------------------- /examples/media/maze_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/maze_individual.png -------------------------------------------------------------------------------- /examples/data/Nav2dFixedMaze-500T.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/data/Nav2dFixedMaze-500T.npz -------------------------------------------------------------------------------- /examples/media/random_pos_average.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_pos_average.png -------------------------------------------------------------------------------- /examples/media/random_force_average.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_force_average.png -------------------------------------------------------------------------------- /examples/media/random_force_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_force_individual.png -------------------------------------------------------------------------------- /examples/media/random_pos_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_pos_individual.png -------------------------------------------------------------------------------- /examples/media/random_exploration/maze_constrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/maze_constrained.gif -------------------------------------------------------------------------------- /HEADER: -------------------------------------------------------------------------------- 1 | Copyright (c) <%= owner %> <%= years %>. All Rights Reserved. 2 | Project name: <%= name %> 3 | This project is licensed under the MIT License, see LICENSE 4 | -------------------------------------------------------------------------------- /examples/media/random_exploration/maze_unconstrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/maze_unconstrained.gif -------------------------------------------------------------------------------- /examples/media/random_exploration/random_pos_unconstrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/random_pos_unconstrained.gif -------------------------------------------------------------------------------- /examples/media/random_exploration/random_force_unconstrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/random_force_unconstrained.gif -------------------------------------------------------------------------------- /ceres/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .math import * 6 | from .io import * 7 | -------------------------------------------------------------------------------- /ceres/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .envs import * 6 | from .constraints import * 7 | from .tools import * 8 | -------------------------------------------------------------------------------- /ceres/tools/plot/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .plot_config import PlotConfig 6 | from .plot_logs import PlotLogs 7 | -------------------------------------------------------------------------------- /ceres/networks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .network_saver import NetworkSaver 6 | from .network_saver_mlp import NetworkSaverMLP 7 | 8 | -------------------------------------------------------------------------------- /ceres/tools/io/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .extra_args import ExtraArgs 6 | from .h5_helper import save_dict_as_h5, load_dict_from_h5 7 | -------------------------------------------------------------------------------- /ceres/envs/resetter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .resetter_env import ResetterEnv 6 | from .resetter_env_ceres import ResetterEnvCeres 7 | 8 | -------------------------------------------------------------------------------- /ceres/scripts/train_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | if __name__ == '__main__': 6 | from ceres.baselines.ceres.run_continuous import main 7 | main() 8 | -------------------------------------------------------------------------------- /ceres/envs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .resetter import * 6 | from .constrained import * 7 | from .ceres_env import CeresEnv 8 | from .nav2d import * 9 | -------------------------------------------------------------------------------- /ceres/scripts/plot_rewards.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | if __name__ == '__main__': 6 | from ceres.baselines.common.plot_logs_baselines import main 7 | main() 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='ceres', 6 | version='0.1.0', 7 | packages=['ceres'], 8 | install_requires=[ 9 | 'numpy', 10 | 'matplotlib', 11 | 'baselines', 12 | 'gym', 13 | 'h5py', 14 | 'pygame', 15 | 'quadprog', 16 | ], 17 | ) 18 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Constrained Exploration and Recovery from Experience Shaping - Examples 2 | 3 | 1. [Learning action space constraints from positive and negative demonstrations](01-constraints-from-demonstrations.md): fixed maze 4 | 2. [Learning action space constraints from scratch](02-constraints-from-scratch.md): random obstacles with position and force control 5 | -------------------------------------------------------------------------------- /ceres/tools/math/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .qpsolver import QPSolver 6 | from .qpsolver_quadprog import QPSolverQuadprog 7 | from .spherical_coordinates import SphericalCoordinates 8 | -------------------------------------------------------------------------------- /ceres/envs/constrained/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .constrained_env import ConstrainedEnv 6 | from .constrained_env_fixed import ConstrainedEnvFixed 7 | from .constrained_env_network import ConstrainedEnvNetwork 8 | 9 | -------------------------------------------------------------------------------- /ceres/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .constraint_config import ConstraintConfig 6 | from .constraint_demonstration import ConstraintDemonstration, ConstraintDemonstrationTrajectory, ConstraintDemonstrationBuffer 7 | from .constraint_network import ConstraintNetwork 8 | from .constraint_network_mlp import ConstraintNetworkMLP 9 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/mlp_policy_saver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from baselines.ppo1.mlp_policy import MlpPolicy 6 | from ceres.networks import NetworkSaverMLP 7 | 8 | class MlpPolicySaver(MlpPolicy, NetworkSaverMLP): 9 | 10 | ''' 11 | Baselines MlpPolicy with save / restore functions 12 | ''' 13 | 14 | def __init__(self, name, *args, session=None, **kwargs): 15 | MlpPolicy.__init__(self, name, *args, **kwargs) 16 | NetworkSaverMLP.__init__(self, network_id=name) 17 | -------------------------------------------------------------------------------- /ceres/envs/constrained/constrained_env_fixed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .constrained_env import ConstrainedEnv 6 | 7 | class ConstrainedEnvFixed(ConstrainedEnv): 8 | ''' 9 | A base class for constrained environments with fixed constraints. 10 | ''' 11 | 12 | def __init__(self, *args, **kwargs): 13 | super().__init__(*args, **kwargs) 14 | self.init_ineq_matrices() 15 | 16 | def update_ineq_matrices(self, state): 17 | ''' 18 | This purposely does nothing since ineq matrices only need to be built once 19 | ''' 20 | pass 21 | 22 | def init_ineq_matrices(self): 23 | ''' 24 | Define fixed ineq matrices here 25 | ''' 26 | raise NotImplementedError('Implement init_ineq_matrices in subclass {0}'.format(type(self))) 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT license 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from gym.envs.registration import register 6 | from .nav2d_ceres import * 7 | 8 | # We list all used Nav2d environments here and call them by '-v0' 9 | gwenv_list = [] 10 | gwenv_list.append('Nav2dPosCeres') 11 | gwenv_list.append('Nav2dPosFixedMazeCeres') 12 | gwenv_list.append('Nav2dPosRandomHolesCeres') 13 | gwenv_list.append('Nav2dForceCeres') 14 | gwenv_list.append('Nav2dForceFixedMazeCeres') 15 | gwenv_list.append('Nav2dForceRandomHolesCeres') 16 | gwenv_list.append('Nav2dPosFixedMazeCeres5N') 17 | gwenv_list.append('Nav2dPosRandomHolesCeres5N') 18 | gwenv_list.append('Nav2dForceRandomHolesCeres5N') 19 | 20 | for gwenv in gwenv_list: 21 | env = locals()[gwenv] 22 | register( 23 | id='{0}-v0'.format(gwenv), 24 | entry_point='ceres.envs.nav2d:{0}'.format(gwenv), 25 | max_episode_steps=env.max_episode_steps, 26 | reward_threshold=100.0, 27 | ) 28 | -------------------------------------------------------------------------------- /ceres/tools/plot/plot_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | class PlotConfig(object): 6 | ''' 7 | Plot parameters, optionally built from ExtraArgs objects 8 | ''' 9 | 10 | def __init__(self, n_average=201, timesteps_per_iteration=1024): 11 | self.n_average = n_average 12 | self.color_rewards_ind = 'b' 13 | self.color_rewards_avg = 'r' 14 | self.color_rewards_std = 'b' 15 | self.label_y = 'Reward' 16 | self.set_timesteps_per_iteration(timesteps_per_iteration) 17 | 18 | @classmethod 19 | def from_extra_args(cls, extra_args): 20 | plot_config = cls(n_average=extra_args.plot_average, 21 | timesteps_per_iteration=(extra_args.n_direct * extra_args.timesteps_per_actorbatch)) 22 | return plot_config 23 | 24 | def set_timesteps_per_iteration(self, timesteps_per_iteration): 25 | self.timesteps_per_iteration = timesteps_per_iteration 26 | if self.timesteps_per_iteration == 1: 27 | self.label_x_iterations = 'Timesteps' 28 | else: 29 | self.label_x_iterations = 'Iterations [{0} timesteps]'.format(self.timesteps_per_iteration) 30 | 31 | -------------------------------------------------------------------------------- /DCO1.1.txt: -------------------------------------------------------------------------------- 1 | Developer's Certificate of Origin 1.1 2 | 3 | By making a contribution to this project, I certify that: 4 | 5 | (a) The contribution was created in whole or in part by me and I 6 | have the right to submit it under the open source license 7 | indicated in the file; or 8 | 9 | (b) The contribution is based upon previous work that, to the best 10 | of my knowledge, is covered under an appropriate open source 11 | license and I have the right under that license to submit that 12 | work with modifications, whether created in whole or in part 13 | by me, under the same open source license (unless I am 14 | permitted to submit under a different license), as indicated 15 | in the file; or 16 | 17 | (c) The contribution was provided directly to me by some other 18 | person who certified (a), (b) or (c) and I have not modified 19 | it. 
20 | 21 | (d) I understand and agree that this project and the contribution 22 | are public and that a record of the contribution (including all 23 | personal information I submit with it, including my sign-off) is 24 | maintained indefinitely and may be redistributed consistent with 25 | this project or the open source license(s) involved. 26 | -------------------------------------------------------------------------------- /ceres/networks/network_saver_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | from .network_saver import NetworkSaver 7 | 8 | class NetworkSaverMLP(NetworkSaver): 9 | ''' 10 | A simple multilayer perceptron with save / restore functions 11 | ''' 12 | 13 | def build_model(self, observation, n_outputs, 14 | hidden_layers, 15 | kernel_initializer, 16 | activation_hidden): 17 | var_names_begin = [_v.name for _v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)] 18 | self.hidden_layers = hidden_layers 19 | assert len(self.hidden_layers) > 0 20 | self.dense_layers = [] 21 | last_layer = observation 22 | self.layer_names = [] 23 | for i_layer, layer_size in enumerate(self.hidden_layers): 24 | layer_name = '{0}dense_{1}'.format(self.tf_var_prefix, i_layer) 25 | self.layer_names.append(layer_name) 26 | dense_layer = tf.layers.dense(inputs=last_layer, units=layer_size, activation=None, kernel_initializer=kernel_initializer, name=layer_name) 27 | dense_layer = activation_hidden(dense_layer) 28 | self.dense_layers.append(dense_layer) 29 | last_layer = dense_layer 30 | layer_name = '{0}dense_{1}'.format(self.tf_var_prefix, 'output') 31 | self.layer_names.append(layer_name) 32 | output_layer = tf.layers.dense(inputs=last_layer, units=n_outputs, kernel_initializer=kernel_initializer, name=layer_name) 33 | self.dense_layers.append(output_layer) 34 | var_names_end = [_v.name for _v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)] 35 | self.network_var_names = [_v for _v in var_names_end if not (_v in var_names_begin)] 36 | return output_layer 37 | 38 | -------------------------------------------------------------------------------- /ceres/tools/io/h5_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | import h5py 7 | 8 | ''' 9 | Helper functions for writing and loading data in the HDF5 format 10 | ''' 11 | 12 | def save_dict_as_h5(d, path_save, confirm_overwrite=True, verbose=False): 13 | assert type(d) == dict, 'Invalid dictionary argument: {0}'.format(d) 14 | assert type(path_save) == str, 'Invalid path argument: {0}'.format(path_save) 15 | assert os.path.isdir(os.path.dirname(path_save)), 'Directory for save path does not exist: {0}'.format(path_save) 16 | if os.path.isfile(path_save): 17 | if confirm_overwrite: 18 | if input('File exists: {0}\nOverwrite?[y/N]\n'.format(path_save)).lower() != 'y': 19 | print('Cancel write') 20 | return False 21 | os.remove(path_save) 22 | # Check that no nested dictionary 23 | with h5py.File(path_save, 'w') as h5f: 24 | write_dict(h5f, d) 25 | if verbose: 26 | print('Wrote backup: {0}'.format(path_save)) 27 | return True 28 | 29 | def write_dict(h5f, d): 30 | for _k, _v in d.items(): 31 | if type(_v) == dict: 32 | grp = h5f.create_group(_k) 33 | write_dict(grp, _v) 34 | else: 35 | h5f.create_dataset(_k, data=_v) 36 | 37 | def load_dict_from_h5(path_save, verbose=False): 38 | d = {} 39 | with h5py.File(path_save, 'r') as h5f: 40 | read_dict(h5f, d) 41 | if verbose: 42 | print('Loaded {0} from backup: {0}'.format(','.join(d.keys()))) 43 | return d 44 | 45 | def read_dict(h5f, d): 46 | for _k, _v in h5f.items(): 47 | if isinstance(_v, h5py.Dataset): 48 | d[_k] = _v[()] 49 | else: 50 | assert isinstance(_v, h5py.Group) 51 | d[_k] = {} 52 | read_dict(_v, d[_k]) 53 | 54 | -------------------------------------------------------------------------------- /ceres/baselines/common/mpi_moments_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from mpi4py import MPI 6 | import numpy as np 7 | from . 
import mpi_select 8 | 9 | def mpi_mean_select(x, rank, root, destinations, n_processes, 10 | axis=0, comm=None, keepdims=False): 11 | ''' 12 | Compute a mean on a selection of processes instead of all 13 | ''' 14 | x = np.asarray(x) 15 | assert x.ndim > 0 16 | if comm is None: comm = MPI.COMM_WORLD 17 | xsum = x.sum(axis=axis, keepdims=keepdims) 18 | n = xsum.size 19 | localsum = np.zeros(n+1, x.dtype) 20 | localsum[:n] = xsum.ravel() 21 | localsum[n] = x.shape[axis] 22 | #globalsum = np.zeros_like(localsum) 23 | #comm.Allreduce(localsum, globalsum, op=MPI.SUM) 24 | globalsum = mpi_select.Allreduce_select(comm, rank, root, destinations, localsum, tag_reduce=root, tag_bcast=root + n_processes) 25 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 26 | 27 | def mpi_moments_select(x, rank, root, destinations, n_processes, 28 | axis=0, comm=None, keepdims=False): 29 | ''' 30 | Compute a mean on a selection of processes instead of all 31 | ''' 32 | x = np.asarray(x) 33 | assert x.ndim > 0 34 | mean, count = mpi_mean_select(x, rank, root, destinations, n_processes, 35 | axis=axis, comm=comm, keepdims=True) 36 | sqdiffs = np.square(x - mean) 37 | meansqdiff, count1 = mpi_mean_select(sqdiffs, 38 | rank, root, destinations, n_processes, 39 | axis=axis, comm=comm, keepdims=True) 40 | assert count1 == count 41 | std = np.sqrt(meansqdiff) 42 | if not keepdims: 43 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 44 | mean = mean.reshape(newshape) 45 | std = std.reshape(newshape) 46 | return mean, std, count 47 | 48 | 49 | -------------------------------------------------------------------------------- /ceres/baselines/common/mpi_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | 7 | ''' 8 | These reproduce the behavior of MPI functions, except applied to a subset of available processes. 9 | In addition, all functions return something instead of acting on argument buffers directly. 
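For example (a usage sketch, not part of the original module; `comm`, `rank`, `root`, `destinations` and `n_processes` are assumed to be set up as in MpiAdamSelect), a gradient average over the group formed by the root and its destinations could be computed as:

    summed = Allreduce_select(comm, rank, root, destinations, local_grad,
                              tag_reduce=root, tag_bcast=root + n_processes)
    mean_grad = summed / (len(destinations) + 1)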
10 | ''' 11 | 12 | def Bcast_select(comm, rank, root, destinations, data_buffer, tag=0): 13 | if rank == root: 14 | for dest_rank in destinations: 15 | comm.send(data_buffer, dest=dest_rank, tag=tag) 16 | return data_buffer 17 | else: 18 | recv_buffer = comm.recv(source=root, tag=tag) 19 | assert len(recv_buffer) == len(data_buffer) 20 | return recv_buffer 21 | 22 | def Reduce_select(comm, rank, root, destinations, val_buffer, sum_buffer=None, tag=0): 23 | if sum_buffer is None: 24 | sum_buffer = np.zeros_like(val_buffer) 25 | if rank == root: 26 | sum_buffer += val_buffer 27 | for dest_rank in destinations: 28 | recv_buffer = comm.recv(source=dest_rank, tag=root) 29 | sum_buffer += recv_buffer 30 | else: 31 | comm.send(val_buffer, dest=root, tag=root) 32 | return sum_buffer 33 | 34 | def Allreduce_select(comm, rank, root, destinations, val_buffer, tag_reduce=0, tag_bcast=0): 35 | assert tag_reduce != tag_bcast 36 | sum_buffer = Reduce_select(comm, rank, root, destinations, val_buffer, tag=tag_reduce) 37 | sum_buffer = Bcast_select(comm, rank, root, destinations, sum_buffer, tag=tag_bcast) 38 | return sum_buffer 39 | 40 | def allgather_select(comm, rank, root, destinations, data_buffer, tag=0): 41 | # Gather everything to root 42 | index_map = {_v: _i for _i, _v in enumerate([root] + list(destinations))} 43 | gather_buffer = [None] * len(index_map) 44 | if rank == root: 45 | gather_buffer[index_map[root]] = data_buffer 46 | for dest_rank in destinations: 47 | recv_buffer = comm.recv(source=dest_rank, tag=tag) 48 | gather_buffer[index_map[dest_rank]] = recv_buffer 49 | else: 50 | comm.send(data_buffer, dest=root, tag=tag) 51 | # Broadcast to destinations 52 | allgather_buffer = Bcast_select(comm, rank, root, destinations, gather_buffer, tag=tag) 53 | return allgather_buffer 54 | 55 | -------------------------------------------------------------------------------- /ceres/baselines/common/mpi_adam_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | ### Control reduce / broadcast at the process level 6 | 7 | from mpi4py import MPI 8 | import baselines.common.tf_util as U 9 | import tensorflow as tf 10 | import numpy as np 11 | 12 | from baselines.common.mpi_adam import MpiAdam 13 | from . 
import mpi_select 14 | 15 | class MpiAdamSelect(MpiAdam): 16 | ''' 17 | Extend MpiAdam with parallelization across only a selection of processes (direct or recovery) instead of all 18 | ''' 19 | 20 | def __init__(self, rank, root, group, var_list, *args, select_params=None, **kwargs): 21 | super().__init__(var_list, *args, **kwargs) 22 | self.init_select(rank, root, group) 23 | 24 | def init_select(self, rank, root, group): 25 | self.rank = rank 26 | self.root = root 27 | self.group = group 28 | self.destinations = [_e for _e in self.group if _e != self.root] 29 | self.n_processes = self.comm.Get_size() 30 | 31 | def update(self, localg, stepsize): 32 | if self.t % 100 == 0: 33 | self.check_synced() 34 | localg = localg.astype('float32') 35 | globalg = mpi_select.Allreduce_select(self.comm, self.rank, self.root, self.destinations, localg, tag_reduce=self.root, tag_bcast=self.root + self.n_processes) 36 | 37 | if self.scale_grad_by_procs: 38 | globalg /= len(self.group) 39 | 40 | self.t += 1 41 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 42 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 43 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 44 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 45 | self.setfromflat(self.getflat() + step) 46 | 47 | def sync(self): 48 | theta = self.getflat() 49 | theta = mpi_select.Bcast_select(self.comm, self.rank, self.root, self.destinations, theta, tag=self.root) 50 | self.setfromflat(theta) 51 | 52 | def check_synced(self): 53 | if self.rank == self.root: # this is root 54 | theta = self.getflat() 55 | theta = mpi_select.Bcast_select(self.comm, self.rank, self.root, self.destinations, theta, tag=self.root) 56 | else: 57 | thetalocal = self.getflat() 58 | thetaroot = np.empty_like(thetalocal) 59 | thetaroot = mpi_select.Bcast_select(self.comm, self.rank, self.root, self.destinations, thetaroot, tag=self.root) 60 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 61 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/nav2d_force.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | from .nav2d_pos import Nav2dPos 7 | 8 | class Nav2dForce(Nav2dPos): 9 | ''' 10 | Control an agent to navigate to a point by force commands 11 | ''' 12 | 13 | max_vel = 0.10 14 | max_acc = 0.05 15 | delta_time = 1. # one action per frame 16 | do_clip_vel = True 17 | do_randomize_agent_vel = True 18 | agent_mass = 1. 
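    # The class attributes above parameterize simple point-mass dynamics: in do_action() below, the
    # played command is scaled by agent_mass to give an acceleration, velocity and position are then
    # integrated with explicit Euler steps of length delta_time, and the velocity norm is clipped to
    # max_vel when do_clip_vel is set. For example, with agent_mass = 1, max_acc = 0.05 and
    # delta_time = 1, a single action shifts each velocity component by at most 0.05 (assuming the
    # command stays within the declared [-max_acc, max_acc] action bounds).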
19 | 20 | def setup_actions(self): 21 | self.add_action('agent_set_acc_x', -self.max_acc, self.max_acc) 22 | self.add_action('agent_set_acc_y', -self.max_acc, self.max_acc) 23 | 24 | def setup_observations_task_specific(self): 25 | super().setup_observations_task_specific() 26 | # Agent velocity 27 | self.agent_vel_x_range = [-self.max_vel, self.max_vel] 28 | self.agent_vel_y_range = [-self.max_vel, self.max_vel] 29 | self.add_observation('agent_vel_x', self.agent_vel_x_range[0], self.agent_vel_x_range[1]) 30 | self.add_observation('agent_vel_y', self.agent_vel_y_range[0], self.agent_vel_y_range[1]) 31 | 32 | def do_action(self): 33 | self.agent_acc_x, self.agent_acc_y = self.agent_mass * self.command_play 34 | self.agent_vel_x += self.agent_acc_x*self.delta_time 35 | self.agent_vel_y += self.agent_acc_y*self.delta_time 36 | if self.do_clip_vel: 37 | self.agent_vel_x, self.agent_vel_y = self.clip_vector_by_norm([self.agent_vel_x, self.agent_vel_y], self.max_vel) 38 | self.agent_pos_x += self.agent_vel_x*self.delta_time 39 | self.agent_pos_y += self.agent_vel_y*self.delta_time 40 | 41 | 42 | def clip_command(self, a): 43 | if self.do_clip_command: 44 | return self.clip_vector_by_norm(a, self.max_acc) 45 | else: 46 | return a 47 | 48 | def fill_state_task_specific(self, state): 49 | super().fill_state_task_specific(state) 50 | state[self.observation_index['agent_vel_x']] = self.agent_vel_x 51 | state[self.observation_index['agent_vel_y']] = self.agent_vel_y 52 | 53 | def reset_agent(self): 54 | super().reset_agent() 55 | self.reset_agent_vel() 56 | 57 | def reset_agent_vel(self): 58 | if self.do_randomize_agent_vel: 59 | agent_vel_norm = np.random.rand()*self.max_vel 60 | agent_vel_angle = np.random.rand()*2.*np.pi 61 | self.agent_vel_x = agent_vel_norm*np.cos(agent_vel_angle) 62 | self.agent_vel_y = agent_vel_norm*np.sin(agent_vel_angle) 63 | else: 64 | self.agent_vel_x = 0 65 | self.agent_vel_y = 0 66 | 67 | -------------------------------------------------------------------------------- /ceres/envs/resetter/resetter_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import gym 6 | import numpy as np 7 | 8 | class SnapshotInfo(object): 9 | ''' 10 | A simple class to store snapshot metadata 11 | ''' 12 | __slots__ = ['i_trajectory', 'i_state', 'action_level', 'action_weight'] 13 | def __init__(self, i_trajectory=None, i_state=None, action_level=None, action_weight=None): 14 | self.i_trajectory = i_trajectory 15 | self.i_state = i_state 16 | self.action_level = action_level 17 | self.action_weight = action_weight 18 | 19 | 20 | class ResetterEnv(gym.Env): 21 | ''' 22 | A base class to uniformize trajectory snapshotting and restoring. 
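    For example, a subclass reset could be structured as follows (a sketch, not part of this class;
    the return value of reset_and_restore is up to the implementing environment):
        snapshot, snapshot_info = self.get_random_reference_snapshot()
        observation = self.reset_and_restore(snapshot)
    which restores the environment to a previously recorded reference state before continuing the episode.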
23 | Some functions must be implemented either by the base RL environment (e.g., Nav2d) 24 | or by a task-specific resetter environment (e.g., ResetterEnvCeres) 25 | ''' 26 | max_reference_steps_per_episode = -1 27 | 28 | def __init__(self): 29 | super(ResetterEnv, self).__init__() 30 | self.init_reference() 31 | 32 | def init_reference(self): 33 | ''' 34 | Define the type of reference snapshots to restore the environment to 35 | ''' 36 | self._init_reference_parameters() 37 | self._init_reference_trajectories() 38 | 39 | def get_random_reference_index(self): 40 | ''' 41 | Pick a random trajectory, then a random state within that trajectory 42 | ''' 43 | i_traj = np.random.randint(0, len(self.reference_trajectories)) 44 | i_state = np.random.randint(0, len(self.reference_trajectories[i_traj])) 45 | return i_traj, i_state 46 | 47 | def get_random_reference_snapshot(self): 48 | ''' 49 | Get a random snapshot with the associated metadata 50 | ''' 51 | i_traj, i_state = self.get_random_reference_index() 52 | snapshot = self.reference_trajectories[i_traj][i_state].snapshot 53 | assert snapshot is not None 54 | snapshot_info = SnapshotInfo(i_trajectory=i_traj, i_state=i_state) 55 | return snapshot, snapshot_info 56 | 57 | def reset_random(self): 58 | raise NotImplementedError('Implement this in environment class {0}'.format(type(self))) 59 | 60 | def reset_and_restore(self, snapshot): 61 | raise NotImplementedError('Implement this in environment class {0}'.format(type(self))) 62 | 63 | def _init_reference_parameters(self): 64 | raise NotImplementedError('Implement this in resetter class {0}'.format(type(self))) 65 | 66 | def _init_reference_trajectories(self): 67 | raise NotImplementedError('Implement this in resetter class {0}'.format(type(self))) 68 | 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Constrained Exploration and Recovery from Experience Shaping 2 | Constrained Exploration and Recovery from Experience Shaping is an algorithm for model-free reinforcement learning to actively reshape the action space of an agent during training so that reward-driven exploration is constrained within safety limits. 3 | 4 | This repository accompanies the following paper on arXiv: https://arxiv.org/abs/1809.08925 5 | 6 | | Unconstrained Random Exploration | Constrained Random Exploration | 7 | :-------------------------:|:-------------------------: 8 | | 9 | 10 | ## Installing 11 | 12 | This implementation requires Python 3 and relies on Tensorflow for building and training constraint networks. 13 | Depending on your setup, run: 14 | ``` 15 | pip install tensorflow-gpu 16 | ``` 17 | if you have a CUDA-compatible device or: 18 | ``` 19 | pip install tensorflow 20 | ``` 21 | 22 | For training constraint networks together with control policies, we built on top of the [OpenAI Baselines framework](https://github.com/openai/baselines/). 23 | Install it with: 24 | ``` 25 | pip install baselines 26 | ``` 27 | We will maintain compatibility with the OpenAI Baselines ```master``` branch (last confirmed check on 2018-09-08: [commit](https://github.com/openai/baselines/commit/58b1021b28345a902ea20cb99ac0fe3914ee4171)), though feel free to create an [issue](https://github.com/IBM/constrained-rl/issues) if you notice something wrong. 28 | 29 | Quadratic program solving is performed using quadprog. 
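quadprog solves problems of the form: minimize 0.5 * x^T G x - a^T x subject to C^T x >= b (with optional equality constraints). Once it is installed as described below, a quick sanity check in Python could look like this (a minimal sketch, independent of the CERES code):
```
import numpy as np
import quadprog

G = np.eye(2)                   # objective: minimize 0.5 * x^T G x - a^T x
a = np.array([1., 1.])
C = np.eye(2)                   # constraints: C^T x >= b, i.e. x >= 0
b = np.zeros(2)
print(quadprog.solve_qp(G, a, C, b, 0)[0])   # expected: [1. 1.]
```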
30 | Install Cython first: 31 | ``` 32 | pip install Cython 33 | ``` 34 | Then: 35 | ``` 36 | pip install quadprog 37 | ``` 38 | 39 | Finally, clone this repository and install the local package with pip: 40 | ``` 41 | git clone git@github.com:IBM/constrained-rl.git 42 | cd constrained-rl 43 | pip install -e . 44 | ``` 45 | 46 | ## Examples 47 | Examples and reference data are provided in the [examples](examples) directory: 48 | 1. [Learning action space constraints from positive and negative demonstrations](examples/01-constraints-from-demonstrations.md): fixed maze 49 | 2. [Learning action space constraints from scratch](examples/02-constraints-from-scratch.md): random obstacles with position and force control 50 | 51 | ## License 52 | The Constrained Exploration and Recovery from Experience Shaping Project uses the [MIT](LICENSE) software license. 53 | 54 | ## Contributing to the project 55 | Full details of how to contribute to this project are documented in the [CONTRIBUTING.md](CONTRIBUTING.md) file. 56 | 57 | ## Maintainers 58 | The project's [maintainers](MAINTAINERS.txt) are responsible for reviewing and merging all pull requests, and they guide the overall technical direction of the project. 59 | 60 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | from ceres.tools.io.h5_helper import save_dict_as_h5, load_dict_from_h5 7 | 8 | class ConstraintConfig(object): 9 | ''' 10 | Constraint network configuration with save and restore functions 11 | ''' 12 | 13 | valid_param = ['mlp_hidden_layers', 14 | 'n_ineq', 15 | 'loss_weights', 16 | 'spherical_coordinates', 17 | 'normalize_ineq_mat', 18 | 'predict_interior_point', 19 | 'interior_point_margin_min', 20 | 'interior_point_margin_max', 21 | 'interior_point_max'] 22 | cnet_config_filename = 'cnet_config.h5' 23 | 24 | def __init__(self, **kwargs): 25 | self.set_default() 26 | self.set(**kwargs) 27 | 28 | def set_default(self): 29 | self.spherical_coordinates = False 30 | self.normalize_ineq_mat = False 31 | self.predict_interior_point = False 32 | self.interior_point_margin_min = 0. 33 | self.interior_point_margin_max = 0. 34 | self.interior_point_max = 0.
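        # mlp_hidden_layers and n_ineq have no defaults here: they are expected to be passed in
        # explicitly (e.g. via from_extra_args below), since they size the constraint network and the
        # number of predicted inequality constraints.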
35 | self.loss_weights = {} 36 | 37 | def set(self, **kwargs): 38 | for key, value in kwargs.items(): 39 | assert key in self.valid_param, 'Invalid parameter type {0}'.format(key) 40 | setattr(self, key, value) 41 | 42 | def save(self, path_save): 43 | d = self.__dict__ 44 | assert os.path.isdir(path_save), 'Config save function only takes a directory as input' 45 | path_save = os.path.join(path_save, self.cnet_config_filename) 46 | save_dict_as_h5(d, path_save, verbose=True) 47 | 48 | @classmethod 49 | def from_backup(cls, path_save): 50 | if os.path.isdir(path_save): 51 | path_cnet_dir = path_save 52 | else: 53 | path_cnet_dir = os.path.dirname(path_save) 54 | path_cnet_config = os.path.join(path_cnet_dir, cls.cnet_config_filename) 55 | d = load_dict_from_h5(path_cnet_config, verbose=False) 56 | cnet_config = cls(**d) 57 | return cnet_config 58 | 59 | @classmethod 60 | def from_extra_args(cls, args): 61 | cnet_config = cls(mlp_hidden_layers=args.cnet_hidden_layers, 62 | n_ineq=args.cnet_n_ineq, 63 | loss_weights=args.cnet_loss_weights, 64 | spherical_coordinates=args.cnet_spherical_coordinates, 65 | normalize_ineq_mat=args.cnet_normalize_ineq_mat, 66 | predict_interior_point=args.cnet_predict_interior_point, 67 | interior_point_margin_min=args.cnet_interior_point_margin_min, 68 | interior_point_margin_max=args.cnet_interior_point_margin_max, 69 | interior_point_max=args.cnet_interior_point_max, 70 | ) 71 | return cnet_config 72 | 73 | 74 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/nav2d_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from ceres.envs import CeresEnv 6 | from .obstacles import ObstacleSquare, ObstacleCircle 7 | from .nav2d_pos import Nav2dPos 8 | from .nav2d_force import Nav2dForce 9 | from .nav2d_rendering import Nav2dRendering 10 | from .nav2d_obstacles import Nav2dObstacles 11 | import numpy as np 12 | 13 | class FixedMaze(object): 14 | ''' 15 | Fixed square obstacles defining a maze 16 | ''' 17 | fixed_obstacles = [ 18 | ObstacleSquare(top_left_x=-0.70, top_left_y = 0.70, bottom_right_x=0.70, bottom_right_y=0.35), 19 | ObstacleSquare(top_left_x=-1.00, top_left_y = -0.35, bottom_right_x=-0.30, bottom_right_y=-0.70), 20 | ObstacleSquare(top_left_x=0.30, top_left_y = -0.35, bottom_right_x=1.00, bottom_right_y=-0.70), 21 | ObstacleSquare(top_left_x=-1.00, top_left_y = 0.05, bottom_right_x=-0.65, bottom_right_y=-0.05), 22 | ObstacleSquare(top_left_x=0.65, top_left_y = 0.05, bottom_right_x=1.00, bottom_right_y=-0.05), 23 | ObstacleSquare(top_left_x=-0.40, top_left_y = 0.05, bottom_right_x=0.40, bottom_right_y=-0.05), 24 | ObstacleSquare(top_left_x=-0.05, top_left_y = 0.40, bottom_right_x=0.05, bottom_right_y=-0.40), 25 | ObstacleSquare(top_left_x=-0.05, top_left_y = -0.70, bottom_right_x=0.05, bottom_right_y=-1.00), 26 | ] 27 | 28 | class RandomHoles(object): 29 | ''' 30 | Circle obstacles randomized for every episode 31 | ''' 32 | n_random_circle_obstacles = 10 33 | random_circle_obstacle_dim_range = [0.10, 0.25] 34 | is_state_target_rel_pos = True 35 | state_lidar_angles = np.linspace(0., 2.*np.pi, 8, endpoint=False) 36 | 37 | class Nav2dPosCeres(Nav2dRendering, Nav2dPos, CeresEnv): 38 | max_reference_trajectories = 1024 39 | max_recovery_steps = 5 40 | 41 | class 
Nav2dPosFixedMazeCeres(FixedMaze, Nav2dRendering, Nav2dObstacles, Nav2dPos, CeresEnv): 42 | max_reference_trajectories = 1024 43 | max_recovery_steps = 5 44 | 45 | class Nav2dPosFixedMazeCeres5N(Nav2dPosFixedMazeCeres): 46 | max_normalized_obs = 5. 47 | max_normalized_act = 5. 48 | 49 | class Nav2dPosRandomHolesCeres(RandomHoles, Nav2dRendering, Nav2dObstacles, Nav2dPos, CeresEnv): 50 | max_reference_trajectories = 1024 51 | max_recovery_steps = 5 52 | 53 | class Nav2dPosRandomHolesCeres5N(Nav2dPosRandomHolesCeres): 54 | max_normalized_obs = 5. 55 | max_normalized_act = 5. 56 | 57 | class Nav2dForceCeres(Nav2dRendering, Nav2dForce, CeresEnv): 58 | max_reference_trajectories = 1024 59 | max_recovery_steps = 10 60 | 61 | class Nav2dForceFixedMazeCeres(FixedMaze, Nav2dRendering, Nav2dObstacles, Nav2dForce, CeresEnv): 62 | max_reference_trajectories = 1024 63 | max_recovery_steps = 10 64 | 65 | class Nav2dForceRandomHolesCeres(RandomHoles, Nav2dRendering, Nav2dObstacles, Nav2dForce, CeresEnv): 66 | max_reference_trajectories = 1024 67 | max_recovery_steps = 10 68 | 69 | class Nav2dForceRandomHolesCeres5N(Nav2dForceRandomHolesCeres): 70 | max_normalized_obs = 5. 71 | max_normalized_act = 5. 72 | -------------------------------------------------------------------------------- /ceres/envs/constrained/constrained_env_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from ceres.constraints import ConstraintNetworkMLP, ConstraintConfig 8 | from .constrained_env import ConstrainedEnv 9 | 10 | class ConstrainedEnvNetwork(ConstrainedEnv): 11 | ''' 12 | Environment with constraints predicted by a constraint network. 13 | Make sure to call init_constraint_prediction before running it. 
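    Typical usage (a sketch; it assumes the chosen environment derives from this class, e.g. one of
    the Nav2d CERES environments, and the backup path is a placeholder):
        env = gym.make('Nav2dPosFixedMazeCeres-v0')
        env.unwrapped.init_constraint_prediction('/path/to/trained/cnet')
    After that, update_ineq_matrices(state) fills ineq_mat and ineq_vec from the network prediction.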
14 | ''' 15 | 16 | def __init__(self, *args, **kwargs): 17 | super(ConstrainedEnvNetwork, self).__init__(*args, **kwargs) 18 | self.ineq_mat_params, self.ineq_vec_params = None, None 19 | self.is_initialized_constraint_prediction = False 20 | 21 | def init_constraint_prediction(self, cnet, session=None): 22 | ''' 23 | Initialize constraint network, either from a ConstraintNetworkMLP object or a path to a trained network backup 24 | ''' 25 | if session is None: 26 | tf_config = tf.ConfigProto() 27 | tf_config.gpu_options.allow_growth = True 28 | self.cnet_session = tf.Session(config=tf_config) 29 | else: 30 | self.cnet_session = session 31 | 32 | if type(cnet) == str: # path to constraint network backup 33 | cnet_config = ConstraintConfig.from_backup(cnet) 34 | self.cnet = ConstraintNetworkMLP(self.observation_space, self.action_space, cnet_config) 35 | self.cnet.restore_model(cnet, session=self.cnet_session) 36 | else: 37 | assert isinstance(cnet, ConstraintNetworkMLP) 38 | self.cnet = cnet 39 | self.is_initialized_constraint_prediction = True 40 | 41 | def update_ineq_matrices(self, state): 42 | ''' 43 | Predict ineq matrices by passing an input state through the constraint network and compute auxiliary variables 44 | ''' 45 | assert self.is_initialized_constraint_prediction, 'Constraint prediction is not initialized: call init_constraint_prediction' 46 | feed_dict = {self.cnet.observation: np.expand_dims(state, axis=0)} 47 | cnet_outputs = [self.cnet.ineq_mat, self.cnet.ineq_vec, self.cnet.ineq_mat_params, self.cnet.ineq_vec_params, self.cnet.interior_point] 48 | ineq_outputs = self.cnet_session.run(cnet_outputs, feed_dict=feed_dict) 49 | self.ineq_mat, self.ineq_vec, self.ineq_mat_params, self.ineq_vec_params, self.ineq_interior_point = [_v[0] for _v in ineq_outputs] 50 | self.ineq_interior_point_flat = np.squeeze(self.ineq_interior_point) 51 | # Check constraint prediction validity 52 | if not (np.all(np.isfinite(self.ineq_mat)) and np.all(np.isfinite(self.ineq_vec))): 53 | error_str = 'Invalid inequality matrices: make sure you are using a recent version of Tensorflow' # In some versions, tf.cos and tf.sin can output infinity for large inputs 54 | print('Inequality parameters') 55 | print(self.ineq_mat_params) 56 | print(self.ineq_vec_params) 57 | print('Processed constraints') 58 | self.print_ineq() 59 | raise ValueError(error_str) 60 | -------------------------------------------------------------------------------- /ceres/tools/math/qpsolver_quadprog.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | import quadprog 7 | from .qpsolver import QPSolver 8 | 9 | class QPSolverQuadprog(QPSolver): 10 | ''' 11 | A class interfacing with the Quadprog QP solver 12 | ''' 13 | 14 | def __init__(self, n_var=None, verbose=False): 15 | super(QPSolverQuadprog, self).__init__(n_var=n_var, verbose=verbose) 16 | 17 | def update_solver_specific(self): 18 | self.obj_mat_quadprog = self.obj_mat 19 | self.obj_vec_quadprog = -np.squeeze(self.obj_vec, axis=1) 20 | self.obj_mat_quadprog = self.obj_mat_quadprog.astype(dtype=np.float64) 21 | self.obj_vec_quadprog = self.obj_vec_quadprog.astype(dtype=np.float64) 22 | has_eq_loc = self.eq_mat is not None 23 | has_ineq_loc = self.ineq_mat is not None 24 | if has_ineq_loc: 25 | if has_eq_loc: 26 | self.constraint_mat_quadprog = -np.vstack([self.eq_mat, self.ineq_mat]).transpose() 27 | self.constraint_vec_quadprog = -np.hstack([np.squeeze(self.eq_vec, axis=1), np.squeeze(self.ineq_vec, axis=1)]) 28 | else: 29 | self.constraint_mat_quadprog = -self.ineq_mat.transpose() 30 | self.constraint_vec_quadprog = -np.squeeze(self.ineq_vec, axis=1) 31 | else: 32 | if has_eq_loc: 33 | self.constraint_mat_quadprog = -self.eq_mat.transpose() 34 | self.constraint_vec_quadprog = -np.squeeze(self.eq_vec, axis=1) 35 | else: 36 | self.constraint_mat_quadprog = None 37 | self.constraint_vec_quadprog = None 38 | if has_eq_loc or has_ineq_loc: 39 | self.constraint_mat_quadprog = self.constraint_mat_quadprog.astype(dtype=np.float64) 40 | self.constraint_vec_quadprog = self.constraint_vec_quadprog.astype(dtype=np.float64) 41 | 42 | def solve(self): 43 | try: 44 | self.solver_out = quadprog.solve_qp(self.obj_mat_quadprog, self.obj_vec_quadprog, 45 | self.constraint_mat_quadprog, self.constraint_vec_quadprog, 46 | self.n_eq) 47 | self.optimum = self.solver_out[0] 48 | self.success = True 49 | except ValueError as e: 50 | print('WARNING: solver failed ({0})'.format(e)) 51 | self.optimum = np.zeros(self.n_var) 52 | self.success = False 53 | return self.optimum, self.success 54 | 55 | if __name__ == '__main__': 56 | ''' 57 | Implement example from cvxopt.org 58 | minimize 2 x1^2 + x2^2 + x1*x2 + x1 + x2 59 | subject to: 60 | x1 >= 0 61 | x2 >= 0 62 | x1 + x2 = 1 63 | ''' 64 | qp_solver = QPSolverQuadprog() 65 | Q = 2.*np.array([[2., 0.5], 66 | [0.5, 1.]]) 67 | p = np.array([[1.], 68 | [1.]]) 69 | G1 = np.array([[-1., 0.]]) 70 | h1 = np.array([[0.]]) 71 | G2 = np.array([[0., -1.]]) 72 | h2 = np.array([[0.]]) 73 | A = np.array([[1., 1.]]) 74 | b = np.array([[1.]]) 75 | qp_solver.add_obj(Q, p) 76 | qp_solver.add_eq(A, b) 77 | qp_solver.add_ineq(G1, h1) 78 | qp_solver.add_ineq(G2, h2) 79 | qp_solver.update() 80 | x_opt, success = qp_solver.solve() 81 | print(x_opt, success) 82 | 83 | if input('Enter debug mode? y/[N]\n').lower() == 'y': 84 | import ipdb; ipdb.set_trace() 85 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_network_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from .constraint_network import ConstraintNetwork 8 | from ceres.networks import NetworkSaverMLP 9 | 10 | class ConstraintNetworkMLP(ConstraintNetwork, NetworkSaverMLP): 11 | ''' 12 | Constraint network with MLP and save/restore functions 13 | ''' 14 | 15 | def __init__(self, observation_space, action_space, config): 16 | NetworkSaverMLP.__init__(self, network_id='cnet') 17 | ConstraintNetwork.__init__(self, observation_space, action_space, config) 18 | 19 | def build_model(self): 20 | return NetworkSaverMLP.build_model(self, self.observation, self.n_outputs, 21 | self.config.mlp_hidden_layers, 22 | self.initializer, 23 | self.activation_common) 24 | 25 | 26 | def play_cnet(): 27 | ''' 28 | Load a trained constrained network and print constraint predictions from random states 29 | ''' 30 | from ceres.tools import ExtraArgs 31 | from ceres.envs import ConstrainedEnv 32 | import gym 33 | extra_args = ExtraArgs(ignore_max_timesteps=True, ignore_max_iterations=True, ignore_max_episodes=True) 34 | assert len(extra_args.env_id) > 0, 'Required argument --env_id' 35 | env = gym.make(extra_args.env_id) 36 | assert isinstance(env.unwrapped, ConstrainedEnv), 'The chosen environment {0} does not support constraints'.format(extra_args.env_id) 37 | assert len(extra_args.trained_cnet) > 0, 'Required argument --trained_cnet' 38 | cnet_config = ConstraintConfig.from_backup(extra_args.trained_cnet) 39 | cnet = ConstraintNetworkMLP(env.observation_space, env.action_space, cnet_config) 40 | 41 | n_obs = env.observation_space.shape[0] 42 | def random_state(): 43 | pass 44 | 45 | cmd_str = '[r/Return]: random state, [q]: quit, otherwise input comma-separated state of length {0}\n'.format(n_obs) 46 | with tf.Session() as sess: 47 | def predict_constraints(state): 48 | observation = [state] 49 | ineq_mat, ineq_vec = sess.run([cnet.ineq_mat, cnet.ineq_vec], feed_dict={cnet.observation: observation}) 50 | ineq_mat = ineq_mat[0] 51 | ineq_vec = ineq_vec[0] 52 | return ineq_mat, ineq_vec 53 | 54 | def predict_and_print_constraints(state): 55 | print('Input state: {0}'.format(state)) 56 | ineq_mat, ineq_vec = predict_constraints(state) 57 | env.unwrapped.print_ineq(ineq_mat=ineq_mat, ineq_vec=ineq_vec) 58 | 59 | cnet.restore_model(extra_args.trained_cnet, session=sess) 60 | while True: 61 | cmd = input(cmd_str) 62 | if cmd == 'q': 63 | break 64 | elif (cmd == 'r') or (cmd == ''): 65 | state = np.random.rand(n_obs) 66 | predict_and_print_constraints(state) 67 | else: 68 | try: 69 | state = list(map(float, cmd.split(','))) 70 | assert len(state) == n_obs, 'input state {0} is of length {1}, expected {2}'.format(state, len(state), n_obs) 71 | predict_and_print_constraints(state) 72 | except Exception as e: 73 | print('Invalid command \'{0}\': {1}'.format(cmd, str(e))) 74 | 75 | 76 | if __name__ == '__main__': 77 | play_cnet() 78 | 79 | -------------------------------------------------------------------------------- /ceres/baselines/ppo1/pposgd_simple.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is the learn function from OpenAI's baselines.ppo1.pposgd_simple 3 | rewritten with individual functions in .pposgd_simple_helper 4 | OpenAI Baselines is licensed under the MIT License, see LICENSE 5 | ''' 6 | 7 | from baselines.common.mpi_moments import mpi_moments 8 | 
from baselines.ppo1.pposgd_simple import traj_segment_generator 9 | from baselines import logger 10 | import baselines.common.tf_util as U 11 | import tensorflow as tf, numpy as np 12 | from mpi4py import MPI 13 | from .pposgd_simple_helper import build_policy_training_vars, build_counters, adjust_policy_learning_rate, update_policy, log_iter_info, calc_end_training 14 | 15 | def learn(env, policy_fn, *, 16 | timesteps_per_actorbatch, # timesteps per actor per update 17 | clip_param, entcoeff, # clipping parameter epsilon, entropy coeff 18 | optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers 19 | gamma, lam, # advantage estimation 20 | max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint 21 | callback=None, # you can do anything in the callback, since it takes locals(), globals() 22 | adam_epsilon=1e-5, 23 | schedule='constant' # annealing for stepsize parameters (epsilon and adam) 24 | ): 25 | # Setup losses and stuff 26 | # ---------------------------------------- 27 | 28 | ob_space = env.observation_space 29 | ac_space = env.action_space 30 | pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy 31 | oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy 32 | 33 | loss_names, var_list, lossandgrad, adam, assign_old_eq_new, compute_losses = build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon) 34 | mpi_moments_fn = lambda losses: mpi_moments(losses, axis=0) 35 | allgather_fn = MPI.COMM_WORLD.allgather 36 | 37 | U.initialize() 38 | adam.sync() 39 | 40 | # Prepare for rollouts 41 | # ---------------------------------------- 42 | seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) 43 | 44 | iters_so_far, episodes_so_far, timesteps_so_far, tstart, lenbuffer, rewbuffer = build_counters() 45 | 46 | assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" 47 | 48 | while True: 49 | if callback: callback(locals(), globals()) 50 | 51 | if calc_end_training(max_timesteps, timesteps_so_far, 52 | max_episodes, episodes_so_far, 53 | max_iters, iters_so_far, 54 | max_seconds, tstart): 55 | break 56 | 57 | logger.log("********** Iteration %i ************"%iters_so_far) 58 | 59 | seg = seg_gen.__next__() 60 | 61 | cur_lrmult = adjust_policy_learning_rate(schedule, max_timesteps, timesteps_so_far, max_episodes, episodes_so_far, max_iters, iters_so_far) 62 | vpredbefore, tdlamret, optim_batchsize = update_policy(pi, seg, gamma, lam, 63 | logger, optim_epochs, optim_batchsize, optim_stepsize, cur_lrmult, 64 | loss_names, lossandgrad, adam, assign_old_eq_new, compute_losses, 65 | mpi_moments_fn, allgather_fn) 66 | 67 | episodes_so_far, timesteps_so_far = log_iter_info(lenbuffer, rewbuffer, tstart, 68 | vpredbefore, tdlamret, seg, 69 | episodes_so_far, timesteps_so_far, 70 | MPI.COMM_WORLD.Get_rank()==0) 71 | iters_so_far += 1 72 | 73 | return pi 74 | -------------------------------------------------------------------------------- /ceres/tools/math/spherical_coordinates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | class SphericalCoordinates(object): 9 | ''' 10 | Implement N-dimensional coordinates in Numpy and Tensorflow 11 | https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates 12 | ''' 13 | 14 | def __init__(self, dim, input_angles=None): 15 | assert dim >= 2 16 | self.dim = dim 17 | self.n_angles = dim - 1 18 | if input_angles is None: 19 | self.input_angles = tf.placeholder(tf.float32, shape=(None, self.n_angles)) 20 | else: 21 | self.input_angles = input_angles 22 | self.init_angles_to_unit_vec() 23 | 24 | def spherical_to_cartesian(self, angles, radius=1): 25 | assert angles.shape[-1] == self.n_angles 26 | shape_angles_in = angles.shape 27 | angles_batch = np.reshape(angles, [-1, self.n_angles]) 28 | vec_batch = [] 29 | for angles in angles_batch: 30 | previous = radius 31 | vec = [] 32 | for angle in angles[:-1]: 33 | coord = previous*np.cos(angle) 34 | vec.append(coord) 35 | previous *= np.sin(angle) 36 | angle = angles[-1] 37 | vec.append(previous*np.cos(angle)) 38 | vec.append(previous*np.sin(angle)) 39 | vec_batch.append(vec) 40 | shape_vec_out = list(shape_angles_in) 41 | shape_vec_out[-1] += 1 42 | vec_batch = np.array(vec_batch) 43 | vec_batch = np.reshape(vec_batch, shape_vec_out) 44 | return vec_batch 45 | 46 | def init_angles_to_unit_vec(self, radius=None): 47 | angles = self.input_angles 48 | shape_angles_in = tf.shape(angles) 49 | angles = tf.reshape(angles, [-1, shape_angles_in[-1]]) 50 | angles_cos = tf.cos(angles) 51 | angles_sin = tf.sin(angles) 52 | vec = [] 53 | previous = 1. 54 | for i_angle in range(self.n_angles-1): 55 | axis_cos = tf.slice(angles_cos, [0, i_angle], [-1, 1]) 56 | axis_sin = tf.slice(angles_sin, [0, i_angle], [-1, 1]) 57 | coord = tf.multiply(previous, axis_cos) 58 | previous = tf.multiply(previous, axis_sin) 59 | vec.append(coord) 60 | i_angle = self.n_angles-1 61 | axis_cos = tf.slice(angles_cos, [0, i_angle], [-1, 1]) 62 | axis_sin = tf.slice(angles_sin, [0, i_angle], [-1, 1]) 63 | vec.append(tf.multiply(previous, axis_cos)) 64 | vec.append(tf.multiply(previous, axis_sin)) 65 | vec = tf.concat(vec, axis=1) 66 | if radius is not None: 67 | radius = tf.reshape(radius, [-1, shape_angles_in[-1]]) 68 | vec = tf.multiply(radius, vec) 69 | shape_vec_last = tf.constant([self.dim], dtype=shape_angles_in.dtype) 70 | shape_vec_out = tf.concat([shape_angles_in[:-1], shape_vec_last], axis=0) 71 | vec = tf.reshape(vec, shape_vec_out) 72 | self.output_unit_vec = vec 73 | 74 | def main(): 75 | while True: 76 | input_str = input('Input list of angles, in degrees, comma-separated\n') 77 | if len(input_str) == 0: 78 | #angles_deg = np.array([[0.0], [90.]]) 79 | angles_deg = np.array([ 80 | [[0.], 81 | [45.]], 82 | [[90.], 83 | [135.]] 84 | ]) 85 | print('Use default example: {0}'.format(angles_deg)) 86 | else: 87 | angles_deg = np.array([float(e) for e in input_str.split(',')]) 88 | n_angles = angles_deg.shape[-1] 89 | dim = n_angles + 1 90 | angles_rad = np.radians(angles_deg) 91 | print('Degrees: {0}'.format(angles_deg)) 92 | print('Radians: {0}'.format(angles_rad)) 93 | sc = SphericalCoordinates(dim) 94 | vec_np = sc.spherical_to_cartesian(angles_rad) 95 | print('Unit vector') 96 | print(' Numpy: {0}'.format(vec_np)) 97 | with tf.Session() as sess: 98 | angles_rad_reshaped = np.reshape(angles_rad, [-1, n_angles]) 99 | vec_tf = 
sess.run([sc.output_unit_vec], feed_dict={sc.input_angles: angles_rad_reshaped}) 100 | print(' Tensorflow: {0}'.format(vec_tf)) 101 | 102 | if __name__ == '__main__': 103 | main() 104 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ### Welcome 2 | 3 | We welcome contributions to the Constrained Exploration and Recovery from Experience Shaping Project in many forms, and there's always plenty to do! 4 | 5 | First things first, please review the Constrained Exploration and Recovery from Experience Shaping Project's [Code of Conduct](CONDUCT.md) before participating. It is important that we keep things civil. 6 | 7 | ### Reporting bugs 8 | If you are a user and you find a bug, please submit an [issue](https://github.com/IBM/constrained-rl/issues). Please try to provide sufficient information for someone else to reproduce the issue. One of the project's maintainers should respond to your issue within 24 hours. If not, please bump the issue and request that it be reviewed. 9 | 10 | ### Fixing issues and working stories 11 | Review the [issues list](https://github.com/IBM/constrained-rl/issues) and find something that interests you. You could also check the ["help wanted"](https://github.com/IBM/constrained-rl/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) list. It is wise to start with something relatively straight forward and achievable. Usually there will be a comment in the issue that indicates whether someone has already self-assigned the issue. If no one has already taken it, then add a comment assigning the issue to yourself, eg.: ```I'll work on this issue.```. Please be considerate and rescind the offer in comments if you cannot finish in a reasonable time, or add a comment saying that you are still actively working the issue if you need a little more time. 12 | 13 | We are using the [GitHub Flow](https://guides.github.com/introduction/flow/) process to manage code contributions. If you are unfamiliar, please review that link before proceeding. 14 | 15 | To work on something, whether a new feature or a bugfix: 16 | 1. Create a [fork](https://help.github.com/articles/fork-a-repo/) (if you haven't already) 17 | 18 | 2. Clone it locally 19 | ``` 20 | git clone https://github.com/yourid/constrained-rl.git 21 | ``` 22 | 3. Add the upstream repository as a remote 23 | ``` 24 | git remote add upstream https://github.com/IBM/constrained-rl.git 25 | ``` 26 | 4. Create a branch 27 | 28 | Create a descriptively-named branch off of your cloned fork ([more detail here](https://help.github.com/articles/syncing-a-fork/)) 29 | ``` 30 | cd constrained-rl 31 | git checkout -b issue-nnnn 32 | ``` 33 | 5. Commit your code 34 | 35 | Commit to that branch locally, and regularly push your work to the same branch on the server. 36 | 37 | 6. Commit messages 38 | 39 | Commit messages must have a short description no longer than 50 characters followed by a blank line and a longer, more descriptive message that includes reference to issue(s) being addressed so that they will be automatically closed on a merge e.g. ```Closes #1234``` or ```Fixes #1234```. 40 | 41 | 7. Pull Request (PR) 42 | 43 | When you need feedback or help, or you think the branch is ready for merging, open a pull request (make sure you have first successfully built and tested your changes. 
44 | 45 | _Note: if your PR does not merge cleanly, use ```git rebase master``` in your feature branch to update your pull request rather than using ```git merge master```_. 46 | 47 | 8. Did we mention tests? All code changes should be accompanied by new or modified tests. 48 | 49 | 9. Any code changes that affect documentation should be accompanied by corresponding changes (or additions) to the documentation and tests. This will ensure that if the merged PR is reversed, all traces of the change will be reversed as well. 50 | 51 | After your Pull Request (PR) has been reviewed and signed off, a maintainer will merge it into the master branch. 52 | 53 | ## Coding guidelines 54 | 55 | ### Becoming a maintainer 56 | Projects or sub-projects will be lead by a set of maintainers. New projects can designate an initial set of maintainers that will be approved by the Technical Steering Committee when the project is first approved. The project's maintainers will, from time-to-time, consider adding a new maintainer. An existing maintainer will post a pull request to the [MAINTAINERS.txt](MAINTAINERS.txt) file. If a majority of the maintainers concur in the comments, the pull request is then merged and the individual becomes a maintainer. 57 | 58 | ### Legal stuff 59 | We have tried to make it as easy as possible to make contributions. This applies to how we handle the legal aspects of contribution. We use the same approach—the [Developer's Certificate of Origin 1.1 (DCO)](DCO1.1.txt)—that the Linux® Kernel [community](http://elinux.org/Developer_Certificate_Of_Origin) uses to manage code contributions. 60 | We simply ask that when submitting a pull request, the developer must include a sign-off statement in the pull request description. 61 | 62 | Here is an example Signed-off-by line, which indicates that the submitter accepts the DCO: 63 | 64 | ``` 65 | Signed-off-by: John Doe 66 | ``` 67 | -------------------------------------------------------------------------------- /ceres/scripts/play_policy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import sys 6 | import os 7 | import numpy as np 8 | import time 9 | 10 | from ceres.tools import ExtraArgs 11 | from baselines.common.cmd_util import make_mujoco_env 12 | import baselines.common.tf_util as U 13 | from ceres.envs import CeresEnv 14 | from ceres.baselines.ceres.pposgd_ceres import build_policy_observation_filter 15 | 16 | class DummyPolicy(object): 17 | ''' 18 | A dummy policy that outputs either zero or random actions in the size expected by the environment 19 | ''' 20 | 21 | def __init__(self, name, ob_space, ac_space): 22 | self.name = name 23 | self.ob_space = ob_space 24 | self.ac_space = ac_space 25 | 26 | self.ac_zero = np.zeros(self.ac_space.shape) 27 | self.vpred_zero = 0. 
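    # act() mirrors the (action, value-prediction) pair returned by the baselines MLP
    # policies, so main() below can transparently substitute DummyPolicy when no
    # --trained_policy checkpoint is supplied.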
28 | 29 | def act(self, stochastic, ob): 30 | if stochastic: 31 | return self.ac_space.sample(), self.vpred_zero 32 | else: 33 | return self.ac_zero, self.vpred_zero 34 | 35 | def main(): 36 | ''' 37 | Load and play trained policy 38 | ''' 39 | log_root = os.path.join(os.getcwd(), 'logs') 40 | extra_args = ExtraArgs(log_root=log_root) 41 | 42 | env = make_mujoco_env(extra_args.env_id, extra_args.seed) 43 | 44 | if isinstance(env.unwrapped, CeresEnv) and (len(extra_args.trained_cnet) > 0): 45 | env.unwrapped.init_ceres() 46 | env.unwrapped.init_constraint_prediction(extra_args.trained_cnet) 47 | 48 | episode_lengths = np.zeros(extra_args.max_episodes) 49 | episode_rewards = np.zeros(extra_args.max_episodes) 50 | ob = env.reset() 51 | 52 | do_save_render = extra_args.render and len(extra_args.save_render) > 0 53 | if do_save_render: 54 | os.makedirs(extra_args.save_render, exist_ok=True) 55 | 56 | def save_render(i_step, max_step=300, verbose=True): 57 | n_digits = len(str(max_step)) 58 | do_save_step = (max_step <= 0) or (i_step <= max_step) 59 | if do_save_render and do_save_step: 60 | path_save = os.path.join(extra_args.save_render, str(i_step).zfill(n_digits) + '.png') 61 | env.unwrapped.save_render(path_save, verbose=verbose) 62 | 63 | ob_space = env.unwrapped.observation_space 64 | ac_space = env.unwrapped.action_space 65 | ob_space, policy_observation_filter= build_policy_observation_filter(extra_args, ob_space) 66 | 67 | env.unwrapped.set_ineq_margin(extra_args.conservative_exploration) 68 | 69 | if len(extra_args.trained_policy) > 0: 70 | assert os.path.exists(extra_args.trained_policy), 'Invalid path to model: \'{0}\''.format(extra_args.trained_policy) 71 | from ceres.baselines.ceres.mlp_policy_saver import MlpPolicySaver 72 | from baselines.common import tf_util as U 73 | sess = U.single_threaded_session() 74 | sess.__enter__() 75 | 76 | def policy_fn(name, ob_space, ac_space): 77 | return MlpPolicySaver(name, ob_space=ob_space, ac_space=ac_space, 78 | hid_size=extra_args.policy_hidden_size, num_hid_layers=extra_args.policy_hidden_layers) 79 | pi = policy_fn('pi', ob_space, ac_space) 80 | 81 | U.initialize() 82 | pi.restore_model(extra_args.trained_policy, session=sess) 83 | else: 84 | print('Invalid model path \'{0}\', use dummy agent'.format(extra_args.trained_policy)) 85 | pi = DummyPolicy('pi', ob_space, ac_space) 86 | 87 | time_total = 0. 88 | n_steps_global = -1 89 | for i_episode in range(extra_args.max_episodes): 90 | print('Episode {0}'.format(i_episode)) 91 | time_episode_begin = time.time() 92 | ob = policy_observation_filter(ob) 93 | n_steps_global += 1 94 | if extra_args.render: 95 | env.render() 96 | save_render(n_steps_global) 97 | done = False 98 | ep_rew = 0. 
99 | i_step = 0 100 | time.sleep(extra_args.play_step_duration) 101 | 102 | while not done: 103 | action, vpred = pi.act(True, ob) 104 | ob, rew, done, info = env.step(action) 105 | ob = policy_observation_filter(ob) 106 | ep_rew += rew 107 | i_step += 1 108 | n_steps_global += 1 109 | if extra_args.render: 110 | env.render() 111 | save_render(n_steps_global) 112 | time.sleep(extra_args.play_step_duration) 113 | episode_lengths[i_episode] = i_step 114 | episode_rewards[i_episode] = ep_rew 115 | time_episode = time.time() - time_episode_begin 116 | time_total += time_episode 117 | print(' Episode length: {0} (average {1:.1f}), episode reward {2:.1f} (average {5:.1f}), duration {3:.1f} ms (average {4:.1f})'.format(i_step, np.average(episode_lengths[:i_episode+1]), ep_rew, 1000.*time_episode, 1000.*time_total/(i_episode+1), np.average(episode_rewards[:i_episode+1]))) 118 | ob = env.reset() 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /ceres/tools/math/qpsolver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | 7 | class QPSolver(object): 8 | ''' 9 | A base class to interface with QP solvers 10 | ''' 11 | 12 | def __init__(self, n_var=None, verbose=False): 13 | self.n_var = n_var 14 | self.verbose = verbose 15 | self.reset() 16 | 17 | def reset(self, do_reset_obj=True, do_reset_eq=True, do_reset_ineq=True): 18 | if do_reset_obj: 19 | self.reset_obj() 20 | if do_reset_eq: 21 | self.reset_eq() 22 | if do_reset_ineq: 23 | self.reset_ineq() 24 | 25 | def update(self): 26 | self.build_obj() 27 | self.build_eq() 28 | self.build_ineq() 29 | self.update_solver_specific() 30 | 31 | def reset_eq(self): 32 | self.eq_mat_list = [] 33 | self.eq_vec_list = [] 34 | self.eq_mat = None 35 | self.eq_vec = None 36 | self.n_eq = 0 37 | self.reset_eq_solver_specific() 38 | 39 | def reset_ineq(self): 40 | self.ineq_mat_list = [] 41 | self.ineq_vec_list = [] 42 | self.ineq_mat = None 43 | self.ineq_vec = None 44 | self.n_ineq = 0 45 | self.reset_ineq_solver_specific() 46 | 47 | def reset_obj(self): 48 | self.obj_mat_list = [] 49 | self.obj_vec_list = [] 50 | self.obj_mat = None 51 | self.obj_vec = None 52 | self.n_obj = 0 53 | self.reset_obj_solver_specific() 54 | 55 | def check_mat_vec(self, mat, vec): 56 | ''' 57 | Ensure that mat and vec are numpy arrays and of appropriate dimensions 58 | ''' 59 | mat = np.array(mat) 60 | vec = np.array(vec) 61 | if self.n_var is None: 62 | self.n_var = mat.shape[1] 63 | else: 64 | assert mat.shape[1] == self.n_var, 'Invalid constraint matrix size {0} for {1} variables'.format(mat.shape, self.n_var) 65 | assert mat.ndim == 2, 'Invalid constraint matrix dimensions: expected 2, got {0}'.format(mat.ndim) 66 | assert vec.ndim == 2, 'Invalid constraint vector dimensions: expected 2, got {0}'.format(vec.ndim) 67 | assert mat.shape[0] == vec.shape[0], 'Inconsistent constraint matrix and vector sizes' 68 | assert vec.shape[1] == 1, 'Invalid constraint vector size {0}, should have one column'.format(mat.shape) 69 | return mat, vec 70 | 71 | def add_obj(self, mat, vec, build=False): 72 | mat, vec = self.check_mat_vec(mat, vec) 73 | assert mat.shape[0] == mat.shape[1], 'Invalid objective matrix shape {0}, should be 
square'.format(mat.shape) 74 | self.obj_mat_list.append(mat) 75 | self.obj_vec_list.append(vec) 76 | if build: 77 | self.build_obj() 78 | 79 | def build_obj(self): 80 | self.n_obj = len(self.obj_mat_list) 81 | assert self.n_obj > 0 82 | self.obj_mat = sum(self.obj_mat_list) 83 | self.obj_vec = sum(self.obj_vec_list) 84 | self.build_obj_solver_specific() 85 | 86 | def add_eq(self, mat, vec, build=False): 87 | mat, vec = self.check_mat_vec(mat, vec) 88 | self.eq_mat_list.append(mat) 89 | self.eq_vec_list.append(vec) 90 | if build: 91 | self.build_eq() 92 | 93 | def build_eq(self): 94 | if len(self.eq_mat_list) > 0: 95 | self.eq_mat = np.concatenate(self.eq_mat_list, axis=0) 96 | self.eq_vec = np.concatenate(self.eq_vec_list, axis=0) 97 | self.n_eq = self.eq_mat.shape[0] 98 | else: 99 | self.eq_mat = None 100 | self.eq_vec = None 101 | self.n_eq = 0 102 | self.build_eq_solver_specific() 103 | 104 | def add_ineq(self, mat, vec, build=False): 105 | if (mat is None) or (vec is None): 106 | assert (mat is None) and (vec is None), 'Constraint incomplete: mat={0}, vec={1}'.format(mat, vec) 107 | return 108 | mat, vec = self.check_mat_vec(mat, vec) 109 | n_ineq_loc = mat.shape[0] 110 | if n_ineq_loc > 0: 111 | self.ineq_mat_list.append(mat) 112 | self.ineq_vec_list.append(vec) 113 | if build: 114 | self.build_ineq() 115 | 116 | def build_ineq(self): 117 | if len(self.ineq_mat_list) > 0: 118 | self.ineq_mat = np.concatenate(self.ineq_mat_list, axis=0) 119 | self.ineq_vec = np.concatenate(self.ineq_vec_list, axis=0) 120 | self.n_ineq = self.ineq_mat.shape[0] 121 | else: 122 | self.ineq_mat = None 123 | self.ineq_vec = None 124 | self.n_ineq = 0 125 | self.build_ineq_solver_specific() 126 | 127 | def reset_obj_solver_specific(self): 128 | pass 129 | 130 | def reset_eq_solver_specific(self): 131 | pass 132 | 133 | def reset_ineq_solver_specific(self): 134 | pass 135 | 136 | def build_obj_solver_specific(self): 137 | pass 138 | 139 | def build_eq_solver_specific(self): 140 | pass 141 | 142 | def build_ineq_solver_specific(self): 143 | pass 144 | 145 | def update_solver_specific(self): 146 | pass 147 | 148 | def solve(self): 149 | raise NotImplementedError() 150 | 151 | 152 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
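# Loss naming scheme: a command-line flag such as "--cnet_loss pvm:1." (see
# examples/01-constraints-from-demonstrations.md) corresponds to an entry {'pvm': 1.0}
# in config.loss_weights. init_losses() below turns each non-zero entry into a call to
# the matching method _loss_<name> via getattr, and divides every term by the batch
# size except those listed in no_normalization (currently only the L2 weight penalty).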
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | 7 | class ConstraintLoss(object): 8 | ''' 9 | Define constraint network loss terms 10 | ''' 11 | 12 | no_normalization = ['_loss_l2'] # add loss functions that do not require division by batch size 13 | 14 | def __init__(self, network): 15 | ''' 16 | Loss terms are defined from constraint network variables 17 | ''' 18 | self.network = network 19 | self.init_losses() 20 | self.init_total_loss() 21 | 22 | def init_losses(self): 23 | ''' 24 | Initialize individual losses from a dictionary of loss names and weights, 25 | e.g., loss_weights = {'l2': 0.0001, 'positive_violation_max': 1.0, 'negative_satisfaction_min': 1.0} 26 | will call loss functions '_loss_l2', '_loss_positive_violation_max' and '_loss_negative_satisfaction_min' 27 | ''' 28 | self.losses = {} 29 | for loss_name, loss_weight in self.network.config.loss_weights.items(): 30 | if loss_weight != 0.: 31 | loss_func_name = '_loss_{0}'.format(loss_name) 32 | assert hasattr(self, loss_func_name), 'Undefined loss function {0}'.format(loss_func_name) 33 | loss = getattr(self, loss_func_name)() 34 | if not loss_func_name in self.no_normalization: 35 | loss = loss / self.network.batch_size_float 36 | self.losses[loss_name] = loss_weight * loss 37 | 38 | def init_total_loss(self): 39 | ''' 40 | Sum up individual loss terms if available, otherwise zero 41 | ''' 42 | loss_list = [v for k, v in self.losses.items()] 43 | if len(loss_list) == 0: 44 | print('Warning: no CNet loss defined') 45 | self.total_loss = 0. 46 | else: 47 | self.total_loss = tf.add_n(loss_list) 48 | 49 | def _loss_l2(self): 50 | ''' 51 | L2 norm of the neural network weights 52 | ''' 53 | assert len(self.network.model_weights) > 0 54 | loss = tf.add_n([tf.nn.l2_loss(w) for _k, w in self.network.model_weights.items()]) 55 | return loss 56 | 57 | def _loss_positive_violation_max(self, order=1): 58 | ''' 59 | Maximum violation margin for positive demonstrations, supports squaring 60 | ''' 61 | loss = self.network.ineq_violation_margin 62 | loss = tf.reduce_max(loss, axis=1) 63 | if order == 2: 64 | loss = tf.square(loss) 65 | loss = tf.multiply(self.network.is_positive, loss) 66 | loss = tf.reduce_sum(loss) 67 | return loss 68 | 69 | def _loss_pvm(self, order=1): 70 | ''' 71 | Shortname for positive violation max 72 | ''' 73 | pvm_loss = self._loss_positive_violation_max(order=order) 74 | return pvm_loss 75 | 76 | def _loss_pvm_1d(self): 77 | ''' 78 | Positive violation max, 1st order 79 | ''' 80 | return self._loss_pvm(order=1) 81 | 82 | def _loss_pvm_2d(self): 83 | ''' 84 | Positive violation max, squared 85 | ''' 86 | return self._loss_pvm(order=2) 87 | 88 | def _loss_positive_violation_norm(self, order=1): 89 | ''' 90 | Since we're seeking to zero all violation margins, we can minimize the total norm (L1 or L2) 91 | ''' 92 | loss = self.network.ineq_violation_margin 93 | if order == 2: 94 | loss = tf.square(loss) 95 | else: 96 | assert order == 1, 'Only order 1 and 2 supported' 97 | loss = tf.reduce_sum(loss, axis=1) 98 | loss = tf.multiply(self.network.is_positive, loss) 99 | loss = tf.reduce_sum(loss) 100 | return loss 101 | 102 | def _loss_pvn(self, order=1): 103 | ''' 104 | Shortname for positive violation norm 105 | ''' 106 | pvn_loss = self._loss_positive_violation_norm(order=order) 107 | return pvn_loss 108 | 109 | def _loss_pvn_1d(self): 110 | ''' 111 | Positive violation norm, L1 
norm 112 | ''' 113 | return self._loss_pvn(order=1) 114 | 115 | def _loss_pvn_2d(self): 116 | ''' 117 | Positive violation norm, L2 norm 118 | ''' 119 | return self._loss_pvn(order=2) 120 | 121 | def _loss_negative_satisfaction_min(self, order=1): 122 | ''' 123 | Minimum satisfaction margin for negative demonstrations, supports squaring 124 | ''' 125 | loss = self.network.ineq_satisfaction_margin 126 | loss = tf.reduce_min(loss, axis=1) 127 | if order == 2: 128 | loss = tf.square(loss) 129 | loss = tf.multiply(self.network.is_negative, loss) 130 | loss = tf.reduce_sum(loss) 131 | return loss 132 | 133 | def _loss_nsm(self, order=1): 134 | ''' 135 | Shortname for negative satisfaction min 136 | ''' 137 | nsm_loss = self._loss_negative_satisfaction_min(order=order) 138 | return nsm_loss 139 | 140 | def _loss_nsm_1d(self): 141 | ''' 142 | Negative satisfaction min, 1st order 143 | ''' 144 | return self._loss_nsm(order=1) 145 | 146 | def _loss_nsm_2d(self): 147 | ''' 148 | Negative satisfaction min, squared 149 | ''' 150 | return self._loss_nsm(order=2) 151 | -------------------------------------------------------------------------------- /examples/01-constraints-from-demonstrations.md: -------------------------------------------------------------------------------- 1 | # Guiding exploration with constraints from demonstrations 2 | 3 | We consider an environment in which an agent navigates through a fixed maze to reach a target. 4 | The starting position of the agent and the position of the target are randomized at each episode. 5 | 6 | Visualize the environment with a random policy: 7 | ``` 8 | python3 -m ceres.scripts.play_policy --env_id Nav2dPosFixedMazeCeres-v0 \ 9 | --max_episodes 1000 --render 10 | ``` 11 | 12 | 13 | Arguments: 14 | * ```--env_id Nav2dPosFixMazeCeres-v0```: environment name (can load environments from modules other than ```ceres``` with the extended argument format ```:```) 15 | * ```--max_episodes 1000```: play a random policy for 1000 episodes 16 | * ```--render```: (optional) render to screen 17 | 18 | ## Learning constraints from demonstrations 19 | 20 | 500 positive trajectories with negative demonstrations were collected in ```data/Nav2dFixedMaze-500T.npz``` 21 | 22 | Visualize the demonstrations with: 23 | ``` 24 | python3 -m ceres.constraints.constraint_demonstration --env_id Nav2dPosFixedMazeCeres-v0 \ 25 | --constraint_demonstration_buffer data/Nav2dFixedMaze-500T.npz --render 26 | ``` 27 | Possible commands to replay demonstrations within the environment are described in the terminal. 28 | 29 | Train a constraint network using the ground truth data with: 30 | ``` 31 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosFixedMazeCeres-v0 \ 32 | --constraint_demonstration_buffer data/Nav2dFixedMaze-500T.npz \ 33 | --cnet_n_ineq 2 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 34 | --cnet_spherical_coordinates --cnet_predict_interior_point \ 35 | --cnet_training_epochs 1000 --cnet_decay_epochs 10 --early_stop_positive 0.99 --early_stop_negative 0.99 \ 36 | --max_iter 1 --only_train_constraints --output maze_cnet 37 | ``` 38 | 39 | Arguments: 40 | * ```--constraint_demonstration_buffer data/Nav2dFixedMaze-500T.npz```: use existing demonstration buffer 41 | * ```--cnet_n_ineq 2```: 2 inequality constraints 42 | * ```--cnet_loss pvm:1. --cnet_loss nsm:1. 
--cnet_loss l2:1e-6```: loss weights for positive violation max, negative satisfaction min, L2 regularization 43 | * ```--cnet_spherical_coordinates```: predict unit-norm constraints using spherical coordinates (alternatively, use ```--cnet_normalize_ineq_mat``` for post-normalization) 44 | * ```--cnet_predict_interior_point```: predict constraints such that there exists an interior point that satisfies them all 45 | * ```--cnet_training_epochs 1000```: train the constraint network over 1000 epochs 46 | * ```--cnet_decay_epochs 10```: halve constraint network learning rate every 10 epochs without loss reduction 47 | * ```--early_stop_positive 0.99 --early_stop_negative 0.99```: interrupt constraint network training if it reaches 99% separation accuracy 48 | * ```--max_iter 1```: run one iteration of CERES training 49 | * ```--only_train_constraints```: only train constraint network within CERES, not policy 50 | * ```--output maze_cnet```: save logs in ```logs/maze_cnet```. If the directory already exists, remove it manually or run the script with ```--overwrite``` 51 | 52 | The trained constraint network will be saved in ```logs/maze_cnet/worker_0_direct/constraints```. 53 | 54 | Visualize the constraints with a random policy: 55 | ``` 56 | python3 -m ceres.scripts.play_policy --env_id Nav2dPosFixedMazeCeres-v0 \ 57 | --trained_cnet logs/maze_cnet/worker_0_direct/constraints --max_episodes 1000 --render 58 | ``` 59 | 60 | 61 | ## Baseline PPO 62 | 63 | Within CERES, disabling constraints and setting the number of recovery policies to zero amounts to training with PPO: 64 | ``` 65 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosFixedMazeCeres-v0 \ 66 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 67 | --max_iter 5000 --output maze_ppo_unconstrained 68 | ``` 69 | Arguments: 70 | * ```--only_train_policy```: only train policy, not constraints 71 | * ```--constant_constraint_activation 0.```: set the constraint activation probability to zero throughout training 72 | * ```--n_recovery 0```: do not train recovery agents 73 | * ```--max_iter 5000```: do reinforcement learning for 5000 iterations 74 | * Optionally, run with ```--render``` to visualize exploration and constraints. 75 | 76 | The trained policy will be saved in ```logs/maze_ppo_unconstrained/worker_0_direct/policy``` 77 | 78 | ## Applying constraints to guide PPO 79 | 80 | Apply the trained constraint network to restrict the exploration range: 81 | ``` 82 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosFixedMazeCeres-v0 \ 83 | --only_train_policy --constant_constraint_activation 1. 
--n_recovery 0 \ 84 | --trained_cnet logs/maze_cnet/worker_0_direct/constraints \ 85 | --max_iter 5000 --output maze_ppo_constrained 86 | ``` 87 | Arguments: 88 | * ```--constant_constraint_activation 1.```: always enable constraints 89 | * ```--trained_cnet logs/maze_cnet/worker_0_direct/constraints```: use the constraint network trained previously 90 | 91 | The trained policy will be saved in ```logs/maze_ppo_constrained/worker_0_direct/policy``` 92 | 93 | ## Compare rewards with and without constraints 94 | 95 | Plot the rewards during training: 96 | ``` 97 | python3 -m ceres.scripts.plot_rewards \ 98 | --plot_path "Unconstrained PPO=logs/maze_ppo_unconstrained/worker_0_direct" \ 99 | --plot_path "Constrained PPO=logs/maze_ppo_constrained/worker_0_direct" 100 | ``` 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /ceres/envs/ceres_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .resetter import ResetterEnvCeres 6 | from .constrained import ConstrainedEnvNetwork 7 | import numpy as np 8 | import gym 9 | 10 | class CeresEnv(ResetterEnvCeres, ConstrainedEnvNetwork): 11 | ''' 12 | Base class for CERES-compatible environments, with support for snapshotting and constraint prediction 13 | ''' 14 | 15 | # ResetterEnv parameters 16 | max_reference_trajectories = 1024 # -1 to load all available episodes 17 | 18 | ### Solver parameters 19 | return_zero_if_opt_fails = False 20 | has_ineq = True 21 | has_eq = False 22 | 23 | # Recovery parameters 24 | recovery_reward_alive = 1. 25 | info_key_constrained_action = 'constrained_action' 26 | 27 | is_ceres_initialized = False # by default, use base environment (no special reset or constraints) 28 | 29 | def init_ceres(self, is_recovery_mode=False): 30 | ''' 31 | Setup CERES-specific behavior. 32 | If this function is not called, function as the base environment, without constraints or recovery. 33 | ''' 34 | if not self.is_ceres_initialized: # avoid double initialization through main class super 35 | self.check_base_attributes() 36 | self.init_overloading() 37 | self.init_recovery() 38 | ResetterEnvCeres.__init__(self) 39 | ConstrainedEnvNetwork.__init__(self) 40 | self.is_ceres_initialized = True 41 | self.is_recovery_mode = is_recovery_mode 42 | assert self.max_reference_trajectories > 0 43 | self.enable_constraints = True 44 | self.set_constraint_activation_probability(1.) 45 | 46 | def set_constraint_activation_probability(self, val): 47 | self.constraint_activation_probability = val 48 | 49 | def check_base_attributes(self): 50 | # Old gym environments use _reset instead of reset directly 51 | if hasattr(self, 'reset'): 52 | self.reset_function_name = 'reset' 53 | else: 54 | assert hasattr(self, '_reset'), 'Could find neither \'reset\' nor \'_reset\' base function.' 55 | self.reset_function_name = '_reset' 56 | # Old gym environments use _step instead of step directly 57 | if hasattr(self, 'step'): 58 | self.step_function_name = 'step' 59 | else: 60 | assert hasattr(self, '_step'), 'Could find neither \'step\' nor \'_step\' base function.' 
61 | self.step_function_name = '_step' 62 | # The base environment also needs to be able to calculate snapshots and reset to given snapshots 63 | assert hasattr(self, 'calc_snapshot'), 'Could not find base calc_snapshot function' 64 | assert hasattr(self, 'reset_and_restore'), 'Could not find base reset_and_restore function' 65 | 66 | def init_overloading(self): 67 | ''' 68 | Replace base environment reset and step with CERES-specific functions 69 | ''' 70 | self.init_overloading_reset() 71 | self.init_overloading_step() 72 | 73 | def init_overloading_reset(self): 74 | self.reset_base = getattr(self, self.reset_function_name) 75 | setattr(self, self.reset_function_name, self.reset_ceres) 76 | 77 | def init_overloading_step(self): 78 | self.step_base = getattr(self, self.step_function_name) 79 | setattr(self, self.step_function_name, self.step_ceres) 80 | 81 | def step_ceres(self, action_raw): 82 | ''' 83 | Depending on the constraint activation probability, correct the input action, play the corrected action and update constraints. 84 | For recovery, change the reward and end condition. 85 | ''' 86 | do_enable_constraints_this_step = (self.constraint_activation_probability == 1.) or (np.random.rand() < self.constraint_activation_probability) 87 | if self.enable_constraints and do_enable_constraints_this_step: 88 | action_constrained, success, viol = self.correct_action(action_raw) 89 | else: 90 | action_constrained = action_raw 91 | state, reward, done, info = self.step_base(action_constrained) 92 | self.update_ineq_matrices(state) 93 | info[self.info_key_constrained_action] = action_constrained 94 | if self.is_recovery_mode: 95 | self.n_recovery_steps += 1 96 | is_max_recovery_steps = self.n_recovery_steps == self.max_recovery_steps 97 | if info[self.info_key_failure]: 98 | reward = self.recovery_reward_failure 99 | else: 100 | reward = self.recovery_reward_alive 101 | done = done or is_max_recovery_steps 102 | return state, reward, done, info 103 | 104 | def reset_ceres(self): 105 | ''' 106 | Restore a reference snapshot when available (e.g., in recovery mode), otherwise use the base environment reset, and predict new constraints 107 | ''' 108 | if len(self.reference_trajectories) == 0: 109 | state = self.reset_base() 110 | self.recovery_info = None 111 | else: 112 | snapshot, self.recovery_info = ResetterEnvCeres.get_random_reference_snapshot(self) 113 | state = self.reset_and_restore(snapshot=snapshot) 114 | self.update_ineq_matrices(state) 115 | self.n_recovery_steps = 0 116 | return state 117 | 118 | def init_recovery(self): 119 | required_base_env_attrs = ['max_recovery_steps', 'info_key_failure', 'info_key_success'] 120 | for _k in required_base_env_attrs: 121 | assert hasattr(self, _k), 'Undefined attribute {0} in base environment within {1}'.format(_k, type(self)) 122 | self.recovery_reward_failure = -self.max_recovery_steps 123 | 124 | -------------------------------------------------------------------------------- /ceres/networks/network_saver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
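# Typical lifecycle (a sketch of how the constraint network uses this class in
# ceres/baselines/ceres/run_continuous.py):
#   net.init_saver(backup_dir, session=sess, max_to_keep=1)   # once, after building the graph
#   net.save_model(global_step=step)                          # writes backup_dir/model-<step>.*
#   ...
#   net.restore_model(backup_dir, session=sess)               # loads the latest model-<step>
# restore_model() also accepts backup_network_id to load weights that were saved under a
# different variable-name prefix than network_id.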
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file as pticf 7 | import time 8 | import os 9 | 10 | class NetworkSaver(object): 11 | ''' 12 | A simple class implementing save and restore functions for neural networks in Tensorflow 13 | ''' 14 | 15 | model_basename = 'model' 16 | 17 | def __init__(self, network_id): 18 | self.network_id = network_id 19 | self.tf_var_prefix = '{0}/'.format(self.network_id) 20 | 21 | def get_var_name_mapping(self, backup_network_id=None): 22 | ''' 23 | Return a dict associating this network's variable names to trainable tensors. 24 | Optional argument backup_network_id allows loading weights that were saved under a different name from network_id 25 | ''' 26 | var_name_mapping = {} 27 | for v in tf.trainable_variables(): 28 | if self.tf_var_prefix == v.name[:len(self.tf_var_prefix)]: 29 | v_name_train = v.name 30 | if backup_network_id is not None: 31 | v_name_train = backup_network_id.join(v_name_train.split(self.network_id)) 32 | v_name_train = v_name_train.split(':')[0] 33 | var_name_mapping[v_name_train] = v 34 | return var_name_mapping 35 | 36 | def restore_model(self, path_restore, session=None, backup_network_id=None, verbose=True): 37 | ''' 38 | Restore trained weights 39 | ''' 40 | if session is None: 41 | assert hasattr(self, 'session'), 'Either pass session as argument or set during saver initialization' 42 | session = self.session 43 | var_name_mapping = self.get_var_name_mapping(backup_network_id=backup_network_id) 44 | saver = tf.train.Saver(var_name_mapping) 45 | path_model = self.get_latest_model(path_restore, model_basename=self.model_basename) 46 | try: 47 | saver.restore(session, path_model) 48 | print('Restored network: {0}'.format(path_model)) 49 | except Exception as e: 50 | print('Could not restore {0} from checkpoint: {1}'.format(type(self), path_model)) 51 | print('This is the content of the checkpoint file:') 52 | pticf(file_name=path_model, tensor_name='', all_tensors=False) 53 | raise e 54 | 55 | def init_saver(self, path_backup_dir, session=None, max_to_keep=1): 56 | ''' 57 | Build backup path for future use 58 | ''' 59 | if session is not None: 60 | self.session = session 61 | os.makedirs(path_backup_dir, exist_ok=True) 62 | self.path_model = os.path.join(path_backup_dir, self.model_basename) 63 | var_name_mapping = self.get_var_name_mapping() 64 | var_to_save = [_e for _k, _e in var_name_mapping.items()] 65 | self.saver = tf.train.Saver(var_to_save, max_to_keep=max_to_keep) 66 | 67 | def save_model(self, global_step=None, verbose=True, path_model=None, session=None): 68 | ''' 69 | Save model from given session to given path if specified, 70 | else take these from previous init_saver call 71 | ''' 72 | if path_model is None: 73 | assert hasattr(self, 'path_model'), 'Specify path_model or set it at initialization' 74 | path_model = self.path_model 75 | if session is None: 76 | assert hasattr(self, 'path_model'), 'Specify session or set it at initialization' 77 | session = self.session 78 | if global_step is None: 79 | self.saver.save(session, self.path_model) 80 | else: 81 | self.saver.save(session, self.path_model, global_step=global_step) 82 | if verbose: 83 | print('Save network: {0}'.format(path_model)) 84 | 85 | @classmethod 86 | def get_latest_model(cls, path_model, model_basename='model'): 87 | ''' 88 | Check 
for files of the form - and return the most recent 89 | ''' 90 | model_index_extension = '.index' 91 | 92 | if os.path.isdir(path_model): 93 | path_model = os.path.join(path_model, model_basename) 94 | path_model_full = path_model + model_index_extension 95 | 96 | if not os.path.isfile(path_model_full): 97 | # Check for files of the form model-1000.index 98 | path_model_dirname = os.path.dirname(path_model_full) 99 | model_basename = os.path.basename(path_model) 100 | files_in_dir = os.listdir(path_model_dirname) 101 | path_model_candidates = [] 102 | model_iter_numbers = [] 103 | for _f in files_in_dir: 104 | if _f[:len(model_basename)] != model_basename: 105 | continue 106 | if _f[-len(model_index_extension):] != model_index_extension: 107 | continue 108 | _f_base = _f[:-len(model_index_extension)] 109 | _f_base_split = _f_base.split('-') 110 | assert (_f_base_split[0] == model_basename) and (len(_f_base_split) == 2), 'Invalid file {0}, expected {1}-{2}'.format(_f, model_basename, model_index_extension) 111 | i_iter = int(_f_base_split[1]) 112 | model_iter_numbers.append(i_iter) 113 | assert len(model_iter_numbers) > 0, 'Cannot find any model candidate in directory {0}'.format(path_model_dirname) 114 | model_basename = '{0}-{1}'.format(model_basename, max(model_iter_numbers)) 115 | path_model = os.path.join(path_model_dirname, model_basename) 116 | path_model_full = path_model + model_index_extension 117 | assert os.path.isfile(path_model_full), 'Model backup file does not exist: {0}'.format(path_model_full) 118 | return path_model 119 | 120 | def build_model(self, *args, **kwargs): 121 | raise NotImplementedError('Implement build_model in child classes') 122 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/run_continuous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | from mpi4py import MPI 7 | from baselines.common.mpi_fork import mpi_fork 8 | from baselines.common import tf_util as U 9 | from baselines import logger 10 | from . import pposgd_ceres 11 | 12 | from baselines.common.cmd_util import make_mujoco_env 13 | from .mlp_policy_saver import MlpPolicySaver 14 | from ceres.envs import CeresEnv 15 | from ceres import ConstraintNetworkMLP, ConstraintConfig 16 | from ceres import ConstraintDemonstrationBuffer 17 | 18 | def build_log_dirs(path_xp, rank, is_direct_policy): 19 | worker_name = 'worker_{0}_{1}'.format(rank, 'direct' if is_direct_policy else 'recovery') 20 | worker_dir = os.path.join(path_xp, worker_name) 21 | worker_policy_dir = os.path.join(worker_dir, 'policy') 22 | worker_constraints_dir = os.path.join(worker_dir, 'constraints') 23 | return worker_name, worker_dir, worker_policy_dir, worker_constraints_dir 24 | 25 | def main(): 26 | ''' 27 | Initialize CERES environment and launch policy and constraint learning 28 | or restart from a previous training session. 
29 | ''' 30 | from ceres.tools import ExtraArgs 31 | log_root = os.path.join(os.getcwd(), 'logs') 32 | extra_args = ExtraArgs(log_root=log_root) 33 | 34 | n_agents_total = extra_args.n_direct + extra_args.n_recovery 35 | whoami = mpi_fork(n_agents_total) 36 | if whoami == "parent": 37 | return 38 | sess = U.single_threaded_session() 39 | sess.__enter__() 40 | 41 | # Synchronize log directory between agents 42 | rank = MPI.COMM_WORLD.Get_rank() 43 | is_direct_policy = rank < extra_args.n_direct 44 | if rank == 0: 45 | path_xp = extra_args.path_xp 46 | for dest_rank in range(n_agents_total): 47 | send_buffer = [path_xp] 48 | MPI.COMM_WORLD.send(send_buffer, dest=dest_rank, tag=rank) 49 | else: 50 | recv_buffer = MPI.COMM_WORLD.recv(source=0, tag=0) 51 | path_xp = recv_buffer[0] 52 | 53 | # Find root processes for direct and recovery 54 | if is_direct_policy: 55 | root_rank = 0 56 | else: 57 | root_rank = extra_args.n_direct 58 | 59 | worker_name, worker_dir, worker_policy_dir, worker_constraints_dir = build_log_dirs(path_xp, rank, is_direct_policy) 60 | logger.configure(dir=worker_dir) 61 | 62 | if not rank == 0: # only log first direct 63 | logger.set_level(logger.DISABLED) 64 | 65 | workerseed = extra_args.seed + 10000 * rank 66 | assert len(extra_args.env_id) > 0, 'Missing argument --env_id' 67 | env = make_mujoco_env(extra_args.env_id, workerseed) 68 | assert isinstance(env.unwrapped, CeresEnv), 'Env {0} should be an instance of CeresEnv'.format(type(env)) 69 | env.unwrapped.init_ceres(is_recovery_mode=(not is_direct_policy)) 70 | 71 | # Setup restoration parameters from previous logs 72 | if len(extra_args.continue_ceres_training) > 0: 73 | assert os.path.isdir(extra_args.continue_ceres_training), 'Could not find log directory: {0}'.format(extra_args.continue_ceres_training) 74 | # All direct share one policy, all recovery share another 75 | _, _, extra_args.trained_policy, _ = build_log_dirs(extra_args.continue_ceres_training, root_rank, is_direct_policy) 76 | # All agents share a single constraint network 77 | _, _, _, extra_args.trained_cnet = build_log_dirs(extra_args.continue_ceres_training, 0, True) 78 | # All agents have separate demonstration buffers 79 | _, _, _, extra_args.constraint_demonstration_buffer = build_log_dirs(extra_args.continue_ceres_training, rank, is_direct_policy) 80 | 81 | def policy_fn(name, ob_space, ac_space): 82 | policy = MlpPolicySaver(name, ob_space=ob_space, ac_space=ac_space, 83 | hid_size=extra_args.policy_hidden_size, num_hid_layers=extra_args.policy_hidden_layers) 84 | policy.init_saver(worker_policy_dir, session=sess, max_to_keep=extra_args.backup_keep) 85 | return policy 86 | 87 | # Initialize backup directories 88 | os.makedirs(worker_constraints_dir, exist_ok=True) 89 | if len(extra_args.trained_cnet) > 0: 90 | cnet_config = ConstraintConfig.from_backup(extra_args.trained_cnet) 91 | else: 92 | cnet_config = ConstraintConfig.from_extra_args(extra_args) 93 | if rank == 0: 94 | cnet_config.save(worker_constraints_dir) 95 | cnet = ConstraintNetworkMLP(env.observation_space, env.action_space, cnet_config) 96 | cnet.init_saver(worker_constraints_dir, session=sess, max_to_keep=extra_args.backup_keep) 97 | env.unwrapped.init_constraint_prediction(cnet, session=sess) 98 | 99 | constraint_demonstration_buffer = ConstraintDemonstrationBuffer(extra_args.constraint_demonstration_buffer_size) 100 | constraint_demonstration_buffer.init_saver(worker_constraints_dir) 101 | 102 | # Check end criterion 103 | possible_end_criteria = ['max_iterations', 
'max_timesteps', 'max_episodes', 'max_seconds'] 104 | active_end_criteria = [_k for _k in possible_end_criteria if getattr(extra_args, _k) > 0] 105 | n_end_criteria = len(active_end_criteria) 106 | if extra_args.max_iterations == 0: 107 | raise ValueError('Specify one end criterion out of {0}'.format(possible_end_criteria)) 108 | else: 109 | assert n_end_criteria == 1, 'Only one time constraint permitted but {0} specified: {1}'.format(n_end_criteria, active_end_criteria) 110 | 111 | # Start training! 112 | pposgd_ceres.learn(env, policy_fn, 113 | max_timesteps=extra_args.max_timesteps, 114 | max_iters=extra_args.max_iterations, 115 | max_episodes=extra_args.max_episodes, 116 | max_seconds=extra_args.max_seconds, 117 | timesteps_per_actorbatch=extra_args.timesteps_per_actorbatch, 118 | clip_param=0.2, entcoeff=extra_args.policy_entcoeff, 119 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, 120 | gamma=0.99, lam=0.95, schedule=extra_args.policy_learning_rate_schedule, 121 | extra_args=extra_args, cnet=cnet, constraint_demonstration_buffer=constraint_demonstration_buffer, 122 | ) 123 | env.close() 124 | 125 | if rank == 0: 126 | print('Done! Logs are located in {0}'.format(path_xp)) 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/obstacles.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | 7 | class Obstacle(object): 8 | ''' 9 | Base class for obstacles, with export/import functions 10 | ''' 11 | required_parameters = [] 12 | def __init__(self, **kwargs): 13 | for _k in self.required_parameters: 14 | setattr(self, _k, kwargs[_k]) 15 | self.check_parameters() 16 | 17 | def to_array(self): 18 | params = [getattr(self, _k) for _k in self.required_parameters] 19 | return params 20 | 21 | @classmethod 22 | def FromArray(cls, params): 23 | assert len(params) == len(cls.required_parameters) 24 | params_as_dict = {_k: _v for _k, _v in zip(cls.required_parameters, params)} 25 | obstacle = cls(**params_as_dict) 26 | return obstacle 27 | 28 | class ObstacleSquare(Obstacle): 29 | ''' 30 | Square obstacle, initialized from the 2D location of its top-left and bottom-right corners 31 | ''' 32 | required_parameters = ['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y'] 33 | 34 | def check_parameters(self): 35 | assert self.top_left_x < self.bottom_right_x 36 | assert self.top_left_y > self.bottom_right_y 37 | self.bottom_left_x = self.top_left_x 38 | self.bottom_left_y = self.bottom_right_y 39 | self.top_right_x = self.bottom_right_x 40 | self.top_right_y = self.top_left_y 41 | 42 | def test_collision(self, x, y, conservative=False, min_distance=0.): 43 | # Set conservative=True to count border as collision 44 | x_proj, y_proj, is_strictly_inside = self.project(x, y) 45 | if is_strictly_inside: 46 | return True 47 | else: 48 | dist = np.linalg.norm(np.array([x, y]) - np.array([x_proj, y_proj])) 49 | if conservative: 50 | is_collision = dist <= min_distance 51 | else: 52 | is_collision = dist < min_distance 53 | return is_collision 54 | 55 | def to_polygon(self): 56 | path_closed = [] 57 | path_closed.append((self.top_left_x, self.top_left_y)) 58 | path_closed.append((self.bottom_left_x, self.bottom_left_y)) 59 | 
path_closed.append((self.bottom_right_x, self.bottom_right_y)) 60 | path_closed.append((self.top_right_x, self.top_right_y)) 61 | return path_closed 62 | 63 | def project(self, x, y): 64 | strict_inside_x = False 65 | strict_inside_y = False 66 | if x >= self.bottom_right_x: 67 | x_proj = self.bottom_right_x 68 | elif x <= self.bottom_left_x: 69 | x_proj = self.bottom_left_x 70 | else: 71 | x_proj = x 72 | strict_inside_x = True 73 | if y >= self.top_left_y: 74 | y_proj = self.top_left_y 75 | elif y <= self.bottom_left_y: 76 | y_proj = self.bottom_left_y 77 | else: 78 | y_proj = y 79 | strict_inside_y = True 80 | is_strictly_inside = strict_inside_x and strict_inside_y 81 | return x_proj, y_proj, is_strictly_inside 82 | 83 | def intersection_with_line(self, p1, p2): 84 | raise NotImplementedError('Intersection between square and line not implemented') 85 | 86 | 87 | class ObstacleCircle(Obstacle): 88 | ''' 89 | Circle obstacle, initialized from the 2D location of its center and its radius 90 | ''' 91 | required_parameters = ['center_x', 'center_y', 'radius'] 92 | 93 | def check_parameters(self): 94 | assert self.radius > 0. 95 | self.center_xy = np.array([self.center_x, self.center_y]) 96 | self.intersection_line_shift = np.dot(self.center_xy, self.center_xy) - self.radius**2 # use this when computing intersection with line 97 | 98 | def test_collision(self, x, y, conservative=False, min_distance=0.): 99 | # Set conservative=True to count border as collision 100 | x_proj, y_proj, is_strictly_inside = self.project(x, y) 101 | if is_strictly_inside: 102 | return True 103 | else: 104 | dist = np.linalg.norm(np.array([x, y]) - np.array([x_proj, y_proj])) 105 | if conservative: 106 | is_collision = dist <= min_distance 107 | else: 108 | is_collision = dist < min_distance 109 | return is_collision 110 | 111 | def to_polygon(self): 112 | raise NotImplementedError('Use circle drawing function') 113 | 114 | def project(self, x, y): 115 | center_to_point = np.array([x - self.center_x, y - self.center_y]) 116 | dist_from_center = np.linalg.norm(center_to_point) 117 | is_strictly_inside = dist_from_center < self.radius 118 | if is_strictly_inside: 119 | x_proj, y_proj = x, y 120 | else: 121 | center_to_surface = center_to_point / dist_from_center * self.radius 122 | x_proj = self.center_x + center_to_surface[0] 123 | y_proj = self.center_y + center_to_surface[1] 124 | return x_proj, y_proj, is_strictly_inside 125 | 126 | def intersection_with_line(self, p1, p2): 127 | ''' 128 | Solve quadratic equation a x^2 + b x + c = 0 129 | with a = np.dot(v, v) with v unit vector between p1 and p2, 130 | b = 2 np.dot(v, p1 - center) 131 | c = np.dot(p1, p1) + np.dot(center, center) - 2 np.dot(p1, center) - radius^2 132 | ''' 133 | p1 = np.array(p1) 134 | p2 = np.array(p2) 135 | unit_vec = p2 - p1 136 | dist = np.linalg.norm(unit_vec) 137 | assert dist > 0. 138 | unit_vec /= dist 139 | a = np.dot(unit_vec, unit_vec) 140 | b = 2. * np.dot(unit_vec, p1 - self.center_xy) 141 | c = np.dot(p1, p1) - 2. * np.dot(p1, self.center_xy) + self.intersection_line_shift 142 | 143 | delta = b**2 - 4.* a * c 144 | if delta < 0: 145 | return False, None 146 | delta_sqrt = np.sqrt(delta) 147 | # Two solutions: x1, x2 148 | x1 = (-b - delta_sqrt) / (2. * a) 149 | x2 = (-b + delta_sqrt) / (2. * a) 150 | if 0. <= x1 <= dist: 151 | x_min = x1 152 | elif 0. 
<= x2 <= dist: 153 | x_min = x2 154 | else: 155 | #x_min = min(x1, x2) 156 | return False, None 157 | closest = p1 + x_min * unit_vec 158 | return True, closest 159 | 160 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/pposgd_ceres_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from baselines import logger 6 | import baselines.common.tf_util as U 7 | import numpy as np 8 | import time 9 | from mpi4py import MPI 10 | import gym 11 | 12 | def update_constraint_activation_probability(env, extra_args, logger, is_direct_policy, do_train_cnet, 13 | activation_probability_before, activation_probability_after): 14 | ''' 15 | Update environment constraint activation probability using constraint accuracy before or after training 16 | ''' 17 | activation_probability = extra_args.constant_constraint_activation 18 | if len(extra_args.adaptive_constraint_activation) > 0: 19 | do_use_prior_accuracy_as_activation_probability = 'prior' in extra_args.adaptive_constraint_activation 20 | if do_use_prior_accuracy_as_activation_probability or (not do_train_cnet): 21 | activation_probability = activation_probability_before 22 | else: 23 | activation_probability = activation_probability_after 24 | if (not is_direct_policy) and extra_args.unconstrained_recovery: 25 | activation_probability = 0. 26 | if activation_probability is not None: 27 | logger.log('Set constraint activation probability to {0:.1f} %'.format(activation_probability * 100.)) 28 | env.unwrapped.set_constraint_activation_probability(activation_probability) 29 | 30 | def check_time_between_backups(extra_args, last_backup_time=None): 31 | ''' 32 | Only write backups every min_time_between_backups 33 | ''' 34 | time_now = time.time() 35 | if last_backup_time is not None: 36 | time_since_last = time_now - last_backup_time 37 | do_save_backup = time_since_last > extra_args.min_time_between_backups 38 | else: 39 | do_save_backup = True 40 | if do_save_backup: 41 | last_backup_time = time_now 42 | return do_save_backup, last_backup_time 43 | 44 | def build_policy_observation_filter(extra_args, ob_space): 45 | ''' 46 | If extra_args.policy_observation_filter is a string of the form "1:3:6", only provide the policy with observations number 1, 3 and 6 47 | ''' 48 | if len(extra_args.policy_observation_filter) == 0: 49 | observation_filter = lambda ob: ob 50 | ob_space_filtered = ob_space 51 | else: 52 | indices = [int(_v) for _v in extra_args.policy_observation_filter.split(':')] 53 | observation_filter = lambda ob: np.array([ob[_i] for _i in indices], dtype=ob.dtype) 54 | low_filtered = observation_filter(ob_space.low) 55 | high_filtered = observation_filter(ob_space.high) 56 | ob_space_filtered = gym.spaces.Box(low=low_filtered, high=high_filtered, dtype=ob_space.dtype) 57 | return ob_space_filtered, observation_filter 58 | 59 | def build_mpi_vars(extra_args): 60 | ''' 61 | Initialize process indices across direct and recovery agents 62 | ''' 63 | mpi_comm = MPI.COMM_WORLD 64 | mpi_rank = mpi_comm.Get_rank() 65 | is_direct_policy = mpi_rank < extra_args.n_direct 66 | 67 | mpi_root_direct = 0 68 | mpi_group_direct = list(range(extra_args.n_direct)) 69 | mpi_root_recovery = extra_args.n_direct 70 | mpi_group_recovery = 
list(range(extra_args.n_direct, extra_args.n_direct + extra_args.n_recovery)) 71 | if is_direct_policy: 72 | mpi_root = mpi_root_direct 73 | mpi_group = mpi_group_direct 74 | else: 75 | mpi_root = mpi_root_recovery 76 | mpi_group = mpi_group_recovery 77 | mpi_destinations = [_e for _e in mpi_group if _e != mpi_root] 78 | mpi_n_processes = extra_args.n_direct + extra_args.n_recovery 79 | is_root = mpi_rank == mpi_root 80 | 81 | if extra_args.n_recovery > 0: 82 | # Correspondences between direct and recovery agents for CNet data exchange 83 | cnet_exchange_ids = {_i: [] for _i in mpi_group_direct + mpi_group_recovery} 84 | for _i in range(max(len(mpi_group_direct), len(mpi_group_recovery))): 85 | _i_direct = mpi_group_direct[_i % len(mpi_group_direct)] 86 | _i_recovery = mpi_group_recovery[_i % len(mpi_group_recovery)] 87 | if not (_i_recovery in cnet_exchange_ids[_i_direct]): 88 | cnet_exchange_ids[_i_direct].append(_i_recovery) 89 | if not (_i_direct in cnet_exchange_ids[_i_recovery]): 90 | cnet_exchange_ids[_i_recovery].append(_i_direct) 91 | 92 | # Also get the index of each recovery process within those associated to the corresponding direct process (re-read this several times) 93 | cnet_recovery_id_in_direct_exchange_ids = {_i: {} for _i in mpi_group_recovery} 94 | for _i_recovery in mpi_group_recovery: 95 | for _i_direct in cnet_exchange_ids[_i_recovery]: 96 | cnet_recovery_id_in_direct_exchange_ids[_i_recovery][_i_direct] = cnet_exchange_ids[_i_direct].index(_i_recovery) 97 | n_exchange_processes = len(cnet_exchange_ids[mpi_rank]) 98 | else: 99 | cnet_exchange_ids = None 100 | cnet_recovery_id_in_direct_exchange_ids = None 101 | n_exchange_processes = None 102 | 103 | return mpi_comm, mpi_rank, is_direct_policy, mpi_root, mpi_group, mpi_destinations, mpi_n_processes, is_root, cnet_recovery_id_in_direct_exchange_ids, cnet_exchange_ids, n_exchange_processes 104 | 105 | def save_models_and_data(extra_args, iters_so_far, end_training, last_backup_time, 106 | is_root, mpi_rank, pi, cnet, constraint_demonstration_buffer): 107 | ''' 108 | Save policy network, constraint network and constraint demonstration buffer 109 | ''' 110 | do_save_at_all = extra_args.backup_frequency > 0 111 | do_save_this_iter = (((iters_so_far - 1) % extra_args.backup_frequency) == 0) or end_training 112 | do_save_this_time, last_backup_time = check_time_between_backups(extra_args, last_backup_time) 113 | do_save_policy = not extra_args.only_train_constraints 114 | do_save_constraints = not extra_args.only_train_policy 115 | do_save_buffer = not (extra_args.only_train_policy or extra_args.only_train_constraints) 116 | if do_save_at_all and do_save_this_iter and do_save_this_time: 117 | if do_save_policy and is_root: 118 | # save direct and recovery policies separatery 119 | pi.save_model(global_step=(iters_so_far-1), verbose=True) 120 | if do_save_constraints and (mpi_rank == 0): 121 | # same CNet for all agents 122 | cnet.save_model(global_step=(iters_so_far-1), verbose=True) 123 | if do_save_buffer: 124 | # different buffers for all agents 125 | constraint_demonstration_buffer.write(verbose=is_root) 126 | return last_backup_time 127 | -------------------------------------------------------------------------------- /ceres/envs/resetter/resetter_env_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
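# How this fits into CERES (see ceres/envs/ceres_env.py above): whenever reference
# trajectories are available (e.g., in recovery mode), CeresEnv.reset_ceres() asks this
# class for a random reference snapshot; get_random_reference_index() deliberately returns
# trajectory midpoints, so such episodes restart from states halfway along previously
# recorded trajectories rather than from the environment's default initial state.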
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | from .resetter_env import ResetterEnv 7 | 8 | class ResetterEnvCeres(ResetterEnv): 9 | ''' 10 | Resetter base environment implementing functions required by the CERES logic, 11 | e.g., reset from trajectory midpoints, remove identified demonstrations, etc. 12 | ''' 13 | max_reference_trajectories = -1 # Set this to a non negative value in a child class 14 | 15 | def _init_reference_parameters(self): 16 | ''' 17 | The maximum number of trajectories to keep is application-specific 18 | ''' 19 | assert self.max_reference_trajectories >= 0, 'max_reference_trajectories must be non negative: set it in class {0}'.format(type(self)) 20 | 21 | def _init_reference_trajectories(self): 22 | ''' 23 | Setup empty reference trajectory list 24 | ''' 25 | self.reference_trajectories = [] 26 | self.reset_count_per_trajectory = [] # these values are incremented outside the environment 27 | self.filter_reset_per_trajectory = [] # check for difference before incrementing 28 | 29 | def add_reference_trajectory(self, trajectory): 30 | ''' 31 | Add a new reference trajectory and generate new metadata 32 | ''' 33 | if len(self.reference_trajectories) < self.max_reference_trajectories: 34 | self.reference_trajectories.append(trajectory) 35 | self.reset_count_per_trajectory.append(0) 36 | self.filter_reset_per_trajectory.append(None) 37 | else: 38 | pass # Skip if full, add other behaviors in the future 39 | 40 | def get_random_reference_index(self): 41 | ''' 42 | Return trajectory midpoints 43 | ''' 44 | assert len(self.reference_trajectories) > 0, 'No active trajectory' 45 | i_traj = np.random.randint(0, len(self.reference_trajectories)) 46 | i_state = self.get_reference_trajectory_midpoint(i_traj) 47 | return i_traj, i_state 48 | 49 | def get_reference_trajectory(self, i_traj): 50 | return self.reference_trajectories[i_traj] 51 | 52 | def get_reference_trajectory_midpoint(self, i_traj): 53 | i_state = self.reference_trajectories[i_traj].get_midpoint() 54 | return i_state 55 | 56 | def remove_empty_trajectories(self): 57 | ''' 58 | Remove trajectories that have no active state 59 | ''' 60 | i_traj_active = [] 61 | for (i, traj) in enumerate(self.reference_trajectories): 62 | if traj.length_active > 0: 63 | i_traj_active.append(i) 64 | n_remove = len(self.reference_trajectories) - len(i_traj_active) 65 | self.reference_trajectories = [self.reference_trajectories[i] for i in i_traj_active] 66 | self.reset_count_per_trajectory = [self.reset_count_per_trajectory[i] for i in i_traj_active] 67 | self.filter_reset_per_trajectory = [self.filter_reset_per_trajectory[i] for i in i_traj_active] 68 | return n_remove 69 | 70 | def check_remove_traj(self, traj): 71 | ''' 72 | Check if the trajectory can be removed based on the number of active snapshots 73 | ''' 74 | do_remove_traj = traj.length_active == 0 75 | if traj.length_active == 1: # remove also if the only demonstration left is already classified 76 | demonstration = traj.get_demonstration(traj.active_demonstrations[0]) 77 | do_remove_traj = demonstration.test_is_classified() 78 | if do_remove_traj: 79 | #print('Final demonstration is already classified as {0}'.format(demonstration.action_indicator)) 80 | pass 81 | else: 82 | traj.do_reset_after_last_active = True 83 | return do_remove_traj 84 | 85 | def update_reference_trajectory(self, i_traj, is_resized, remove_if_emptied=False): 86 | ''' 87 
| Reset trajectory metadata and remove if applicable 88 | ''' 89 | traj = self.reference_trajectories[i_traj] 90 | if is_resized: 91 | self.reset_count_per_trajectory[i_traj] = 0 92 | self.filter_reset_per_trajectory[i_traj] = None 93 | if remove_if_emptied: 94 | if self.check_remove_traj(traj): 95 | self.reference_trajectories.pop(i_traj) 96 | self.reset_count_per_trajectory.pop(i_traj) 97 | self.filter_reset_per_trajectory.pop(i_traj) 98 | 99 | def get_reference_trajectory_active_demonstrations_from(self, i_traj, begin, remove_demonstrations=False, return_copy=True, remove_if_emptied=False): 100 | ''' 101 | Get a sub-trajectory starting from a given active demonstration and update metadata 102 | ''' 103 | traj = self.reference_trajectories[i_traj] 104 | subtraj, is_resized = traj.get_active_demonstrations_from(begin, remove_demonstrations=remove_demonstrations, return_copy=return_copy) 105 | self.update_reference_trajectory(i_traj, is_resized, remove_if_emptied=remove_if_emptied) 106 | return subtraj 107 | 108 | def get_reference_trajectory_active_demonstrations_to(self, i_traj, end, remove_demonstrations=False, return_copy=True, remove_if_emptied=False): 109 | ''' 110 | Get a sub-trajectory up to a given active demonstration and update metadata 111 | ''' 112 | traj = self.reference_trajectories[i_traj] 113 | subtraj, is_resized = traj.get_active_demonstrations_to(end, remove_demonstrations=remove_demonstrations, return_copy=return_copy) 114 | self.update_reference_trajectory(i_traj, is_resized, remove_if_emptied=remove_if_emptied) 115 | return subtraj 116 | 117 | def get_reference_trajectory_demonstration(self, i_traj, i_state, return_copy=True): 118 | ''' 119 | Get a chosen demonstration within a chosen trajectory, or copy thereof for separate processing 120 | ''' 121 | traj = self.reference_trajectories[i_traj] 122 | demonstration = traj.get_demonstration(i_state, return_copy=return_copy) 123 | return demonstration 124 | 125 | def increment_trajectory_reset_count(self, i_traj, increment=1, increment_reset_count_on_change=None): 126 | ''' 127 | Increment the number of times a trajectory has been reset too, unless a reset criterion is set 128 | ''' 129 | if increment_reset_count_on_change is not None: 130 | if increment_reset_count_on_change == self.filter_reset_per_trajectory[i_traj]: 131 | return 132 | else: 133 | self.filter_reset_per_trajectory[i_traj] = increment_reset_count_on_change 134 | self.reset_count_per_trajectory[i_traj] += increment 135 | 136 | def get_trajectory_reset_count(self, i_traj): 137 | return self.reset_count_per_trajectory[i_traj] 138 | 139 | -------------------------------------------------------------------------------- /ceres/envs/constrained/constrained_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import gym 6 | from ceres.tools.math import QPSolverQuadprog 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | class ConstrainedEnv(gym.Env): 11 | ''' 12 | Base class for constrained environments, with action correction prior to playing 13 | ''' 14 | 15 | return_zero_if_opt_fails = True 16 | constraint_violation_factor = 1. 17 | ineq_vec_margin = 0. 
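    # A minimal sketch of the quadratic-programming correction that correct_action() performs
    # below (projecting the policy action onto the feasible set), written here against the
    # quadprog package directly rather than the project's QPSolverQuadprog wrapper; a_target
    # and ineq_vec are assumed to be flat float arrays:
    #
    #     import numpy as np
    #     import quadprog
    #
    #     def project_action(a_target, ineq_mat, ineq_vec, margin=0.):
    #         # minimize 1/2 ||x - a_target||^2  subject to  ineq_mat @ x <= ineq_vec - margin,
    #         # i.e. objective matrices P = identity, q = -a_target as in update_obj_matrices()
    #         P = np.eye(a_target.size)
    #         # quadprog solves: min 1/2 x^T P x - a^T x  subject to  C^T x >= b
    #         C = -ineq_mat.T
    #         b = -(ineq_vec - margin)
    #         return quadprog.solve_qp(P, a_target, C, b)[0]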
18 | has_ineq = True 19 | has_eq = True 20 | 21 | def __init__(self, *args, **kwargs): 22 | self.init_solver() 23 | self.check_instance() 24 | self.ineq_mat = None 25 | self.ineq_vec = None 26 | 27 | def init_solver(self): 28 | ''' 29 | Correct actions with quadratic programming, define functions to implement within child classes 30 | ''' 31 | self.correct_in_env = True 32 | self.required_functions = ['update_ineq_matrices'] 33 | if self.has_ineq: 34 | self.required_functions.append('update_ineq_matrices') 35 | if self.has_eq: 36 | self.required_functions.append('update_eq_matrices') 37 | self.solver = QPSolverQuadprog() 38 | 39 | def check_instance(self): 40 | ''' 41 | Check necessary attributes from parent and child classes 42 | ''' 43 | for f_name in self.required_functions: 44 | assert hasattr(self, f_name), 'Required function {0} is not implemented in {1}'.format(f_name, type(self)) 45 | for attr_name in ['observation_space', 'action_space']: 46 | assert hasattr(self, attr_name), 'Undefined attribute {0}: make sure the base environment is initialized'.format(attr_name) 47 | n_obs = self.observation_space.shape[0] 48 | if hasattr(self, 'n_obs'): 49 | assert self.n_obs == n_obs, 'Found two different values of n_obs: {0} and {1}'.format(self.n_obs, n_obs) 50 | else: 51 | self.n_obs = n_obs 52 | n_act = self.action_space.shape[0] 53 | if hasattr(self, 'n_act'): 54 | assert self.n_act == n_act, 'Found two different values of n_act: {0} and {1}'.format(self.n_act, n_act) 55 | else: 56 | self.n_act = n_act 57 | 58 | def set_ineq_margin(self, margin_param, relative=True): 59 | ''' 60 | Define a margin that corrected actions must preserve w.r.t. constraints, that is, solve G x <= h - margin 61 | ''' 62 | self.ineq_vec_margin = margin_param 63 | if relative: 64 | ac_space_pm = [0.5*(high-low) for low, high in zip(self.action_space.low, self.action_space.high)] 65 | self.ineq_vec_margin *= min(ac_space_pm) 66 | assert self.ineq_vec_margin >= 0, 'Negative margin not supported, but you can disable this check to allow constraint violation' 67 | 68 | 69 | def update_solver(self, do_update_eq=True, do_update_ineq=True, do_update_obj=True): 70 | ''' 71 | Update QP solver parameters 72 | ''' 73 | self.solver.reset(do_reset_eq=do_update_eq, do_reset_ineq=do_update_ineq, do_reset_obj=do_update_obj) 74 | self.solver.add_obj(self.obj_mat, self.obj_vec) 75 | if self.has_ineq: 76 | ineq_vec_solve = self.ineq_vec - self.ineq_vec_margin # account for conservative margin 77 | self.solver.add_ineq(self.ineq_mat, ineq_vec_solve) 78 | if self.has_eq: 79 | self.solver.add_eq(self.eq_mat, self.eq_vec) 80 | self.solver.update() 81 | 82 | def correct_action(self, target_action, do_update_eq=True, do_update_ineq=True, do_update_obj=True): 83 | ''' 84 | Correct action by solving the QP and compute how much the uncorrected action violates the constraints 85 | ''' 86 | # Only rebuild objective function matrices since inequality matrices are already rebuilt at the end of each step 87 | if do_update_obj: 88 | self.update_obj_matrices(target_action) 89 | self.update_solver(do_update_eq=do_update_eq, do_update_ineq=do_update_ineq, do_update_obj=do_update_obj) 90 | corrected_action, success = self.solver.solve() 91 | if success: 92 | corrected_action = np.reshape(corrected_action, target_action.shape) 93 | else: 94 | if self.return_zero_if_opt_fails: 95 | corrected_action = np.zeros(target_action.shape) 96 | else: 97 | corrected_action = target_action 98 | viol = self.calc_constraint_violation(target_action) 99 | return 
corrected_action, success, viol 100 | 101 | def print_ineq(self, ineq_mat=None, ineq_vec=None): 102 | ''' 103 | Print inequality constraints in a human-readable format 104 | ''' 105 | print(self.ineq_to_str(ineq_mat=ineq_mat, ineq_vec=ineq_vec)) 106 | 107 | def ineq_to_str(self, ineq_mat=None, ineq_vec=None): 108 | ''' 109 | Build a human-readable string for inequality constraints 110 | ''' 111 | if ineq_mat is None: 112 | ineq_mat = self.ineq_mat 113 | if ineq_vec is None: 114 | ineq_vec = self.ineq_vec 115 | ineq_mat_str = str(ineq_mat) 116 | ineq_mat_lines_str = ineq_mat_str.split('\n') 117 | ineq_vec_str = str(ineq_vec) 118 | ineq_vec_lines_str = ineq_vec_str.split('\n') 119 | n_ineq = len(ineq_mat_lines_str) 120 | n_digits_max = len(str(n_ineq)) 121 | opt_var_str = [['X{0}'.format(str(_i).zfill(n_digits_max))] for _i in range(n_ineq)] 122 | opt_var_lines_str = str(np.array(opt_var_str)).split('\n') 123 | ineq_str_lines = [] 124 | for _i, (ineq_mat_line_str, opt_var_line_str, ineq_vec_line_str) in enumerate(zip(ineq_mat_lines_str, opt_var_lines_str, ineq_vec_lines_str)): 125 | ineq_str_line = '{0}{3} {1} {3}<= {2}{3}'.format(ineq_mat_line_str, opt_var_line_str, ineq_vec_line_str, ' ' if _i < n_ineq-1 else '') 126 | ineq_str_lines.append(ineq_str_line) 127 | ineq_str = '\n'.join(ineq_str_lines) 128 | return ineq_str 129 | 130 | def calc_constraint_violation(self, raw_action): 131 | ''' 132 | Compute the L2 norm of the constraint violation margin for the uncorrected action 133 | ''' 134 | if self.ineq_mat is not None: 135 | a = np.reshape(raw_action, (self.n_act, 1)) 136 | ineq_diff = np.dot(self.ineq_mat, a) - self.ineq_vec 137 | ineq_val = np.maximum(ineq_diff, 0.) 138 | ineq_viol = np.linalg.norm(ineq_val) 139 | else: 140 | ineq_viol = 0. 141 | return ineq_viol 142 | 143 | def update_obj_matrices(self, target_action): 144 | ''' 145 | Build objective function matrices of the form 1/2 xT P x + qT x, 146 | to minimize the distance between optimal and uncorrected (target) action, 1/2 || target - x || ^2, 147 | hence P = identity and q = -target 148 | ''' 149 | self.obj_mat = np.eye(self.n_act) 150 | self.obj_vec = -np.reshape(target_action, (self.n_act, 1)) 151 | 152 | -------------------------------------------------------------------------------- /ceres/baselines/ppo1/pposgd_simple_helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | These are components of OpenAI's baselines.ppo1.pposgd_simple.learn 3 | cut into individual functions for re-use in CERES 4 | OpenAI Baselines is licensed under the MIT License, see LICENSE 5 | ''' 6 | 7 | from baselines.ppo1.pposgd_simple import add_vtarg_and_adv 8 | from baselines.common import Dataset, explained_variance, fmt_row, zipsame 9 | from baselines import logger 10 | import baselines.common.tf_util as U 11 | import tensorflow as tf, numpy as np 12 | import time 13 | from baselines.common.mpi_adam import MpiAdam 14 | from collections import deque 15 | from mpi4py import MPI 16 | 17 | def calc_end_training(max_timesteps, timesteps_so_far, 18 | max_episodes, episodes_so_far, 19 | max_iters, iters_so_far, 20 | max_seconds, tstart): 21 | if max_timesteps and timesteps_so_far >= max_timesteps: 22 | return True 23 | elif max_episodes and episodes_so_far >= max_episodes: 24 | return True 25 | elif max_iters and iters_so_far >= max_iters: 26 | return True 27 | elif max_seconds and time.time() - tstart >= max_seconds: 28 | return True 29 | else: 30 | return False 31 | 32 | def 
build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon): 33 | atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) 34 | ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return 35 | 36 | lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule 37 | clip_param = clip_param * lrmult # Annealed cliping parameter epislon 38 | 39 | ob = U.get_placeholder_cached(name="ob") 40 | ac = pi.pdtype.sample_placeholder([None]) 41 | 42 | kloldnew = oldpi.pd.kl(pi.pd) 43 | ent = pi.pd.entropy() 44 | meankl = tf.reduce_mean(kloldnew) 45 | meanent = tf.reduce_mean(ent) 46 | pol_entpen = (-entcoeff) * meanent 47 | 48 | ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold 49 | surr1 = ratio * atarg # surrogate from conservative policy iteration 50 | surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # 51 | pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) 52 | vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) 53 | total_loss = pol_surr + pol_entpen + vf_loss 54 | losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] 55 | loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] 56 | 57 | var_list = pi.get_trainable_variables() 58 | lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) 59 | adam = MpiAdam(var_list, epsilon=adam_epsilon) 60 | 61 | assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) 62 | for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) 63 | compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) 64 | 65 | return loss_names, var_list, lossandgrad, adam, assign_old_eq_new, compute_losses 66 | 67 | def adjust_policy_learning_rate(schedule, 68 | max_timesteps, timesteps_so_far, 69 | max_episodes, episodes_so_far, 70 | max_iters, iters_so_far): 71 | if schedule == 'constant': 72 | cur_lrmult = 1.0 73 | elif schedule == 'linear': 74 | if max_timesteps > 0: 75 | cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) 76 | elif max_episodes > 0: 77 | cur_lrmult = max(1.0 - float(episodes_so_far) / max_episodes, 0) 78 | elif max_iters > 0: 79 | cur_lrmult = max(1.0 - float(iters_so_far) / max_iters, 0) 80 | else: 81 | raise NotImplementedError 82 | else: 83 | raise NotImplementedError 84 | return cur_lrmult 85 | 86 | def update_policy(pi, seg, gamma, lam, 87 | logger, optim_epochs, optim_batchsize, optim_stepsize, cur_lrmult, 88 | loss_names, lossandgrad, adam, assign_old_eq_new, compute_losses, 89 | mpi_moments_fn): 90 | 91 | add_vtarg_and_adv(seg, gamma, lam) 92 | 93 | # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) 94 | ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] 95 | vpredbefore = seg["vpred"] # predicted value function before udpate 96 | atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate 97 | d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) 98 | optim_batchsize = optim_batchsize or ob.shape[0] 99 | 100 | if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy 101 | 102 | assign_old_eq_new() # set old parameter values to new parameter values 103 | logger.log("Optimizing...") 104 | logger.log(fmt_row(13, loss_names)) 105 | # Here we do a bunch of optimization epochs over the data 106 | for _ in 
range(optim_epochs): 107 | losses = [] # list of tuples, each of which gives the loss for a minibatch 108 | for batch in d.iterate_once(optim_batchsize): 109 | *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) 110 | adam.update(g, optim_stepsize * cur_lrmult) 111 | losses.append(newlosses) 112 | logger.log(fmt_row(13, np.mean(losses, axis=0))) 113 | 114 | logger.log("Evaluating losses...") 115 | losses = [] 116 | for batch in d.iterate_once(optim_batchsize): 117 | newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) 118 | losses.append(newlosses) 119 | meanlosses,_,_ = mpi_moments_fn(losses) 120 | logger.log(fmt_row(13, meanlosses)) 121 | for (lossval, name) in zipsame(meanlosses, loss_names): 122 | logger.record_tabular("loss_"+name, lossval) 123 | return vpredbefore, tdlamret, optim_batchsize 124 | 125 | def log_iter_info(lenbuffer, rewbuffer, tstart, 126 | vpredbefore, tdlamret, seg, 127 | episodes_so_far, timesteps_so_far, 128 | do_dump_tabular, allgather_fn): 129 | logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) 130 | lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values 131 | listoflrpairs = allgather_fn(lrlocal) # list of tuples 132 | lens, rews = map(flatten_lists, zip(*listoflrpairs)) 133 | lenbuffer.extend(lens) 134 | rewbuffer.extend(rews) 135 | logger.record_tabular("EpLenMean", np.mean(lenbuffer)) 136 | logger.record_tabular("EpRewMean", np.mean(rewbuffer)) 137 | logger.record_tabular("EpThisIter", len(lens)) 138 | episodes_so_far += len(lens) 139 | timesteps_so_far += sum(lens) 140 | logger.record_tabular("EpisodesSoFar", episodes_so_far) 141 | logger.record_tabular("TimestepsSoFar", timesteps_so_far) 142 | logger.record_tabular("TimeElapsed", time.time() - tstart) 143 | if do_dump_tabular: 144 | logger.dump_tabular() 145 | return episodes_so_far, timesteps_so_far 146 | 147 | def build_counters(): 148 | episodes_so_far = 0 149 | timesteps_so_far = 0 150 | iters_so_far = 0 151 | tstart = time.time() 152 | lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths 153 | rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards 154 | return iters_so_far, episodes_so_far, timesteps_so_far, tstart, lenbuffer, rewbuffer 155 | 156 | def flatten_lists(listoflists): 157 | return [el for list_ in listoflists for el in list_] 158 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | An open-source and open community project is one in which participants choose to work together, and in that process experience differences in language, location, nationality, and experience. In such a diverse environment, misunderstandings and disagreements happen, which in most cases can be resolved informally. In rare cases, however, behavior can intimidate, harass, or otherwise disrupt one or more people in the community, which this project will not tolerate. 2 | 3 | A **Code of Conduct** is useful to define accepted and acceptable behaviors and to promote high standards of professional practice. It also provides a benchmark for self evaluation and acts as a vehicle for better identity of the organization. 4 | 5 | This code (**CoC**) applies to any participant in this project's community – developers, participants in meetings, teleconferences, mailing lists, conferences or functions, etc. 
Note that this code complements rather than replaces legal rights and obligations pertaining to any particular situation. 6 | 7 | ## Statement of Intent 8 | 9 | This project is committed to maintain a **positive** [work environment](#work-environment). This commitment calls for a workplace where [participants](#participant) at all levels behave according to the rules of the following code. A foundational concept of this code is that we all share responsibility for our work environment. 10 | ## Code 11 | 1. Treat each other with [respect](#respect), professionalism, fairness, and sensitivity to our many differences and strengths, including in situations of high pressure and urgency. 12 | 1. Never [harass](#harassment) or [bully](#workplace-bullying) anyone verbally, physically or [sexually](#sexual-harassment). 13 | 1. Never [discriminate](#discrimination) on the basis of personal characteristics or group membership. 14 | 1. Communicate constructively and avoid [demeaning](#demeaning-behavior) or [insulting](#insulting-behavior) behavior or language. 15 | 1. Seek, accept, and offer objective work criticism, and [acknowledge](#acknowledgement] properly the contributions of others. 16 | 1. Be honest about your own qualifications, and about any circumstances that might lead to conflicts of interest. 17 | 1. Respect the privacy of others and the confidentiality of data you access. 18 | 1. With respect to cultural differences, be conservative in what you do and liberal in what you accept from others, but not to the point of accepting disrespectful, unprofessional or unfair or [unwelcome behavior](#unwelcome-behavior) or [advances](#unwelcome-sexual-advance). 19 | 1. Promote the rules of this Code and take action (especially if you are in a [leadership position](#leadership-position)) to bring the discussion back to a more civil level whenever inappropriate behaviors are observed. 20 | 1. Stay on topic: Make sure that you are posting to the correct channel and avoid off-topic discussions. Remember when you update an issue or respond to an email you are potentially sending to a large number of people. 21 | 1. Step down considerately: Members of every project come and go, and the Hyperledger Project is no different. When you leave or disengage from the project, in whole or in part, we ask that you do so in a way that minimizes disruption to the project. This means you should tell people you are leaving and take the proper steps to ensure that others can pick up where you left off. 22 | 23 | ## Glossary 24 | #### Demeaning behavior 25 | is acting in a way that reduces another person's dignity, sense of self-worth or respect within the community. 26 | 27 | #### Discrimination 28 | is the prejudicial treatment of an individual based on criteria such as: physical appearance, race, ethnic origin, genetic differences, national or social origin, name, religion, gender, sexual orientation, family or health situation, pregnancy, disability, age, education, wealth, domicile, political view, morals, employment, or union activity. 29 | 30 | #### Insulting behavior 31 | is treating another person with scorn or disrespect. 32 | 33 | #### Acknowledgement 34 | is a record of the origin(s) and author(s) of a contribution. 35 | 36 | #### Harassment 37 | is any conduct, verbal or physical, that has the intent or effect of interfering with an individual, or that creates an intimidating, hostile, or offensive environment. 
38 | 39 | #### Leadership position 40 | includes group Chairs, project maintainers, staff members, and Board members. 41 | 42 | #### Participant 43 | includes the following persons: 44 | * Developers 45 | * Anyone from the Public partaking in this project's work environment (e.g. contribute code, comment on our code or specs, email us, attend our conferences, functions, etc) 46 | 47 | #### Respect 48 | is the genuine consideration you have for someone (if only because of their status as participant in Hyperledger Project, like yourself), and that you show by treating them in a polite and kind way. 49 | 50 | #### Sexual harassment 51 | includes visual displays of degrading sexual images, sexually suggestive conduct, offensive remarks of a sexual nature, requests for sexual favors, unwelcome physical contact, and sexual assault. 52 | 53 | #### Unwelcome behavior 54 | Hard to define? Some questions to ask yourself are: 55 | * how would I feel if I were in the position of the recipient? 56 | * would my spouse, parent, child, sibling or friend like to be treated this way? 57 | * would I like an account of my behavior published in the organization's newsletter? 58 | * could my behavior offend or hurt other members of the work group? 59 | * could someone misinterpret my behavior as intentionally harmful or harassing? 60 | * would I treat my boss or a person I admire at work like that ? 61 | 62 | _Summary_: if you are unsure whether something might be welcome or unwelcome, don't do it. 63 | 64 | #### Unwelcome sexual advance 65 | includes requests for sexual favors, and other verbal or physical conduct of a sexual nature, where: 66 | * submission to such conduct is made either explicitly or implicitly a term or condition of an individual's employment, 67 | * submission to or rejection of such conduct by an individual is used as a basis for employment decisions affecting the individual, 68 | * such conduct has the purpose or effect of unreasonably interfering with an individual's work performance or creating an intimidating hostile or offensive working environment. 69 | 70 | #### Workplace Bullying 71 | is a tendency of individuals or groups to use persistent aggressive or unreasonable behavior (e.g. verbal or written abuse, offensive conduct or any interference which undermines or impedes work) against a co-worker or any professional relations. 72 | 73 | #### Work Environment 74 | is the set of all available means of collaboration, including, but not limited to messages to mailing lists, private correspondence, Web pages, chat channels, phone and video teleconferences, and any kind of face-to-face meetings or discussions. 75 | 76 | ## Incident Procedure 77 | 78 | To report incidents or to appeal reports of incidents, contact the Project maintainers. Please include any available relevant information, including links to any publicly accessible material relating to the matter. Every effort will be taken to ensure a safe and collegial environment in which to collaborate on matters relating to the Project. In order to protect the community, the Project reserves the right to take appropriate action, potentially including the removal of an individual from any and all participation in the project. The Project will work towards an equitable resolution in the event of a misunderstanding. 
79 | 80 | ## Credits 81 | 82 | This code is based on the [Hyperledger Project's CoC](https://github.com/hyperledger/hyperledger/wiki/Hyperledger-Project-Code-of-Conduct), [W3C’s Code of Ethics and Professional Conduct](https://www.w3.org/Consortium/cepc) with some additions from the [Cloud Foundry](https://www.cloudfoundry.org/)‘s Code of Conduct. 83 | -------------------------------------------------------------------------------- /examples/02-constraints-from-scratch.md: -------------------------------------------------------------------------------- 1 | # Training policy and constraints from scratch 2 | 3 | We consider an environment in which an agent navigates through random obstacles to reach a target by position commands. 4 | The starting position of the agent, the position of the target, the obstacle positions and sizes are randomized at each episode. 5 | 6 | Visualize the environment with a random policy: 7 | ``` 8 | python3 -m ceres.scripts.play_policy --env_id Nav2dPosRandomHolesCeres-v0 \ 9 | --max_episodes 1000 --render 10 | ``` 11 | 12 | 13 | Arguments: 14 | * ```--env_id Nav2dPosRandomHolesCeres-v0```: environment name (can load environments from modules other than ```ceres``` with the extended argument format ```:```) 15 | * ```--max_episodes 1000```: play a random policy for 1000 episodes 16 | * ```--render```: (optional) render to screen 17 | 18 | ## Baseline PPO 19 | 20 | Within CERES, disabling constraints and setting the number of recovery policies to zero amounts to training with PPO: 21 | ``` 22 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 23 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 24 | --max_iter 1000 --output random_pos_ppo_full 25 | ``` 26 | Arguments: 27 | * ```--only_train_policy```: only train policy, not constraints 28 | * ```--constant_constraint_activation 0.```: set the constraint activation probability to zero throughout training 29 | * ```--n_recovery 0```: do not train recovery agents 30 | * ```--max_iter 1000```: do reinforcement learning for 1000 iterations 31 | * ```--output random_pos_ppo_full```: save logs in ```logs/random_pos_ppo_full```. If the directory already exists, remove it manually or run the script with ```--overwrite``` 32 | * Optionally, run with ```--render``` to visualize exploration and constraints. 33 | 34 | Optionally, we can train the control policy using only a selection of available observations: 35 | ``` 36 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 37 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 38 | --policy_observation_filter 0:1:2:3 --max_iter 1000 --output random_pos_ppo_partial 39 | ``` 40 | Argument: 41 | * ```--policy_observation_filter 0:1:2:3```: only provide the policy with state elements 0, 1, 2 and 3 (agent and target 2D locations), not 4 and up (distances to surrounding obstacles) 42 | 43 | ## Learning constraints through exploration and recovery 44 | 45 | Train direct and recovery policies with CERES: 46 | ``` 47 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 48 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. 
--cnet_loss l2:1e-6 \ 49 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 50 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 51 | --max_iter 1000 --output random_pos_ceres_full 52 | ``` 53 | 54 | Arguments: 55 | * ```--cnet_n_ineq 4```: 4 inequality constraints 56 | * ```--cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6```: loss weights for positive violation max, negative satisfaction min, L2 regularization 57 | * ```--cnet_spherical_coordinates```: predict unit-norm constraints using spherical coordinates (alternatively, use ```--cnet_normalize_ineq_mat``` for post-normalization) 58 | * ```--cnet_predict_interior_point```: predict constraints such that there exists an interior point that satisfies them all 59 | * ```--unconstrained_recovery```: only apply constraints to the direct agent, not recovery 60 | * ```--adaptive_constraint_activation prior_min```: adjust the constraint activation probability based on their accuracy at this iteration before training 61 | * ```--interrupt_constraint_training prior_accuracy:0.95:5:0.90:1```: stop training constraints if their accuracy before training (prior_accuracy) exceeds 95% (0.95) for at least 5 iterations (5). Re-enable training if constraint accuracy falls below 90% (0.90) for at least 1 iteration (1) 62 | * If omitted, the ```--n_recovery``` argument is set to be equal to the number of direct agents ```--n_direct``` (1 by default) 63 | 64 | Similarly to the baseline PPO, we can train the policy from agent and target positions only and let the constraint network deal with obstacle avoidance: 65 | ``` 66 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 67 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 68 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 69 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 70 | --policy_observation_filter 0:1:2:3 --max_iter 1000 --output random_pos_ceres_partial 71 | ``` 72 | 73 | ## Compare rewards with and without constrained exploration 74 | 75 | Plot the rewards during training: 76 | ``` 77 | python3 -m ceres.scripts.plot_rewards \ 78 | --plot_path "PPO full state=logs/random_pos_ppo_full/worker_0_direct" \ 79 | --plot_path "PPO partial state=logs/random_pos_ppo_partial/worker_0_direct" \ 80 | --plot_path "CERES full state=logs/random_pos_ceres_full/worker_0_direct" \ 81 | --plot_path "CERES partial state=logs/random_pos_ceres_partial/worker_0_direct" 82 | ``` 83 | 84 | 85 | 86 | ## Random obstacles with force control 87 | 88 | We can apply the same method to the case where the agent is controlled with force commands. 89 | 90 | Visualize the environment with a random policy: 91 | ``` 92 | python3 -m ceres.scripts.play_policy --env_id Nav2dForceRandomHolesCeres-v0 \ 93 | --max_episodes 1000 --render 94 | ``` 95 | 96 | 97 | Baseline PPO from full state: 98 | ``` 99 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 100 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 101 | --max_iter 1000 --output random_force_ppo_full 102 | ``` 103 | 104 | Baseline PPO from filtered state (0 to 5: agent position and velocity, target position): 105 | ``` 106 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 107 | --only_train_policy --constant_constraint_activation 0. 
--n_recovery 0 \ 108 | --policy_observation_filter 0:1:2:3:4:5 --max_iter 1000 --output random_force_ppo_partial 109 | ``` 110 | 111 | CERES from full state: 112 | ``` 113 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 114 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 115 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 116 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 117 | --max_iter 1000 --output random_force_ceres_full 118 | ``` 119 | 120 | CERES from filtered state: 121 | ``` 122 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 123 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 124 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 125 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 126 | --policy_observation_filter 0:1:2:3:4:5 --max_iter 1000 --output random_force_ceres_partial 127 | ``` 128 | 129 | Plot the rewards during training: 130 | ``` 131 | python3 -m ceres.scripts.plot_rewards \ 132 | --plot_path "PPO full state=logs/random_force_ppo_full/worker_0_direct" \ 133 | --plot_path "PPO partial state=logs/random_force_ppo_partial/worker_0_direct" \ 134 | --plot_path "CERES full state=logs/random_force_ceres_full/worker_0_direct" \ 135 | --plot_path "CERES partial state=logs/random_force_ceres_partial/worker_0_direct" 136 | ``` 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /ceres/tools/plot/plot_logs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
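# A minimal sketch of the observation filtering behind the --policy_observation_filter option
# used in the examples above, mirroring build_policy_observation_filter() in
# ceres/baselines/ceres/pposgd_ceres_helper.py (the observation values below are illustrative):
#
#     import numpy as np
#
#     spec = '0:1:2:3'  # keep only the agent and target 2D positions
#     indices = [int(v) for v in spec.split(':')]
#     observation_filter = lambda ob: np.array([ob[i] for i in indices], dtype=ob.dtype)
#     ob_full = np.array([0.1, -0.2, 0.7, 0.5, 0.9, 1.3])  # full state, incl. obstacle distances
#     ob_policy = observation_filter(ob_full)              # array([ 0.1, -0.2,  0.7,  0.5])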
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import os 8 | 9 | class PlotLogs(object): 10 | ''' 11 | Base class for plotting reward plots, provided input logs 12 | ''' 13 | 14 | default_seed_value = 'N/A' 15 | plot_as_rows = True 16 | 17 | def __init__(self, plot_config): 18 | self.plot_config = plot_config 19 | self.plots = [] 20 | self.n_plots = 0 21 | super().__init__() 22 | 23 | def add_plot(self, title='', paths='', label='', color='k', skip_error=False): 24 | if not type(paths) == list: 25 | assert type(paths) == str 26 | paths = [paths] 27 | for i_path, path in enumerate(paths): 28 | assert os.path.exists(path), 'Experiment path does not exist: {0}'.format(path) 29 | if path[-1] == '/': 30 | paths[i_path] = path[:-1] 31 | try: 32 | path_sessions, suffix_sessions = self.load_paths(paths) 33 | plot_info = { 34 | 'title': title, 35 | 'label': label, 36 | 'color': color, 37 | 'path_sessions': path_sessions, 38 | 'suffix_sessions': suffix_sessions, 39 | } 40 | self.plots.append(plot_info) 41 | print('Found logs for {0}: {1}'.format(' '.join(title.split('\n')), plot_info['path_sessions'])) 42 | success = True 43 | self.n_plots += 1 44 | except Exception as e: 45 | print('Could not find logs for {0}'.format(' '.join(title.split('\n')))) 46 | success = False 47 | if skip_error: 48 | print('Skip') 49 | else: 50 | raise(e) 51 | return success 52 | 53 | 54 | def load_paths(self, paths=None): 55 | raise NotImplementedError('Implement this into your child class') 56 | 57 | def calc_plots(self): 58 | raise NotImplementedError('Implement this in your own child class') 59 | 60 | def plot(self, show=True): 61 | self.calc_plots() 62 | plt.figure() 63 | n_cols = self.n_plots 64 | max_elements = min([len(plot_info['t']) for plot_info in self.plots]) 65 | reward_min = min([min(plot_info['rewards']) for plot_info in self.plots]) 66 | reward_max = max([max(plot_info['rewards']) for plot_info in self.plots]) 67 | y_lim = [reward_min - 0.10*(reward_max - reward_min), 68 | reward_max + 0.10*(reward_max - reward_min)] 69 | y_lim = np.array(y_lim) 70 | x_vec_average_list = [] 71 | y_vec_average_list = [] 72 | for i_plot, plot_info in enumerate(self.plots): 73 | t_averaged = plot_info['t'] 74 | rew_averaged = plot_info['rewards'] 75 | if self.plot_as_rows: 76 | ax = plt.subplot(n_cols, 1, i_plot+1) 77 | else: 78 | ax = plt.subplot(1, n_cols, i_plot+1) 79 | x_vec = np.array(t_averaged)/self.plot_config.timesteps_per_iteration 80 | label_x = self.plot_config.label_x_iterations 81 | y_vec = np.array(rew_averaged) 82 | # Individual rewards 83 | x_plot = x_vec 84 | y_plot = y_vec 85 | plt.plot(x_plot, y_plot, self.plot_config.color_rewards_ind, alpha=0.5, label='Episode reward') 86 | # Compute moving average 87 | x_vec_select = x_vec 88 | y_vec_average = self.calc_moving_average(y_vec, n=self.plot_config.n_average) 89 | x_vec_average_list.append(x_vec_select) 90 | y_vec_average_list.append(y_vec_average) 91 | # Standard deviation 92 | y_vec_std = self.calc_moving_std(y_vec, n=self.plot_config.n_average) 93 | y_vec_average_minus_std = y_vec_average - y_vec_std 94 | y_vec_average_plus_std = y_vec_average + y_vec_std 95 | x_plot = x_vec_select 96 | y_plot_minus = y_vec_average_minus_std 97 | y_plot_plus = y_vec_average_plus_std 98 | plt.fill_between(x_plot, y_plot_minus, y_plot_plus, facecolor=self.plot_config.color_rewards_std, alpha=1., 
label='Standard deviation') 99 | # Average rewards 100 | x_plot = x_vec_select 101 | y_plot = y_vec_average 102 | plt.plot(x_plot, y_plot, self.plot_config.color_rewards_avg, alpha=1., label='Average reward') 103 | if self.plot_as_rows: 104 | plt.ylabel(self.plot_config.label_y) 105 | if i_plot == len(self.plots)-1: 106 | plt.xlabel(label_x) 107 | else: 108 | pass 109 | else: 110 | plt.xlabel(label_x) 111 | if i_plot == 0: 112 | plt.ylabel(self.plot_config.label_y) 113 | else: 114 | pass 115 | title_loc = plot_info['title'] 116 | plt.title(title_loc) 117 | plt.ylim(y_lim) 118 | # Legends 119 | bottom_legend_artists = [] 120 | bottom_legend_labels = [] 121 | # Individual rewards 122 | reward_ind_artist = plt.Line2D((0, 1), (0, 0), alpha=0.5, color=self.plot_config.color_rewards_ind) 123 | bottom_legend_artists.append(reward_ind_artist) 124 | bottom_legend_labels.append('Episode reward') 125 | # Average rewards 126 | reward_avg_artist = plt.Line2D((0, 1), (0, 0), color=self.plot_config.color_rewards_avg) 127 | bottom_legend_artists.append(reward_avg_artist) 128 | bottom_legend_labels.append('Reward average') 129 | # Standard deviations 130 | reward_std_artist = plt.Line2D((0, 1), (0, 0), color=self.plot_config.color_rewards_std) 131 | bottom_legend_artists.append(reward_std_artist) 132 | bottom_legend_labels.append('Reward std. dev.') 133 | ax.legend(bottom_legend_artists, 134 | bottom_legend_labels, 135 | loc='lower center', 136 | fancybox=True, 137 | ncol=3) 138 | 139 | # Plot average rewards in the same graph 140 | plt.figure() 141 | for i_plot, plot_info in enumerate(self.plots): 142 | x_vec = x_vec_average_list[i_plot] 143 | y_vec = y_vec_average_list[i_plot] 144 | plt.plot(x_vec, y_vec, label=plot_info['label'], color=plot_info['color']) 145 | plt.xlabel(label_x) 146 | plt.ylabel(self.plot_config.label_y) 147 | plt.title('Average rewards') 148 | plt.legend() 149 | 150 | if show: 151 | plt.show() 152 | 153 | def calc_moving_average(self, a, n=3, fill=True) : 154 | assert n % 2 == 1, 'Number of samples to average must be odd' 155 | assert len(a) >= n, 'Not enough samples to average: {0} vs {1}'.format(len(a), n) 156 | if fill: 157 | assert len(a) > 1.5*n, 'Not enough samples to fill' 158 | if n == 1: 159 | return a 160 | ret = np.cumsum(a, dtype=float) 161 | ret[n:] = ret[n:] - ret[:-n] 162 | res = ret[n - 1:] / n 163 | if fill: 164 | add_el = len(a) - len(res) 165 | add_left = int((n-1)/2) 166 | add_right = add_left 167 | el_left = [] 168 | for _i in range(add_left): 169 | subvec = a[_i:_i + n] 170 | assert len(subvec) > 0 171 | el_left.append(np.mean(subvec)) 172 | el_right = [] 173 | for _i in range(len(a)-add_right, len(a)): 174 | subvec = a[-n+_i:_i] 175 | assert len(subvec) > 0 176 | el_right.append(np.mean(subvec)) 177 | res = el_left + list(res) + el_right 178 | res = np.array(res) 179 | assert len(res) == len(a) 180 | return res 181 | 182 | 183 | def calc_moving_std(self, a, n=3, fill=True) : 184 | assert n % 2 == 1, 'Number of samples to average must be odd' 185 | n_total = len(a) 186 | res = np.zeros(n_total) 187 | if n == 1: 188 | return res 189 | n_side = int((n-1)/2) 190 | assert n_side > 0 191 | for i_center in range(n_total): 192 | i_left = max(0, i_center - n_side) 193 | i_right = min(n_total-1, i_center + n_side) 194 | res[i_center] = np.std(a[i_left:(i_right+1)]) 195 | return res 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /ceres/tools/io/extra_args.py: 
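# A small numeric illustration (assumed input values) of the cumulative-sum trick used by
# PlotLogs.calc_moving_average() above to compute a centered moving average in linear time:
#
#     import numpy as np
#
#     a, n = np.array([1., 2., 3., 4., 5.]), 3
#     ret = np.cumsum(a)            # [ 1.,  3.,  6., 10., 15.]
#     ret[n:] = ret[n:] - ret[:-n]  # [ 1.,  3.,  6.,  9., 12.]
#     core = ret[n - 1:] / n        # [2., 3., 4.], the means of [1,2,3], [2,3,4], [3,4,5]
#     # calc_moving_average(fill=True) then pads both ends so the output keeps len(a) samples.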
-------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | import datetime 7 | import argparse 8 | import shutil 9 | import numpy as np 10 | 11 | class ExtraArgs(object): 12 | ''' 13 | A simple class to parse and check command-line arguments for CERES 14 | ''' 15 | 16 | def __init__(self, log_root='/tmp', args=None, 17 | **kwargs): 18 | if args is not None: 19 | self.args = args 20 | else: 21 | self.args = self.parse_args() 22 | for _k, _v in kwargs.items(): 23 | assert hasattr(self.args, _k) 24 | setattr(self.args, _k, _v) 25 | self.check_args() 26 | self.import_env_module(module_id=self.args.module_id) 27 | self.timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 28 | self.log_root = log_root 29 | self.path_xp = self.build_path() 30 | 31 | def __getattr__(self, _k): 32 | return getattr(self.args, _k) 33 | 34 | def build_path(self): 35 | if len(self.args.output) > 0: 36 | xp_dirname = self.args.output 37 | else: 38 | xp_dirname = self.timestamp 39 | path_xp = os.path.join(self.log_root, xp_dirname) 40 | if os.path.exists(path_xp): 41 | exists_str = 'Log path already exists: {0}'.format(path_xp) 42 | if self.args.overwrite: 43 | path_move = path_xp + '_moved_{0}'.format(self.timestamp) 44 | exists_str += '\n Moved existing logs to: {0}'.format(path_move) 45 | else: 46 | exists_str += '\n Remove dir manually or run with --overwrite' 47 | raise ValueError(exists_str) 48 | shutil.move(path_xp, path_move) 49 | print(exists_str) 50 | return path_xp 51 | 52 | @staticmethod 53 | def parse_env_module(env_id): 54 | if ':' in env_id: 55 | module_id, env_id = env_id.split(':') 56 | else: 57 | module_id = '' 58 | return env_id, module_id 59 | 60 | def check_args(self): 61 | self.args.env_id, self.args.module_id = self.parse_env_module(self.args.env_id) 62 | self.args.cnet_hidden_layers = list(map(int, self.args.cnet_hidden_layers.split(','))) 63 | 64 | self.args.cnet_loss_weights = {} 65 | for loss_weight_arg_str in self.args.cnet_loss: 66 | try: 67 | loss_name, loss_weight = loss_weight_arg_str.split(':') 68 | loss_weight = float(loss_weight) 69 | except: 70 | raise ValueError('Invalid --loss_weight argument {0}, excepted format :'.format(loss_weight_arg_str)) 71 | self.args.cnet_loss_weights[loss_name] = loss_weight 72 | assert not (self.args.cnet_spherical_coordinates and self.args.cnet_normalize_ineq_mat), 'Cannot have simultaneously --cnet_spherical_coordinates and --cnet_normalize_ineq_mat' 73 | assert self.args.n_direct > 0, 'Set at least one direct agent' 74 | if self.args.n_recovery is None: # By default, set equal number of direct and recovery agents 75 | self.args.n_recovery = self.args.n_direct 76 | 77 | assert (self.args.constant_constraint_activation is None) or (len(self.args.adaptive_constraint_activation) == 0), 'Cannot set both constant and adaptive constraint activation probability' 78 | # Plot 79 | if (self.args.plot_average % 2) == 0: self.args.plot_average += 1 # make it odd 80 | 81 | @staticmethod 82 | def import_env_module(env_id=None, module_id=None): 83 | if module_id is None: 84 | assert env_id is not None 85 | env_id, module_id = ExtraArgs.parse_env_module(env_id) 86 | if len(module_id) > 0: 87 | import importlib 88 | print('Import module {0}'.format(module_id)) 89 | importlib.import_module(module_id) 90 | 91 | def 
parse_args(self): 92 | parser = argparse.ArgumentParser() 93 | args, unprocessed_args = parser.parse_known_args() 94 | 95 | # Base reinforcement learning parameters 96 | parser.add_argument('-e', '--env_id', default='', help='Environment name') 97 | parser.add_argument('--timesteps_per_actorbatch', default=1024, type=int, help='timesteps per actor per training batch') 98 | parser.add_argument('--max_iterations', default=0, type=int, help='maximum total number of training iterations') 99 | parser.add_argument('--max_episodes', default=0, type=int, help='maximum total number of episodes') 100 | parser.add_argument('--max_timesteps', default=0, type=int, help='maximum total number of timesteps') 101 | parser.add_argument('--max_seconds', default=0, type=int, help='maximum training time in seconds') 102 | parser.add_argument('--seed', default=0, type=int, help='random seed') 103 | parser.add_argument('--backup_frequency', default=1, type=int, help='save every n iterations') 104 | parser.add_argument('--backup_keep', default=1, type=int, help='number of backups to keep, set to 0 to keep all') 105 | parser.add_argument('--min_time_between_backups', default=60., type=float, help='minimum time in seconds between model backups') 106 | parser.add_argument('--continue_ceres_training', default='', help='root directory for CERES logs') 107 | parser.add_argument('--output', default='', help='output log dir') 108 | parser.add_argument('--render', action='store_true', help='render') 109 | parser.add_argument('--save_render', default='', help='directory to save render') 110 | parser.add_argument('--policy_hidden_size', type=int, default=64) 111 | parser.add_argument('--policy_hidden_layers', type=int, default=2) 112 | parser.add_argument('--policy_entcoeff', help='entropy coefficiency of policy', type=float, default=0.) 
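        # A minimal sketch of how the repeated --cnet_loss options declared below are converted
        # into a weight dictionary by check_args() above; the example values are the ones used
        # in examples/02-constraints-from-scratch.md:
        #
        #     cnet_loss = ['pvm:1.', 'nsm:1.', 'l2:1e-6']  # one entry per --cnet_loss flag
        #     cnet_loss_weights = {}
        #     for spec in cnet_loss:
        #         loss_name, loss_weight = spec.split(':')
        #         cnet_loss_weights[loss_name] = float(loss_weight)
        #     # cnet_loss_weights == {'pvm': 1.0, 'nsm': 1.0, 'l2': 1e-06}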
113 | parser.add_argument('--policy_learning_rate_schedule', default='linear', choices=['constant', 'linear'], help='policy learning rate schedule',) 114 | 115 | # Constraint network architecture 116 | parser.add_argument('--cnet_n_ineq', default=2, type=int, help='Number of inequality constraints') 117 | parser.add_argument('--cnet_batch_size', default=64, type=int, help='Batch size') 118 | parser.add_argument('--cnet_hidden_layers', default='64,64', help='Comma-separated constraint network hidden layer sizes, e.g. 64,64') 119 | parser.add_argument('--cnet_spherical_coordinates', action='store_true', help='Inequality matrix is first predicted as (n-1)-dim spherical coordinates') 120 | parser.add_argument('--cnet_normalize_ineq_mat', action='store_true', help='Normalize each row of the inequality matrices') 121 | parser.add_argument('--cnet_predict_interior_point', action='store_true', help='Predict one point satisfying all constraints') 122 | parser.add_argument('--cnet_interior_point_margin_min', default=0.1, type=float, help='minimum distance to interior point') 123 | parser.add_argument('--cnet_interior_point_margin_max', default=1., type=float, help='maximum distance to interior point') 124 | parser.add_argument('--cnet_interior_point_max', default=1., type=float, help='maximum value for the interior point per action component') 125 | parser.add_argument('--cnet_loss', default=[], action='append', help='loss weights') 126 | 127 | # CERES 128 | parser.add_argument('-n', '--n_direct', default=1, type=int, help='number of agents for direct reinforcement learning') 129 | parser.add_argument('--n_recovery', default=None, type=int, help='number of agents learning recovery') 130 | parser.add_argument('--constant_constraint_activation', default=None, type=float, help='constant constraint activation probability') 131 | parser.add_argument('--adaptive_constraint_activation', type=str, choices=['average', 'positive', 'negative', 'prior_average', 'prior_positive', 'prior_negative', 'prior_min'], default='', help='which constraint accuracy to use, if not empty') 132 | parser.add_argument('--interrupt_constraint_training', default='', help='condition for stopping CNet training') 133 | parser.add_argument('--policy_observation_filter', default='', help='use only these state elements') 134 | parser.add_argument('--only_train_constraints', action='store_true', help='only train constraint network in CERES') 135 | parser.add_argument('--only_train_policy', action='store_true', help='only train policy in CERES') 136 | parser.add_argument('--constraint_demonstration_buffer', default='', help='path to constraint demonstration buffer to restore') 137 | parser.add_argument('--constraint_demonstration_buffer_size', default=2048, type=int, help='Constraint demonstration buffer size') 138 | parser.add_argument('--cnet_decay_epochs', default=0, type=int, help='decay CNet learning rate every N epochs without improvement') 139 | parser.add_argument('--cnet_decay_max', default=0.01, type=float, help='Keep learning rate >= this') 140 | parser.add_argument('--early_stop_positive', default=1.0, type=float) 141 | parser.add_argument('--early_stop_negative', default=1.0, type=float) 142 | parser.add_argument('--conservative_exploration', default=0.09, type=float, help='margin subtracted from the inequality vector for conservative exploration') 143 | parser.add_argument('--max_recovery_attempts', default=10, type=int, help='number of recovery attempts per reference trajectory') 144 | parser.add_argument('--unconstrained_recovery', action='store_true', help='do not constrain recovery agent') 145 |
parser.add_argument('--cnet_training_epochs', default=10, type=int, help='Number of training epochs') 146 | parser.add_argument('--cnet_training_batches', default=0, type=int, help='If > 0, maximum number of batches per epoch') 147 | parser.add_argument('--cnet_learning_rate', default=1.e-3, type=float) 148 | parser.add_argument('--cnet_improvement_metric', type=str, default='total_loss', choices=['mean_accuracy', 'min_accuracy', 'total_loss', 'mean_loss', 'max_loss'], help='Improvement metric for LR annealing') 149 | 150 | # Write, restore and replay 151 | parser.add_argument('--play_step_duration', default=0., type=float, help='wait duration in seconds when replaying baselines') 152 | parser.add_argument('--trained_policy', default='', help='load policy model backup') 153 | parser.add_argument('--trained_cnet', default='', help='Path to constraint network configuration') 154 | parser.add_argument('--overwrite', action='store_true', help='automatically moves log dir if it already exists') 155 | 156 | # Plot 157 | parser.add_argument('--plot_average', default=401, type=int, help='Moving average over N episodes') 158 | parser.add_argument('--plot_path', default=[], action='append', help='Path to logs') 159 | 160 | args = parser.parse_args(unprocessed_args) 161 | return args 162 | -------------------------------------------------------------------------------- /ceres/baselines/common/plot_logs_baselines.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import os 8 | import ast 9 | from ceres.tools.plot import PlotConfig, PlotLogs 10 | 11 | class PlotLogsBaselines(PlotLogs): 12 | ''' 13 | Load (multiple) baselines logs and plot rewards together with useful statistics. 14 | Supports logs distributed across multiple processes and seeds.
15 | ''' 16 | 17 | key_ep_reward = 'r' 18 | key_ep_length = 'l' 19 | key_ep_time = 't' 20 | keys_ep = [key_ep_time, key_ep_length, key_ep_reward] 21 | 22 | def load_paths(self, paths=None): 23 | ''' 24 | Find log paths by looking for directories of the form "worker_*" 25 | ''' 26 | assert type(paths) == list 27 | suffix_sessions = {} 28 | target_dirname_0 = {} 29 | target_worker_dir_default = 'worker_0' 30 | for _i, _d in enumerate(paths): 31 | _d_basename = os.path.basename(_d) 32 | if 'worker' in _d_basename: 33 | target_worker_dir = '_'.join(_d_basename.split('_')[:2]) 34 | else: 35 | target_worker_dir = target_worker_dir_default 36 | if _d_basename[:len(target_worker_dir)] == target_worker_dir: 37 | target_dirname_0[_i] = _d_basename 38 | suffix_sessions[_i] = _d_basename[len(target_worker_dir):] 39 | else: 40 | target_dirname_0[_i] = target_worker_dir 41 | suffix_sessions[_i] = '' 42 | 43 | path_sessions = {} 44 | for i_path, path_loc in enumerate(paths): 45 | path_sessions[i_path] = path_loc 46 | # For each directory in path_sessions, look for most recent worker_0 directory 47 | for _i, _d in path_sessions.items(): 48 | target_dirname = target_dirname_0[_i] 49 | if os.path.basename(_d) == target_dirname: 50 | continue 51 | subdirs = [subdir for subdir in os.listdir(_d) if os.path.isdir(os.path.join(_d, subdir))] 52 | assert len(subdirs) > 0, 'Could not find any subdirectory in {0}'.format(_d) 53 | if target_dirname in subdirs: 54 | path_sessions[_i] = os.path.join(_d, target_dirname) 55 | else: # Take the most recent directory 56 | subdirs.sort() 57 | found_target_dir = False 58 | for subdir in reversed(subdirs): 59 | path_xp = os.path.join(_d, subdir, target_dirname) 60 | if os.path.exists(path_xp): 61 | path_sessions[_i] = path_xp 62 | found_target_dir = True 63 | break 64 | assert found_target_dir, 'Could not find {0} directory in {1}'.format(target_dirname, path_xp) 65 | # Finally, return folder that contains worker_0 66 | for _i, _d in path_sessions.items(): 67 | assert os.path.basename(_d) == target_dirname_0[_i] 68 | path_sessions[_i] = os.path.join(_d, os.pardir) 69 | return path_sessions, suffix_sessions 70 | 71 | def calc_plots(self): 72 | ''' 73 | Build plots from session paths 74 | ''' 75 | for i_plot, plot_info in enumerate(self.plots): 76 | path_sessions = plot_info['path_sessions'] 77 | suffix_sessions = plot_info['suffix_sessions'] 78 | plot_info['t'], plot_info['rewards'] = self.calc_plot_sessions(path_sessions, suffix_sessions) 79 | 80 | def calc_plot_sessions(self, path_sessions, suffix_sessions): 81 | ''' 82 | Calculate reward plots across multiple sessions (e.g., seeds) 83 | ''' 84 | t_sessions = [] 85 | rewards_sessions = [] 86 | lengths_sessions = [] 87 | for i_session, (seed, path_session) in enumerate(path_sessions.items()): 88 | t_session, rewards_session, lengths_session = self.calc_plot_workers(path_session, suffix_session=suffix_sessions[seed]) 89 | t_sessions.append(t_session) 90 | rewards_sessions.append(rewards_session) 91 | lengths_sessions.append(lengths_session) 92 | if i_session == 0: 93 | len_min = len(t_session) 94 | else: 95 | len_min = min(len_min, len(t_session)) 96 | assert len_min > 0 97 | n_steps_sessions = [np.cumsum(x) for x in lengths_sessions] 98 | t_averaged = list(n_steps_sessions[0]) 99 | rewards_averaged = list(rewards_sessions[0]) 100 | for t_worker, rewards_worker in zip(n_steps_sessions[1:], rewards_sessions[1:]): 101 | t_averaged, [rewards_averaged] = self.combine_logs_xy(t_averaged, [rewards_averaged], t_worker, 
[rewards_worker]) 102 | return t_averaged, rewards_averaged 103 | 104 | def parse_worker_monitor_csv(self, path_monitor, n_workers=1): 105 | ''' 106 | Parse baselines monitor files 107 | ''' 108 | with open(path_monitor, 'r') as f: 109 | lines = f.read().splitlines() 110 | header = lines[0] 111 | labels = lines[1] 112 | labels = labels.split(',') 113 | i_key = {_k: labels.index(_k) for _k in self.keys_ep} 114 | lines = lines[2:] 115 | res = {_k: [] for _k in self.keys_ep} 116 | n_ep = len(lines) 117 | for i_episode in range(n_ep): 118 | line = lines[i_episode] 119 | ep_info = line.split(',') 120 | res[self.key_ep_reward].append(float(ep_info[i_key[self.key_ep_reward]])) 121 | res[self.key_ep_time].append(float(ep_info[i_key[self.key_ep_time]])) 122 | res[self.key_ep_length].append(int(ep_info[i_key[self.key_ep_length]])) 123 | continue_index = 0 124 | continue_path = path_monitor + '.continue{0}'.format(continue_index) 125 | while os.path.exists(continue_path): 126 | raise NotImplementedError() 127 | t = res[self.key_ep_time] 128 | rewards = res[self.key_ep_reward] 129 | lengths = res[self.key_ep_length] 130 | return t, rewards, lengths 131 | 132 | def combine_logs_xy(self, x1, y1_list, x2, y2_list): 133 | ''' 134 | Combine (x1, y1) with (x2, y2) sorting on increasing elements of (x1, x2) 135 | ''' 136 | n_x1 = len(x1) 137 | n_x2 = len(x2) 138 | x = np.zeros(n_x1 + n_x2) 139 | y_list = [np.zeros(n_x1 + n_x2) for _ in y1_list] 140 | i_x1 = 0 141 | i_x2 = 0 142 | i_x = 0 143 | while (i_x1 < n_x1) and (i_x2 < n_x2): 144 | if x1[i_x1] < x2[i_x2]: 145 | x[i_x] = x1[i_x1] 146 | for (y, y1) in zip(y_list, y1_list): 147 | y[i_x] = y1[i_x1] 148 | i_x1 += 1 149 | else: 150 | x[i_x] = x2[i_x2] 151 | for (y, y2) in zip(y_list, y2_list): 152 | y[i_x] = y2[i_x2] 153 | i_x2 += 1 154 | i_x += 1 155 | for _i, _x in enumerate(x1[i_x1:]): # append the remaining tail of (x1, y1_list) 156 | x[i_x + _i] = _x 157 | for (y, y1) in zip(y_list, y1_list): 158 | y[i_x + _i] = y1[i_x1 + _i] 159 | for _i, _x in enumerate(x2[i_x2:]): # append the remaining tail of (x2, y2_list) 160 | x[i_x + _i] = _x 161 | for (y, y2) in zip(y_list, y2_list): 162 | y[i_x + _i] = y2[i_x2 + _i] 163 | return x, y_list 164 | 165 | def combine_logs_xyz(self, x1, y1, z1, x2, y2, z2): 166 | ''' 167 | Combine (x1, y1, z1) with (x2, y2, z2) sorting on increasing elements of (x1, x2) 168 | ''' 169 | x, [y, z] = self.combine_logs_xy(x1, [y1, z1], x2, [y2, z2]) 170 | return x, y, z 171 | 172 | def combine_workers(self, t_workers, rewards_workers, lengths_workers): 173 | ''' 174 | Combine individual worker reward sequences into a single reward sequence 175 | ''' 176 | t = t_workers[0] 177 | rewards = rewards_workers[0] 178 | lengths = lengths_workers[0] 179 | for t_worker, rewards_worker, lengths_worker in zip(t_workers[1:], rewards_workers[1:], lengths_workers[1:]): 180 | t, rewards, lengths = self.combine_logs_xyz(t, rewards, lengths, t_worker, rewards_worker, lengths_worker) 181 | return t, rewards, lengths 182 | 183 | def calc_plot_workers(self, path_session, suffix_session=''): 184 | ''' 185 | Parse logs across workers and build a reward sequence 186 | ''' 187 | # Load rewards across workers 188 | path_worker_logs = [] 189 | print('Processing session {0}'.format(path_session)) 190 | for _d in os.listdir(path_session): 191 | if 'worker' in _d: 192 | worker_str_suffix = _d[-len(suffix_session):] 193 | if len(suffix_session) > 0: 194 | if worker_str_suffix != suffix_session: 195 | print(' (ignore path {0}: does not contain suffix {1})'.format(_d, suffix_session)) 196 | continue 197 | worker_monitor_dir =
os.path.join(path_session, _d) 198 | worker_monitor_path = os.path.join(worker_monitor_dir, 'monitor.csv') 199 | assert os.path.isfile(worker_monitor_path), 'Could not find logs at path: {0}'.format(worker_monitor_path) 200 | path_worker_logs.append(worker_monitor_path) 201 | n_workers = len(path_worker_logs) 202 | t_workers = [] 203 | rewards_workers = [] 204 | lengths_workers = [] 205 | for path_monitor_json in path_worker_logs: 206 | print(' {0}'.format(path_monitor_json)) 207 | t_worker, rewards_worker, lengths_worker = self.parse_worker_monitor_csv(path_monitor_json, n_workers=n_workers) 208 | t_workers.append(t_worker) 209 | rewards_workers.append(rewards_worker) 210 | lengths_workers.append(lengths_worker) 211 | t, rewards, lengths = self.combine_workers(t_workers, rewards_workers, lengths_workers) 212 | return t, rewards, lengths 213 | 214 | 215 | def main(): 216 | from ceres.tools.io import ExtraArgs 217 | extra_args = ExtraArgs() 218 | plot_config = PlotConfig.from_extra_args(extra_args) 219 | plotter = PlotLogsBaselines(plot_config) 220 | 221 | assert len(extra_args.plot_path) > 0 222 | color_list = ['g', 'b', 'r', 'k'] 223 | path_info_dict = {} 224 | path_list = [] 225 | title_list = [] 226 | for i_path, path_info in enumerate(extra_args.plot_path): 227 | if '=' in path_info: 228 | title, path_loc = path_info.split('=') 229 | else: 230 | path_loc = path_info 231 | title = '{0}:{1}'.format(i_path, os.path.basename(path_loc)) 232 | if not (title in title_list): 233 | title_list.append(title) 234 | path_info_dict[title] = [] 235 | path_info_dict[title].append(path_loc) 236 | if len(color_list) < len(title_list): 237 | for _ in range(len(title_list) - len(color_list)): 238 | color_random = np.random.rand(3) 239 | color_list.append(color_random) 240 | for i_plot, title in enumerate(title_list): 241 | plotter.add_plot(title=title, 242 | paths=path_info_dict[title], 243 | label=title, 244 | color=color_list[i_plot], 245 | skip_error=False) 246 | 247 | plotter.plot(show=True) 248 | 249 | 250 | 251 | if __name__ == '__main__': 252 | main() 253 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from .constraint_loss import ConstraintLoss 8 | from .constraint_config import ConstraintConfig 9 | from ceres.tools.math.spherical_coordinates import SphericalCoordinates 10 | 11 | class ConstraintNetwork(object): 12 | ''' 13 | Learn and predict state-dependent constraints on actions 14 | ''' 15 | 16 | def __init__(self, observation_space, action_space, config): 17 | self.observation_space = observation_space 18 | self.action_space = action_space 19 | self.action_space_pm = np.array([0.5*(high - low) for (low, high) in zip(self.action_space.low, self.action_space.high)]) 20 | self.action_space_mid = np.array([0.5*(high + low) for (low, high) in zip(self.action_space.low, self.action_space.high)]) 21 | self.n_obs = self.observation_space.shape[0] 22 | self.n_act = self.action_space.shape[0] 23 | self.config = config 24 | self.n_ineq = self.config.n_ineq 25 | self.init_default() 26 | 27 | # Constraints are of the form G x <= h, with G of size n_ineq x n_act and h of size n_ineq x 1 28 | self.init_prediction_size() 29 | 30 | # Build model 31 | self.observation = tf.placeholder(dtype=tf.float32, shape=(None, self.n_obs), name='observation') 32 | self.batch_size = tf.shape(self.observation)[0] 33 | self.batch_size_float = tf.cast(self.batch_size, dtype=tf.float32) 34 | self.ones_batch = tf.ones([self.batch_size], dtype=tf.float32) 35 | self.zeros_batch = tf.zeros([self.batch_size], dtype=tf.float32) 36 | 37 | self.input_layer = self.observation 38 | self.output_layer = self.build_model() 39 | self.init_regularization() 40 | 41 | # Transform output layer into constraint matrices G, h 42 | self.ineq_mat_params, self.ineq_vec_params = self.split_predictions(self.output_layer) 43 | self.ineq_mat, self.ineq_vec = self.build_ineq(self.ineq_mat_params, self.ineq_vec_params) 44 | 45 | ################################## 46 | # Individual actions with labels # 47 | ################################## 48 | 49 | # Reference action and action indicator for training 50 | self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.n_act), name='action') 51 | self.action_indicator = tf.placeholder(dtype=tf.float32, shape=(None), name='action_indicator') 52 | self.action_tensor = tf.expand_dims(self.action, -1) 53 | 54 | # Count positive and negative demonstrations 55 | self.is_positive = self.action_indicator 56 | self.is_negative = 1. 
- self.is_positive 57 | self.n_positive = tf.cast(tf.reduce_sum(self.is_positive), dtype=tf.int32) 58 | self.n_negative = tf.cast(tf.reduce_sum(self.is_negative), dtype=tf.int32) 59 | 60 | # Build constraint margins and scores 61 | self.ineq_diff, self.ineq_satisfaction_margin, self.ineq_violation_margin = self.build_ineq_margins(self.ineq_mat, self.ineq_vec, self.action_tensor) 62 | self.n_positive_satisfied, self.n_positive_violated, self.n_negative_satisfied, self.n_negative_violated = self.calc_constraint_score() 63 | 64 | # Construct training losses 65 | self.loss, self.losses = self.init_training_loss() 66 | 67 | def init_prediction_size(self): 68 | ''' 69 | Initialize the number of output variables, depending on: number of constraints; spherical coordinates; interior point prediction 70 | ''' 71 | if self.config.spherical_coordinates: 72 | assert self.n_act >= 2 73 | self.n_ineq_mat_params = self.n_ineq * (self.n_act - 1) 74 | else: 75 | self.n_ineq_mat_params = self.n_ineq * self.n_act 76 | self.n_ineq_vec_params = self.n_ineq 77 | if self.config.predict_interior_point: 78 | # Add an action x0 to the constraint predictions 79 | self.n_ineq_vec_params += self.n_act 80 | self.n_outputs = self.n_ineq_mat_params + self.n_ineq_vec_params 81 | 82 | def init_training_loss(self): 83 | ''' 84 | Defer the construction of constraint loss terms to a ConstraintLoss object 85 | ''' 86 | constraint_loss = ConstraintLoss(self) 87 | total_loss = constraint_loss.total_loss 88 | losses = constraint_loss.losses 89 | return total_loss, losses 90 | 91 | def calc_constraint_score(self): 92 | ''' 93 | Count the number of positive / negative demonstrations that satisfy / violate the constraint 94 | ''' 95 | positive_loss = tf.reduce_max(self.ineq_violation_margin, axis=1) 96 | positive_loss = tf.multiply(self.action_indicator, positive_loss) 97 | n_positive_violated = tf.where(positive_loss > 0., self.ones_batch, self.zeros_batch) 98 | n_positive_violated = tf.cast(tf.reduce_sum(n_positive_violated), dtype=tf.int32) 99 | n_positive_satisfied = self.n_positive - n_positive_violated 100 | negative_loss = tf.reduce_min(self.ineq_satisfaction_margin, axis=1) 101 | negative_loss = tf.multiply(self.is_negative, negative_loss) 102 | n_negative_satisfied = tf.where(negative_loss > 0., self.ones_batch, self.zeros_batch) 103 | n_negative_satisfied = tf.cast(tf.reduce_sum(n_negative_satisfied), dtype=tf.int32) 104 | n_negative_violated = self.n_negative - n_negative_satisfied 105 | return n_positive_satisfied, n_positive_violated, \ 106 | n_negative_satisfied, n_negative_violated 107 | 108 | def init_default(self): 109 | ''' 110 | Default initialization and nonlinearity parameters 111 | ''' 112 | self.initializer = tf.random_normal_initializer(mean=0., stddev=0.1) 113 | self.activation_common = tf.nn.relu 114 | 115 | def split_predictions(self, output_layer): 116 | ''' 117 | Split the output layer into inequality matrix and vector parameters 118 | ''' 119 | # First n_ineq * self.n_act are G, remaining n_ineq are h 120 | ineq_mat_params = tf.slice(output_layer, [0, 0], [-1, self.n_ineq_mat_params]) 121 | ineq_vec_params = tf.slice(output_layer, [0, self.n_ineq_mat_params], [-1, self.n_ineq_vec_params]) 122 | return ineq_mat_params, ineq_vec_params 123 | 124 | def build_ineq(self, ineq_mat_params, ineq_vec_params): 125 | ''' 126 | Transform ineq predicted parameters into solver-compatible matrices 127 | ''' 128 | ineq_mat = self.build_ineq_mat(ineq_mat_params) 129 | ineq_vec = self.build_ineq_vec(ineq_vec_params, 
ineq_mat) 130 | return ineq_mat, ineq_vec 131 | 132 | def build_ineq_mat(self, ineq_mat_params): 133 | ''' 134 | Reshape predicted parameters and optionally ensure ineq matrix normalization 135 | ''' 136 | ineq_mat_epsilon = 1.e-7 137 | assert not (self.config.spherical_coordinates and self.config.normalize_ineq_mat), 'Cannot have simultaneously spherical coordinates and ineq mat normalization' 138 | if self.config.spherical_coordinates: 139 | ineq_mat = tf.reshape(ineq_mat_params, [-1, self.n_ineq, self.n_act-1]) 140 | self.sc = SphericalCoordinates(self.n_act, input_angles=ineq_mat) 141 | ineq_mat = self.sc.output_unit_vec 142 | else: 143 | ineq_mat = tf.reshape(ineq_mat_params, [-1, self.n_ineq, self.n_act]) 144 | if self.config.normalize_ineq_mat: 145 | # Make each line of G of unit norm 146 | norm_row = tf.norm(ineq_mat, ord='euclidean', axis=-1, keep_dims=True) 147 | norm_row += ineq_mat_epsilon 148 | norm_mat = tf.tile(norm_row, [1, 1, self.n_act]) 149 | ineq_mat = tf.divide(ineq_mat, norm_mat) 150 | return ineq_mat 151 | 152 | def build_ineq_vec(self, ineq_vec_params, ineq_mat): 153 | ''' 154 | Reshape predicted parameters and optionally build an interior point that satisfies all constraints 155 | ''' 156 | ineq_vec = ineq_vec_params 157 | if self.config.predict_interior_point: 158 | # In this case, we split h_pred into x0 and h+ 159 | # with x0 an action and h+ non negative. 160 | # Then: h = g*x0 + h+ 161 | interior_point = tf.slice(ineq_vec, [0, 0], [-1, self.n_act]) 162 | action_space_mid_tensor = tf.convert_to_tensor(self.action_space_mid, dtype=ineq_vec.dtype) 163 | action_space_mid_tensor = tf.expand_dims(action_space_mid_tensor, axis=0) 164 | action_space_mid_tensor = tf.tile(action_space_mid_tensor, [tf.shape(ineq_vec)[0], 1]) 165 | action_space_pm_tensor = tf.convert_to_tensor(self.action_space_pm, dtype=ineq_vec.dtype) 166 | action_space_pm_tensor = tf.expand_dims(action_space_pm_tensor, axis=0) 167 | action_space_pm_tensor = tf.tile(action_space_pm_tensor, [tf.shape(ineq_vec)[0], 1]) 168 | if self.config.interior_point_max >= 0.: 169 | # force the interior point to be in a given domain within the action space 170 | interior_point_low = action_space_mid_tensor - self.config.interior_point_max*action_space_pm_tensor 171 | interior_point_high = action_space_mid_tensor + self.config.interior_point_max*action_space_pm_tensor 172 | interior_point = tf.clip_by_value(interior_point, interior_point_low, interior_point_high) 173 | interior_point = tf.expand_dims(interior_point, axis=-1) 174 | ineq_vec_plus = tf.slice(ineq_vec, [0, self.n_act], [-1, -1]) 175 | ineq_vec_plus = tf.nn.relu(ineq_vec_plus) 176 | zeros_like_ineq_vec_plus = tf.zeros_like(ineq_vec_plus) 177 | if self.config.interior_point_margin_min != 0.: 178 | # each row of h is a scalar, so we use the min action range as basis for the margin 179 | ineq_vec_plus_min_val = self.config.interior_point_margin_min * np.min(self.action_space_pm) 180 | ineq_vec_plus += ineq_vec_plus_min_val 181 | if self.config.interior_point_margin_max > 0.: 182 | # each row of h is a scalar, so we use the min action range as basis for the margin 183 | ineq_vec_plus_max_val = self.config.interior_point_margin_max * np.min(self.action_space_pm) 184 | ineq_vec_plus = tf.clip_by_value(ineq_vec_plus, 0., ineq_vec_plus_max_val) 185 | ineq_vec_interior_point = tf.matmul(ineq_mat, interior_point) 186 | ineq_vec_interior_point = tf.squeeze(ineq_vec_interior_point, axis=-1) 187 | ineq_vec = ineq_vec_interior_point + ineq_vec_plus 188 | else: 189 | 
interior_point = tf.zeros([1, self.n_act, 1], dtype=ineq_vec.dtype) 190 | interior_point = tf.tile(interior_point, [tf.shape(ineq_vec)[0], 1, 1]) 191 | # Reshape into matrices 192 | self.interior_point = interior_point 193 | ineq_vec = tf.expand_dims(ineq_vec, axis=-1) 194 | return ineq_vec 195 | 196 | def build_ineq_margins(self, ineq_mat, ineq_vec, action_tensor, do_squeeze=True): 197 | ''' 198 | Constraints are satisfied if Gx <= h, hence satisfaction margin = max(0, h - Gx) 199 | Constraints are violated if Gx > h, hence violation margin = max(0, Gx - h) 200 | ''' 201 | ineq_diff = ineq_vec - tf.matmul(ineq_mat, action_tensor) 202 | ineq_satisfaction_margin = tf.nn.relu(ineq_diff) 203 | ineq_violation_margin = tf.nn.relu(-ineq_diff) 204 | if do_squeeze: 205 | ineq_diff = tf.squeeze(ineq_diff, axis=-1) 206 | ineq_satisfaction_margin = tf.squeeze(ineq_satisfaction_margin, axis=-1) 207 | ineq_violation_margin = tf.squeeze(ineq_violation_margin, axis=-1) 208 | return ineq_diff, ineq_satisfaction_margin, ineq_violation_margin 209 | 210 | def init_regularization(self): 211 | ''' 212 | Find model weights for regularization 213 | ''' 214 | gr = tf.get_default_graph() 215 | self.model_weights = {name: gr.get_tensor_by_name('{0}/kernel:0'.format(name)) for name in self.layer_names} 216 | 217 | def build_model(self): 218 | raise NotImplementedError('Implement build_model within child classes') 219 | 220 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/pposgd_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from baselines import logger 6 | import baselines.common.tf_util as U 7 | import numpy as np 8 | from ceres.baselines.common import mpi_select 9 | from ceres.baselines.common.mpi_adam_select import MpiAdamSelect 10 | from ceres.baselines.common.mpi_moments_select import mpi_moments_select 11 | from ceres import ConstraintDemonstration, ConstraintDemonstrationTrajectory, ConstraintDemonstrationBuffer 12 | from .ceres_logic import CeresLogic 13 | from .constraint_trainer import ConstraintTrainer 14 | 15 | from ceres.baselines.ppo1.pposgd_simple_helper import build_policy_training_vars, build_counters, adjust_policy_learning_rate, update_policy, log_iter_info, calc_end_training 16 | from .pposgd_ceres_helper import update_constraint_activation_probability, build_policy_observation_filter, build_mpi_vars, save_models_and_data 17 | 18 | def traj_segment_generator(pi, env, horizon, 19 | ceres_logic, is_direct_policy, policy_observation_filter, 20 | stochastic=True, render=False): 21 | ''' 22 | Sample trajectories and collect positive/negative/uncertain demonstrations for constraint learning 23 | ''' 24 | t = 0 25 | ac = env.action_space.sample() # not used, just so we have the datatype 26 | new = True # marks if we're on first timestep of an episode 27 | ob = env.reset() 28 | policy_ob = policy_observation_filter(ob) 29 | if render: 30 | env.render() 31 | i_iteration = 0 32 | 33 | cur_ep_ret = 0 # return in current episode 34 | cur_ep_len = 0 # len of current episode 35 | ep_rets = [] # returns of completed episodes in this segment 36 | ep_lens = [] # lengths of ... 
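Editor's note: the inequality margins and the interior-point construction implemented in constraint_network.py above can be checked with a few lines of NumPy. The sketch below is illustrative only (it is not part of the repository and all numbers are made up); it mirrors the h = G*x0 + h+ construction of build_ineq_vec and the margin computation of build_ineq_margins for a single 2-D action.

import numpy as np

# One constraint set G x <= h with n_ineq = 3, n_act = 2
G = np.array([[ 1.0,  0.0],
              [ 0.0,  1.0],
              [-1.0, -1.0]])

# Interior-point construction: h = G x0 + h_plus with h_plus >= 0,
# which guarantees G x0 <= h, i.e. the predicted set is never empty.
x0 = np.array([0.1, 0.2])
h_plus = np.array([0.5, 0.5, 0.3])
h = G @ x0 + h_plus                              # -> [0.6, 0.7, 0.0]

# Margins as in build_ineq_margins: diff = h - G a
for a in (np.array([0.4, 0.1]), np.array([0.9, 0.0])):
    diff = h - G @ a
    satisfaction_margin = np.maximum(diff, 0.0)  # max(0, h - G a)
    violation_margin = np.maximum(-diff, 0.0)    # max(0, G a - h)
    print(a, satisfaction_margin, violation_margin)

The first action leaves every violation margin at zero while the second violates the first row; this is the per-row information that calc_constraint_score aggregates when counting satisfied and violated positive/negative demonstrations.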
37 | 38 | # Initialize history arrays 39 | obs = np.array([policy_ob for _ in range(horizon)]) 40 | rews = np.zeros(horizon, 'float32') 41 | vpreds = np.zeros(horizon, 'float32') 42 | news = np.zeros(horizon, 'int32') 43 | acs = np.array([ac for _ in range(horizon)]) 44 | prevacs = acs.copy() 45 | 46 | demonstration_trajectory = [] # store sampled demonstrations here 47 | uncertain_demonstration_trajectories = [] # demonstrations that cannot yet be identified as positive or negative will go here 48 | snapshot = env.unwrapped.calc_snapshot() 49 | recovery_info = env.unwrapped.recovery_info 50 | 51 | while True: 52 | prevac = ac 53 | policy_ob = policy_observation_filter(ob) 54 | ac, vpred = pi.act(stochastic, policy_ob) 55 | # Slight weirdness here because we need value function at time T 56 | # before returning segment [0, T-1] so we get the correct 57 | # terminal value 58 | if t > 0 and t % horizon == 0: 59 | res = {'ob' : obs, 'rew' : rews, 'vpred' : vpreds, 'new' : news, 60 | 'ac' : acs, 'prevac' : prevacs, 'nextvpred': vpred * (1 - new), 61 | 'ep_rets' : ep_rets, 'ep_lens' : ep_lens, 62 | } 63 | res['uncertain_demonstration_trajectories'] = uncertain_demonstration_trajectories 64 | yield res 65 | # Be careful!!! if you change the downstream algorithm to aggregate 66 | # several of these batches, then be sure to do a deepcopy 67 | ep_rets = [] 68 | ep_lens = [] 69 | uncertain_demonstration_trajectories = [] 70 | i_iteration += 1 71 | i = t % horizon 72 | obs[i] = policy_ob 73 | vpreds[i] = vpred 74 | news[i] = new 75 | acs[i] = ac 76 | prevacs[i] = prevac 77 | 78 | ob_new, rew, new, info = env.step(ac) 79 | if render: 80 | env.render() 81 | rews[i] = rew 82 | ac_constrained = info[env.unwrapped.info_key_constrained_action] 83 | 84 | ceres_demonstration = ConstraintDemonstration(state=ob, snapshot=snapshot, action=ac_constrained) 85 | demonstration_trajectory.append(ceres_demonstration) 86 | ob = ob_new 87 | snapshot = env.unwrapped.calc_snapshot() 88 | 89 | cur_ep_ret += rew 90 | cur_ep_len += 1 91 | if new: 92 | # Add final demonstration as terminal (state without action) 93 | demonstration_trajectory.append(ConstraintDemonstration(state=ob, snapshot=snapshot, is_terminal=True)) 94 | # Sort demonstrations into positive and negative 95 | ceres_logic.process_trajectory(ConstraintDemonstrationTrajectory(demonstration_trajectory), 96 | info[env.unwrapped.info_key_failure], info[env.unwrapped.info_key_success], 97 | uncertain_demonstration_trajectories, 98 | env.unwrapped.recovery_info, is_direct_policy, 99 | remove_reference_trajectory_if_emptied=True, 100 | increment_reset_count_on_change=i_iteration) 101 | ep_rets.append(cur_ep_ret) 102 | ep_lens.append(cur_ep_len) 103 | # Reset trajectory parameters 104 | cur_ep_ret = 0 105 | cur_ep_len = 0 106 | ob = env.reset() 107 | snapshot = env.unwrapped.calc_snapshot() 108 | if render: 109 | env.render() 110 | demonstration_trajectory = [] 111 | t += 1 112 | 113 | def learn(env, policy_fn, *, 114 | timesteps_per_actorbatch, # timesteps per actor per update 115 | clip_param, entcoeff, # clipping parameter epsilon, entropy coeff 116 | optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers 117 | gamma, lam, # advantage estimation 118 | extra_args, cnet, constraint_demonstration_buffer, 119 | max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint 120 | callback=None, # you can do anything in the callback, since it takes locals(), globals() 121 | adam_epsilon=1e-5, 122 | schedule='constant' # annealing for 
stepsize parameters (epsilon and adam) 123 | ): 124 | 125 | assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, 'Only one time constraint permitted' 126 | 127 | # Train different networks across processes 128 | mpi_comm, mpi_rank, is_direct_policy, mpi_root, mpi_group, mpi_destinations, mpi_n_processes, is_root, cnet_recovery_id_in_direct_exchange_ids, cnet_exchange_ids, n_exchange_processes = build_mpi_vars(extra_args) 129 | 130 | # Setup observation filtering (use partial state information) 131 | policy_ob_space = env.observation_space 132 | policy_ac_space = env.action_space 133 | policy_ob_space, policy_observation_filter = build_policy_observation_filter(extra_args, policy_ob_space) 134 | pi = policy_fn('pi', policy_ob_space, policy_ac_space) # Construct network for new policy 135 | oldpi = policy_fn('oldpi', policy_ob_space, policy_ac_space) # Network for old policy 136 | 137 | # Create policy optimizers 138 | policy_loss_names, policy_var_list, policy_lossandgrad, policy_adam, policy_assign_old_eq_new, policy_compute_losses = build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon) 139 | # Use rank-selective Adam to train direct and recovery with separate data 140 | policy_adam = MpiAdamSelect(mpi_rank, mpi_root, mpi_group, policy_var_list, epsilon=adam_epsilon) 141 | mpi_moments_fn = lambda losses: mpi_moments_select(losses, mpi_rank, mpi_root, mpi_destinations, mpi_n_processes, axis=0) 142 | allgather_fn = lambda x: mpi_select.allgather_select(mpi_comm, mpi_rank, mpi_root, mpi_destinations, x, tag=mpi_root) 143 | 144 | # Constraints 145 | cnet = env.unwrapped.cnet 146 | last_backup_time = None 147 | 148 | constraint_trainer = ConstraintTrainer(extra_args, logger, 149 | cnet, constraint_demonstration_buffer, 150 | mpi_comm, mpi_rank, is_direct_policy, 151 | cnet_recovery_id_in_direct_exchange_ids, 152 | cnet_exchange_ids, n_exchange_processes, 153 | adam_epsilon=adam_epsilon) 154 | 155 | # Enable conservative exploration (force margin w.r.t. constraints) 156 | env.unwrapped.set_ineq_margin(extra_args.conservative_exploration) 157 | # Set initial constraint activation probability to zero 158 | update_constraint_activation_probability(env, extra_args, logger, is_direct_policy, True, 0., 0.) 
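Editor's note: at this point the environment wrapper (not shown in this file, but visible through info_key_constrained_action in the trajectory generator above) is responsible for turning the constraint network's (G, h) prediction into a constrained action. One plausible reading of set_ineq_margin, given the "conservative exploration" comment, is that the bound h is tightened by a fixed margin before the raw policy action is projected onto the feasible set. The sketch below is an independent illustration with made-up values, not the repository's own solver wrapper (ceres/tools/math/qpsolver_quadprog.py); it calls the quadprog package directly.

import numpy as np
import quadprog

def project_action(a_raw, G, h, margin=0.0):
    # Project a_raw onto {x : G x <= h - margin} by solving
    #   minimize 0.5 * ||x - a_raw||^2  subject to  G x <= h - margin.
    # quadprog.solve_qp minimizes 0.5 x^T P x - q^T x s.t. C^T x >= b,
    # so pass P = I, q = a_raw, C = -G^T, b = -(h - margin).
    n_act = a_raw.shape[0]
    P = np.eye(n_act)
    C = -G.T.astype(np.float64)
    b = -(h - margin).astype(np.float64)
    x, *_ = quadprog.solve_qp(P, a_raw.astype(np.float64), C, b)
    return x

G = np.array([[1., 0.], [0., 1.], [-1., -1.]])
h = np.array([0.6, 0.7, 0.0])
a_raw = np.array([0.9, 0.9])              # raw policy action, violates G a <= h
print(project_action(a_raw, G, h))        # projected onto the feasible set
print(project_action(a_raw, G, h, 0.05))  # larger margin = more conservative

Solving this small QP at each step keeps the executed action as close as possible to the policy output while satisfying every predicted inequality, provided the predicted set is non-empty, which the interior-point construction in constraint_network.py ensures.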
159 | 160 | U.initialize() 161 | if len(extra_args.trained_policy) > 0: 162 | pi.restore_model(extra_args.trained_policy) 163 | oldpi.restore_model(extra_args.trained_policy, backup_network_id='pi') 164 | if len(extra_args.trained_cnet) > 0: 165 | cnet.restore_model(extra_args.trained_cnet) 166 | if len(extra_args.constraint_demonstration_buffer) > 0: 167 | constraint_demonstration_buffer.restore_buffer(extra_args.constraint_demonstration_buffer, keep_size=False, verbose=True) 168 | 169 | policy_adam.sync() 170 | constraint_trainer.init() 171 | 172 | # Prepare for rollouts 173 | ceres_logic = CeresLogic(env, constraint_demonstration_buffer, extra_args) 174 | seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, 175 | ceres_logic, 176 | is_direct_policy, 177 | policy_observation_filter, 178 | stochastic=True, 179 | render=(extra_args.render and (mpi_rank == 0))) 180 | 181 | iters_so_far, episodes_so_far, timesteps_so_far, tstart, lenbuffer, rewbuffer = build_counters() 182 | 183 | do_train_policy = True 184 | do_sync_recovery = True 185 | do_train_cnet = True 186 | 187 | if extra_args.n_recovery == 0: 188 | do_sync_recovery = False 189 | do_train_cnet = False # turned back on below when only training constraints 190 | 191 | if extra_args.only_train_constraints: 192 | do_train_policy = False 193 | do_sync_recovery = False 194 | do_train_cnet = True 195 | assert len(extra_args.constraint_demonstration_buffer) > 0, 'A constraint demonstration buffer is required' 196 | max_iters, max_timesteps, max_episodes, max_seconds = 1, 0, 0, 0 197 | 198 | if extra_args.only_train_policy: 199 | do_train_policy = True 200 | do_sync_recovery = False 201 | do_train_cnet = False 202 | 203 | while True: 204 | if callback: callback(locals(), globals()) 205 | 206 | logger.log('********** Begin iteration {0} ************'.format(iters_so_far)) 207 | 208 | n_reference_trajectories_before_sampling = len(env.unwrapped.reference_trajectories) 209 | 210 | if do_train_policy: 211 | # Collect new trajectories and update policy 212 | seg = seg_gen.__next__() 213 | policy_cur_lrmult = adjust_policy_learning_rate(schedule, max_timesteps, timesteps_so_far, max_episodes, episodes_so_far, max_iters, iters_so_far) 214 | vpredbefore, tdlamret, optim_batchsize = update_policy(pi, seg, gamma, lam, 215 | logger, optim_epochs, optim_batchsize, optim_stepsize, policy_cur_lrmult, 216 | policy_loss_names, policy_lossandgrad, policy_adam, policy_assign_old_eq_new, policy_compute_losses, 217 | mpi_moments_fn) 218 | 219 | if do_train_policy and do_sync_recovery: 220 | # Transfer uncertain demonstrations from direct to recovery agents 221 | constraint_trainer.synchronize_recovery_trajectories(env, seg, n_reference_trajectories_before_sampling) 222 | 223 | # Compute constraint losses on the newly collected data, prior to training 224 | do_train_cnet, activation_probability_before = constraint_trainer.prepare_constraint_update(do_train_cnet, iters_so_far) 225 | 226 | # Train constraints on the new data and return final losses 227 | do_train_cnet, activation_probability_after = constraint_trainer.update_constraint_network(do_train_cnet) 228 | 229 | # Change the environment constraint activation probability depending on the constraint prediction accuracy 230 | update_constraint_activation_probability(env, extra_args, logger, is_direct_policy, do_train_cnet, 231 | activation_probability_before, activation_probability_after) 232 | 233 | # Log iteration results 234 | if do_train_policy: 235 | episodes_so_far, timesteps_so_far =
log_iter_info(lenbuffer, rewbuffer, tstart, 236 | vpredbefore, tdlamret, seg, 237 | episodes_so_far, timesteps_so_far, 238 | is_root, allgather_fn) 239 | iters_so_far += 1 240 | end_training = calc_end_training(max_timesteps, timesteps_so_far, 241 | max_episodes, episodes_so_far, 242 | max_iters, iters_so_far, 243 | max_seconds, tstart) 244 | 245 | # Save models and data 246 | last_backup_time = save_models_and_data(extra_args, iters_so_far, end_training, last_backup_time, 247 | is_root, mpi_rank, pi, cnet, constraint_demonstration_buffer) 248 | 249 | if end_training: 250 | break 251 | --------------------------------------------------------------------------------
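Editor's note: tying the training loop back to the plotting utilities earlier in this document, plot_logs_baselines.py reads one monitor.csv per worker (columns r = episode reward, l = episode length, t = wall-clock time), converts episode lengths into cumulative environment steps, and merges the per-worker curves by increasing step count. The snippet below is a self-contained illustration of that merge with made-up numbers; it does not call the PlotLogsBaselines class itself.

import numpy as np

# Per-worker episode logs, as parse_worker_monitor_csv would return them
lengths_w0 = [10, 12, 9]
rewards_w0 = [1.0, 2.0, 3.0]
lengths_w1 = [11, 8, 15]
rewards_w1 = [0.5, 1.5, 2.5]

# Cumulative steps give each episode an x-coordinate, as in calc_plot_sessions
x0 = np.cumsum(lengths_w0)   # [10, 22, 31]
x1 = np.cumsum(lengths_w1)   # [11, 19, 34]

# Merge the two curves on increasing x, the interleaving combine_logs_xy builds
x = np.concatenate([x0, x1])
r = np.concatenate([rewards_w0, rewards_w1])
order = np.argsort(x, kind='stable')
x_merged, r_merged = x[order], r[order]
print(x_merged)   # [10 11 19 22 31 34]
print(r_merged)   # [1.  0.5 1.5 2.  3.  2.5]

The merged reward sequence is presumably what the --plot_average option then smooths with a moving average before plotting.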