├── examples ├── .gitignore ├── media │ ├── maze_average.png │ ├── maze_individual.png │ ├── random_pos_average.png │ ├── random_force_average.png │ ├── random_force_individual.png │ ├── random_pos_individual.png │ └── random_exploration │ │ ├── maze_constrained.gif │ │ ├── maze_unconstrained.gif │ │ ├── random_pos_unconstrained.gif │ │ └── random_force_unconstrained.gif ├── data │ └── Nav2dFixedMaze-500T.npz ├── README.md ├── 01-constraints-from-demonstrations.md └── 02-constraints-from-scratch.md ├── ceres ├── scripts │ ├── .gitignore │ ├── train_ceres.py │ ├── plot_rewards.py │ └── play_policy.py ├── tools │ ├── __init__.py │ ├── plot │ │ ├── __init__.py │ │ ├── plot_config.py │ │ └── plot_logs.py │ ├── io │ │ ├── __init__.py │ │ ├── h5_helper.py │ │ └── extra_args.py │ └── math │ │ ├── __init__.py │ │ ├── qpsolver_quadprog.py │ │ ├── spherical_coordinates.py │ │ └── qpsolver.py ├── __init__.py ├── networks │ ├── __init__.py │ ├── network_saver_mlp.py │ └── network_saver.py ├── envs │ ├── resetter │ │ ├── __init__.py │ │ ├── resetter_env.py │ │ └── resetter_env_ceres.py │ ├── __init__.py │ ├── constrained │ │ ├── __init__.py │ │ ├── constrained_env_fixed.py │ │ ├── constrained_env_network.py │ │ └── constrained_env.py │ ├── nav2d │ │ ├── __init__.py │ │ ├── nav2d_force.py │ │ ├── nav2d_ceres.py │ │ └── obstacles.py │ └── ceres_env.py ├── constraints │ ├── __init__.py │ ├── constraint_config.py │ ├── constraint_network_mlp.py │ ├── constraint_loss.py │ └── constraint_network.py └── baselines │ ├── ceres │ ├── mlp_policy_saver.py │ ├── run_continuous.py │ ├── pposgd_ceres_helper.py │ └── pposgd_ceres.py │ ├── common │ ├── mpi_moments_select.py │ ├── mpi_select.py │ ├── mpi_adam_select.py │ └── plot_logs_baselines.py │ └── ppo1 │ ├── pposgd_simple.py │ └── pposgd_simple_helper.py ├── MAINTAINERS.txt ├── HEADER ├── setup.py ├── LICENSE ├── DCO1.1.txt ├── README.md ├── CONTRIBUTING.md └── CONDUCT.md /examples/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | -------------------------------------------------------------------------------- /ceres/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | demonstrations 2 | cnet 3 | cact 4 | -------------------------------------------------------------------------------- /MAINTAINERS.txt: -------------------------------------------------------------------------------- 1 | Maintainers 2 | 3 | Tu-Hoa Pham ph4m pham@jp.ibm.com 4 | -------------------------------------------------------------------------------- /examples/media/maze_average.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/maze_average.png -------------------------------------------------------------------------------- /examples/media/maze_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/maze_individual.png -------------------------------------------------------------------------------- /examples/data/Nav2dFixedMaze-500T.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/data/Nav2dFixedMaze-500T.npz -------------------------------------------------------------------------------- /examples/media/random_pos_average.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_pos_average.png -------------------------------------------------------------------------------- /examples/media/random_force_average.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_force_average.png -------------------------------------------------------------------------------- /examples/media/random_force_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_force_individual.png -------------------------------------------------------------------------------- /examples/media/random_pos_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_pos_individual.png -------------------------------------------------------------------------------- /examples/media/random_exploration/maze_constrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/maze_constrained.gif -------------------------------------------------------------------------------- /HEADER: -------------------------------------------------------------------------------- 1 | Copyright (c) <%= owner %> <%= years %>. All Rights Reserved. 2 | Project name: <%= name %> 3 | This project is licensed under the MIT License, see LICENSE 4 | -------------------------------------------------------------------------------- /examples/media/random_exploration/maze_unconstrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/maze_unconstrained.gif -------------------------------------------------------------------------------- /examples/media/random_exploration/random_pos_unconstrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/random_pos_unconstrained.gif -------------------------------------------------------------------------------- /examples/media/random_exploration/random_force_unconstrained.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/constrained-rl/HEAD/examples/media/random_exploration/random_force_unconstrained.gif -------------------------------------------------------------------------------- /ceres/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .math import * 6 | from .io import * 7 | -------------------------------------------------------------------------------- /ceres/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .envs import * 6 | from .constraints import * 7 | from .tools import * 8 | -------------------------------------------------------------------------------- /ceres/tools/plot/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .plot_config import PlotConfig 6 | from .plot_logs import PlotLogs 7 | -------------------------------------------------------------------------------- /ceres/networks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .network_saver import NetworkSaver 6 | from .network_saver_mlp import NetworkSaverMLP 7 | 8 | -------------------------------------------------------------------------------- /ceres/tools/io/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .extra_args import ExtraArgs 6 | from .h5_helper import save_dict_as_h5, load_dict_from_h5 7 | -------------------------------------------------------------------------------- /ceres/envs/resetter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .resetter_env import ResetterEnv 6 | from .resetter_env_ceres import ResetterEnvCeres 7 | 8 | -------------------------------------------------------------------------------- /ceres/scripts/train_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | if __name__ == '__main__': 6 | from ceres.baselines.ceres.run_continuous import main 7 | main() 8 | -------------------------------------------------------------------------------- /ceres/envs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .resetter import * 6 | from .constrained import * 7 | from .ceres_env import CeresEnv 8 | from .nav2d import * 9 | -------------------------------------------------------------------------------- /ceres/scripts/plot_rewards.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | if __name__ == '__main__': 6 | from ceres.baselines.common.plot_logs_baselines import main 7 | main() 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='ceres', 6 | version='0.1.0', 7 | packages=['ceres'], 8 | install_requires=[ 9 | 'numpy', 10 | 'matplotlib', 11 | 'baselines', 12 | 'gym', 13 | 'h5py', 14 | 'pygame', 15 | 'quadprog', 16 | ], 17 | ) 18 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Constrained Exploration and Recovery from Experience Shaping - Examples 2 | 3 | 1. [Learning action space constraints from positive and negative demonstrations](01-constraints-from-demonstrations.md): fixed maze 4 | 2. [Learning action space constraints from scratch](02-constraints-from-scratch.md): random obstacles with position and force control 5 | -------------------------------------------------------------------------------- /ceres/tools/math/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .qpsolver import QPSolver 6 | from .qpsolver_quadprog import QPSolverQuadprog 7 | from .spherical_coordinates import SphericalCoordinates 8 | -------------------------------------------------------------------------------- /ceres/envs/constrained/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .constrained_env import ConstrainedEnv 6 | from .constrained_env_fixed import ConstrainedEnvFixed 7 | from .constrained_env_network import ConstrainedEnvNetwork 8 | 9 | -------------------------------------------------------------------------------- /ceres/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .constraint_config import ConstraintConfig 6 | from .constraint_demonstration import ConstraintDemonstration, ConstraintDemonstrationTrajectory, ConstraintDemonstrationBuffer 7 | from .constraint_network import ConstraintNetwork 8 | from .constraint_network_mlp import ConstraintNetworkMLP 9 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/mlp_policy_saver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from baselines.ppo1.mlp_policy import MlpPolicy 6 | from ceres.networks import NetworkSaverMLP 7 | 8 | class MlpPolicySaver(MlpPolicy, NetworkSaverMLP): 9 | 10 | ''' 11 | Baselines MlpPolicy with save / restore functions 12 | ''' 13 | 14 | def __init__(self, name, *args, session=None, **kwargs): 15 | MlpPolicy.__init__(self, name, *args, **kwargs) 16 | NetworkSaverMLP.__init__(self, network_id=name) 17 | -------------------------------------------------------------------------------- /ceres/envs/constrained/constrained_env_fixed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .constrained_env import ConstrainedEnv 6 | 7 | class ConstrainedEnvFixed(ConstrainedEnv): 8 | ''' 9 | A base class for constrained environments with fixed constraints. 10 | ''' 11 | 12 | def __init__(self, *args, **kwargs): 13 | super().__init__(*args, **kwargs) 14 | self.init_ineq_matrices() 15 | 16 | def update_ineq_matrices(self, state): 17 | ''' 18 | This purposely does nothing since ineq matrices only need to be built once 19 | ''' 20 | pass 21 | 22 | def init_ineq_matrices(self): 23 | ''' 24 | Define fixed ineq matrices here 25 | ''' 26 | raise NotImplementedError('Implement init_ineq_matrices in subclass {0}'.format(type(self))) 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT license 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from gym.envs.registration import register 6 | from .nav2d_ceres import * 7 | 8 | # We list all used Nav2d environments here and call them by '-v0' 9 | gwenv_list = [] 10 | gwenv_list.append('Nav2dPosCeres') 11 | gwenv_list.append('Nav2dPosFixedMazeCeres') 12 | gwenv_list.append('Nav2dPosRandomHolesCeres') 13 | gwenv_list.append('Nav2dForceCeres') 14 | gwenv_list.append('Nav2dForceFixedMazeCeres') 15 | gwenv_list.append('Nav2dForceRandomHolesCeres') 16 | gwenv_list.append('Nav2dPosFixedMazeCeres5N') 17 | gwenv_list.append('Nav2dPosRandomHolesCeres5N') 18 | gwenv_list.append('Nav2dForceRandomHolesCeres5N') 19 | 20 | for gwenv in gwenv_list: 21 | env = locals()[gwenv] 22 | register( 23 | id='{0}-v0'.format(gwenv), 24 | entry_point='ceres.envs.nav2d:{0}'.format(gwenv), 25 | max_episode_steps=env.max_episode_steps, 26 | reward_threshold=100.0, 27 | ) 28 | -------------------------------------------------------------------------------- /ceres/tools/plot/plot_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | class PlotConfig(object): 6 | ''' 7 | Plot parameters, optionally built from ExtraArgs objects 8 | ''' 9 | 10 | def __init__(self, n_average=201, timesteps_per_iteration=1024): 11 | self.n_average = n_average 12 | self.color_rewards_ind = 'b' 13 | self.color_rewards_avg = 'r' 14 | self.color_rewards_std = 'b' 15 | self.label_y = 'Reward' 16 | self.set_timesteps_per_iteration(timesteps_per_iteration) 17 | 18 | @classmethod 19 | def from_extra_args(cls, extra_args): 20 | plot_config = cls(n_average=extra_args.plot_average, 21 | timesteps_per_iteration=(extra_args.n_direct * extra_args.timesteps_per_actorbatch)) 22 | return plot_config 23 | 24 | def set_timesteps_per_iteration(self, timesteps_per_iteration): 25 | self.timesteps_per_iteration = timesteps_per_iteration 26 | if self.timesteps_per_iteration == 1: 27 | self.label_x_iterations = 'Timesteps' 28 | else: 29 | self.label_x_iterations = 'Iterations [{0} timesteps]'.format(self.timesteps_per_iteration) 30 | 31 | -------------------------------------------------------------------------------- /DCO1.1.txt: -------------------------------------------------------------------------------- 1 | Developer's Certificate of Origin 1.1 2 | 3 | By making a contribution to this project, I certify that: 4 | 5 | (a) The contribution was created in whole or in part by me and I 6 | have the right to submit it under the open source license 7 | indicated in the file; or 8 | 9 | (b) The contribution is based upon previous work that, to the best 10 | of my knowledge, is covered under an appropriate open source 11 | license and I have the right under that license to submit that 12 | work with modifications, whether created in whole or in part 13 | by me, under the same open source license (unless I am 14 | permitted to submit under a different license), as indicated 15 | in the file; or 16 | 17 | (c) The contribution was provided directly to me by some other 18 | person who certified (a), (b) or (c) and I have not modified 19 | it. 
20 | 21 | (d) I understand and agree that this project and the contribution 22 | are public and that a record of the contribution (including all 23 | personal information I submit with it, including my sign-off) is 24 | maintained indefinitely and may be redistributed consistent with 25 | this project or the open source license(s) involved. 26 | -------------------------------------------------------------------------------- /ceres/networks/network_saver_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | from .network_saver import NetworkSaver 7 | 8 | class NetworkSaverMLP(NetworkSaver): 9 | ''' 10 | A simple multilayer perceptron with save / restore functions 11 | ''' 12 | 13 | def build_model(self, observation, n_outputs, 14 | hidden_layers, 15 | kernel_initializer, 16 | activation_hidden): 17 | var_names_begin = [_v.name for _v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)] 18 | self.hidden_layers = hidden_layers 19 | assert len(self.hidden_layers) > 0 20 | self.dense_layers = [] 21 | last_layer = observation 22 | self.layer_names = [] 23 | for i_layer, layer_size in enumerate(self.hidden_layers): 24 | layer_name = '{0}dense_{1}'.format(self.tf_var_prefix, i_layer) 25 | self.layer_names.append(layer_name) 26 | dense_layer = tf.layers.dense(inputs=last_layer, units=layer_size, activation=None, kernel_initializer=kernel_initializer, name=layer_name) 27 | dense_layer = activation_hidden(dense_layer) 28 | self.dense_layers.append(dense_layer) 29 | last_layer = dense_layer 30 | layer_name = '{0}dense_{1}'.format(self.tf_var_prefix, 'output') 31 | self.layer_names.append(layer_name) 32 | output_layer = tf.layers.dense(inputs=last_layer, units=n_outputs, kernel_initializer=kernel_initializer, name=layer_name) 33 | self.dense_layers.append(output_layer) 34 | var_names_end = [_v.name for _v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)] 35 | self.network_var_names = [_v for _v in var_names_end if not (_v in var_names_begin)] 36 | return output_layer 37 | 38 | -------------------------------------------------------------------------------- /ceres/tools/io/h5_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | import h5py 7 | 8 | ''' 9 | Helper functions for writing and loading data in the HDF5 format 10 | ''' 11 | 12 | def save_dict_as_h5(d, path_save, confirm_overwrite=True, verbose=False): 13 | assert type(d) == dict, 'Invalid dictionary argument: {0}'.format(d) 14 | assert type(path_save) == str, 'Invalid path argument: {0}'.format(path_save) 15 | assert os.path.isdir(os.path.dirname(path_save)), 'Directory for save path does not exist: {0}'.format(path_save) 16 | if os.path.isfile(path_save): 17 | if confirm_overwrite: 18 | if input('File exists: {0}\nOverwrite?[y/N]\n'.format(path_save)).lower() != 'y': 19 | print('Cancel write') 20 | return False 21 | os.remove(path_save) 22 | # Check that no nested dictionary 23 | with h5py.File(path_save, 'w') as h5f: 24 | write_dict(h5f, d) 25 | if verbose: 26 | print('Wrote backup: {0}'.format(path_save)) 27 | return True 28 | 29 | def write_dict(h5f, d): 30 | for _k, _v in d.items(): 31 | if type(_v) == dict: 32 | grp = h5f.create_group(_k) 33 | write_dict(grp, _v) 34 | else: 35 | h5f.create_dataset(_k, data=_v) 36 | 37 | def load_dict_from_h5(path_save, verbose=False): 38 | d = {} 39 | with h5py.File(path_save, 'r') as h5f: 40 | read_dict(h5f, d) 41 | if verbose: 42 | print('Loaded {0} from backup: {0}'.format(','.join(d.keys()))) 43 | return d 44 | 45 | def read_dict(h5f, d): 46 | for _k, _v in h5f.items(): 47 | if isinstance(_v, h5py.Dataset): 48 | d[_k] = _v[()] 49 | else: 50 | assert isinstance(_v, h5py.Group) 51 | d[_k] = {} 52 | read_dict(_v, d[_k]) 53 | 54 | -------------------------------------------------------------------------------- /ceres/baselines/common/mpi_moments_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from mpi4py import MPI 6 | import numpy as np 7 | from . 
import mpi_select 8 | 9 | def mpi_mean_select(x, rank, root, destinations, n_processes, 10 | axis=0, comm=None, keepdims=False): 11 | ''' 12 | Compute a mean on a selection of processes instead of all 13 | ''' 14 | x = np.asarray(x) 15 | assert x.ndim > 0 16 | if comm is None: comm = MPI.COMM_WORLD 17 | xsum = x.sum(axis=axis, keepdims=keepdims) 18 | n = xsum.size 19 | localsum = np.zeros(n+1, x.dtype) 20 | localsum[:n] = xsum.ravel() 21 | localsum[n] = x.shape[axis] 22 | #globalsum = np.zeros_like(localsum) 23 | #comm.Allreduce(localsum, globalsum, op=MPI.SUM) 24 | globalsum = mpi_select.Allreduce_select(comm, rank, root, destinations, localsum, tag_reduce=root, tag_bcast=root + n_processes) 25 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 26 | 27 | def mpi_moments_select(x, rank, root, destinations, n_processes, 28 | axis=0, comm=None, keepdims=False): 29 | ''' 30 | Compute a mean on a selection of processes instead of all 31 | ''' 32 | x = np.asarray(x) 33 | assert x.ndim > 0 34 | mean, count = mpi_mean_select(x, rank, root, destinations, n_processes, 35 | axis=axis, comm=comm, keepdims=True) 36 | sqdiffs = np.square(x - mean) 37 | meansqdiff, count1 = mpi_mean_select(sqdiffs, 38 | rank, root, destinations, n_processes, 39 | axis=axis, comm=comm, keepdims=True) 40 | assert count1 == count 41 | std = np.sqrt(meansqdiff) 42 | if not keepdims: 43 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 44 | mean = mean.reshape(newshape) 45 | std = std.reshape(newshape) 46 | return mean, std, count 47 | 48 | 49 | -------------------------------------------------------------------------------- /ceres/baselines/common/mpi_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | 7 | ''' 8 | These reproduce the behavior of MPI functions, except applied to a subset of available processes. 9 | In addition, all functions return something instead of acting on argument buffers directly. 
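For example (a usage sketch, not part of the original module; `comm`, `rank`, `root`, `destinations` and `n_processes` are assumed to be set up as in MpiAdamSelect), a gradient average over the group formed by the root and its destinations could be computed as:

    summed = Allreduce_select(comm, rank, root, destinations, local_grad,
                              tag_reduce=root, tag_bcast=root + n_processes)
    mean_grad = summed / (len(destinations) + 1)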
10 | ''' 11 | 12 | def Bcast_select(comm, rank, root, destinations, data_buffer, tag=0): 13 | if rank == root: 14 | for dest_rank in destinations: 15 | comm.send(data_buffer, dest=dest_rank, tag=tag) 16 | return data_buffer 17 | else: 18 | recv_buffer = comm.recv(source=root, tag=tag) 19 | assert len(recv_buffer) == len(data_buffer) 20 | return recv_buffer 21 | 22 | def Reduce_select(comm, rank, root, destinations, val_buffer, sum_buffer=None, tag=0): 23 | if sum_buffer is None: 24 | sum_buffer = np.zeros_like(val_buffer) 25 | if rank == root: 26 | sum_buffer += val_buffer 27 | for dest_rank in destinations: 28 | recv_buffer = comm.recv(source=dest_rank, tag=root) 29 | sum_buffer += recv_buffer 30 | else: 31 | comm.send(val_buffer, dest=root, tag=root) 32 | return sum_buffer 33 | 34 | def Allreduce_select(comm, rank, root, destinations, val_buffer, tag_reduce=0, tag_bcast=0): 35 | assert tag_reduce != tag_bcast 36 | sum_buffer = Reduce_select(comm, rank, root, destinations, val_buffer, tag=tag_reduce) 37 | sum_buffer = Bcast_select(comm, rank, root, destinations, sum_buffer, tag=tag_bcast) 38 | return sum_buffer 39 | 40 | def allgather_select(comm, rank, root, destinations, data_buffer, tag=0): 41 | # Gather everything to root 42 | index_map = {_v: _i for _i, _v in enumerate([root] + list(destinations))} 43 | gather_buffer = [None] * len(index_map) 44 | if rank == root: 45 | gather_buffer[index_map[root]] = data_buffer 46 | for dest_rank in destinations: 47 | recv_buffer = comm.recv(source=dest_rank, tag=tag) 48 | gather_buffer[index_map[dest_rank]] = recv_buffer 49 | else: 50 | comm.send(data_buffer, dest=root, tag=tag) 51 | # Broadcast to destinations 52 | allgather_buffer = Bcast_select(comm, rank, root, destinations, gather_buffer, tag=tag) 53 | return allgather_buffer 54 | 55 | -------------------------------------------------------------------------------- /ceres/baselines/common/mpi_adam_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | ### Control reduce / broadcast at the process level 6 | 7 | from mpi4py import MPI 8 | import baselines.common.tf_util as U 9 | import tensorflow as tf 10 | import numpy as np 11 | 12 | from baselines.common.mpi_adam import MpiAdam 13 | from . 
import mpi_select 14 | 15 | class MpiAdamSelect(MpiAdam): 16 | ''' 17 | Extend MpiAdam with parallelization across only a selection of processes (direct or recovery) instead of all 18 | ''' 19 | 20 | def __init__(self, rank, root, group, var_list, *args, select_params=None, **kwargs): 21 | super().__init__(var_list, *args, **kwargs) 22 | self.init_select(rank, root, group) 23 | 24 | def init_select(self, rank, root, group): 25 | self.rank = rank 26 | self.root = root 27 | self.group = group 28 | self.destinations = [_e for _e in self.group if _e != self.root] 29 | self.n_processes = self.comm.Get_size() 30 | 31 | def update(self, localg, stepsize): 32 | if self.t % 100 == 0: 33 | self.check_synced() 34 | localg = localg.astype('float32') 35 | globalg = mpi_select.Allreduce_select(self.comm, self.rank, self.root, self.destinations, localg, tag_reduce=self.root, tag_bcast=self.root + self.n_processes) 36 | 37 | if self.scale_grad_by_procs: 38 | globalg /= len(self.group) 39 | 40 | self.t += 1 41 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 42 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 43 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 44 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 45 | self.setfromflat(self.getflat() + step) 46 | 47 | def sync(self): 48 | theta = self.getflat() 49 | theta = mpi_select.Bcast_select(self.comm, self.rank, self.root, self.destinations, theta, tag=self.root) 50 | self.setfromflat(theta) 51 | 52 | def check_synced(self): 53 | if self.rank == self.root: # this is root 54 | theta = self.getflat() 55 | theta = mpi_select.Bcast_select(self.comm, self.rank, self.root, self.destinations, theta, tag=self.root) 56 | else: 57 | thetalocal = self.getflat() 58 | thetaroot = np.empty_like(thetalocal) 59 | thetaroot = mpi_select.Bcast_select(self.comm, self.rank, self.root, self.destinations, thetaroot, tag=self.root) 60 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 61 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/nav2d_force.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | from .nav2d_pos import Nav2dPos 7 | 8 | class Nav2dForce(Nav2dPos): 9 | ''' 10 | Control an agent to navigate to a point by force commands 11 | ''' 12 | 13 | max_vel = 0.10 14 | max_acc = 0.05 15 | delta_time = 1. # one action per frame 16 | do_clip_vel = True 17 | do_randomize_agent_vel = True 18 | agent_mass = 1. 
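    # The class attributes above parameterize simple point-mass dynamics: in do_action() below, the
    # played command is scaled by agent_mass to give an acceleration, velocity and position are then
    # integrated with explicit Euler steps of length delta_time, and the velocity norm is clipped to
    # max_vel when do_clip_vel is set. For example, with agent_mass = 1, max_acc = 0.05 and
    # delta_time = 1, a single action shifts each velocity component by at most 0.05 (assuming the
    # command stays within the declared [-max_acc, max_acc] action bounds).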
19 | 20 | def setup_actions(self): 21 | self.add_action('agent_set_acc_x', -self.max_acc, self.max_acc) 22 | self.add_action('agent_set_acc_y', -self.max_acc, self.max_acc) 23 | 24 | def setup_observations_task_specific(self): 25 | super().setup_observations_task_specific() 26 | # Agent velocity 27 | self.agent_vel_x_range = [-self.max_vel, self.max_vel] 28 | self.agent_vel_y_range = [-self.max_vel, self.max_vel] 29 | self.add_observation('agent_vel_x', self.agent_vel_x_range[0], self.agent_vel_x_range[1]) 30 | self.add_observation('agent_vel_y', self.agent_vel_y_range[0], self.agent_vel_y_range[1]) 31 | 32 | def do_action(self): 33 | self.agent_acc_x, self.agent_acc_y = self.agent_mass * self.command_play 34 | self.agent_vel_x += self.agent_acc_x*self.delta_time 35 | self.agent_vel_y += self.agent_acc_y*self.delta_time 36 | if self.do_clip_vel: 37 | self.agent_vel_x, self.agent_vel_y = self.clip_vector_by_norm([self.agent_vel_x, self.agent_vel_y], self.max_vel) 38 | self.agent_pos_x += self.agent_vel_x*self.delta_time 39 | self.agent_pos_y += self.agent_vel_y*self.delta_time 40 | 41 | 42 | def clip_command(self, a): 43 | if self.do_clip_command: 44 | return self.clip_vector_by_norm(a, self.max_acc) 45 | else: 46 | return a 47 | 48 | def fill_state_task_specific(self, state): 49 | super().fill_state_task_specific(state) 50 | state[self.observation_index['agent_vel_x']] = self.agent_vel_x 51 | state[self.observation_index['agent_vel_y']] = self.agent_vel_y 52 | 53 | def reset_agent(self): 54 | super().reset_agent() 55 | self.reset_agent_vel() 56 | 57 | def reset_agent_vel(self): 58 | if self.do_randomize_agent_vel: 59 | agent_vel_norm = np.random.rand()*self.max_vel 60 | agent_vel_angle = np.random.rand()*2.*np.pi 61 | self.agent_vel_x = agent_vel_norm*np.cos(agent_vel_angle) 62 | self.agent_vel_y = agent_vel_norm*np.sin(agent_vel_angle) 63 | else: 64 | self.agent_vel_x = 0 65 | self.agent_vel_y = 0 66 | 67 | -------------------------------------------------------------------------------- /ceres/envs/resetter/resetter_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import gym 6 | import numpy as np 7 | 8 | class SnapshotInfo(object): 9 | ''' 10 | A simple class to store snapshot metadata 11 | ''' 12 | __slots__ = ['i_trajectory', 'i_state', 'action_level', 'action_weight'] 13 | def __init__(self, i_trajectory=None, i_state=None, action_level=None, action_weight=None): 14 | self.i_trajectory = i_trajectory 15 | self.i_state = i_state 16 | self.action_level = action_level 17 | self.action_weight = action_weight 18 | 19 | 20 | class ResetterEnv(gym.Env): 21 | ''' 22 | A base class to uniformize trajectory snapshotting and restoring. 
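    For example, a subclass reset could be structured as follows (a sketch, not part of this class;
    the return value of reset_and_restore is up to the implementing environment):
        snapshot, snapshot_info = self.get_random_reference_snapshot()
        observation = self.reset_and_restore(snapshot)
    which restores the environment to a previously recorded reference state before continuing the episode.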
23 | Some functions must be implemented either by the base RL environment (e.g., Nav2d) 24 | or by a task-specific resetter environment (e.g., ResetterEnvCeres) 25 | ''' 26 | max_reference_steps_per_episode = -1 27 | 28 | def __init__(self): 29 | super(ResetterEnv, self).__init__() 30 | self.init_reference() 31 | 32 | def init_reference(self): 33 | ''' 34 | Define the type of reference snapshots to restore the environment to 35 | ''' 36 | self._init_reference_parameters() 37 | self._init_reference_trajectories() 38 | 39 | def get_random_reference_index(self): 40 | ''' 41 | Pick a random trajectory, then a random state within that trajectory 42 | ''' 43 | i_traj = np.random.randint(0, len(self.reference_trajectories)) 44 | i_state = np.random.randint(0, len(self.reference_trajectories[i_traj])) 45 | return i_traj, i_state 46 | 47 | def get_random_reference_snapshot(self): 48 | ''' 49 | Get a random snapshot with the associated metadata 50 | ''' 51 | i_traj, i_state = self.get_random_reference_index() 52 | snapshot = self.reference_trajectories[i_traj][i_state].snapshot 53 | assert snapshot is not None 54 | snapshot_info = SnapshotInfo(i_trajectory=i_traj, i_state=i_state) 55 | return snapshot, snapshot_info 56 | 57 | def reset_random(self): 58 | raise NotImplementedError('Implement this in environment class {0}'.format(type(self))) 59 | 60 | def reset_and_restore(self, snapshot): 61 | raise NotImplementedError('Implement this in environment class {0}'.format(type(self))) 62 | 63 | def _init_reference_parameters(self): 64 | raise NotImplementedError('Implement this in resetter class {0}'.format(type(self))) 65 | 66 | def _init_reference_trajectories(self): 67 | raise NotImplementedError('Implement this in resetter class {0}'.format(type(self))) 68 | 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Constrained Exploration and Recovery from Experience Shaping 2 | Constrained Exploration and Recovery from Experience Shaping is an algorithm for model-free reinforcement learning to actively reshape the action space of an agent during training so that reward-driven exploration is constrained within safety limits. 3 | 4 | This repository accompanies the following paper on arXiv: https://arxiv.org/abs/1809.08925 5 | 6 | | Unconstrained Random Exploration | Constrained Random Exploration | 7 | :-------------------------:|:-------------------------: 8 | | 9 | 10 | ## Installing 11 | 12 | This implementation requires Python 3 and relies on Tensorflow for building and training constraint networks. 13 | Depending on your setup, run: 14 | ``` 15 | pip install tensorflow-gpu 16 | ``` 17 | if you have a CUDA-compatible device or: 18 | ``` 19 | pip install tensorflow 20 | ``` 21 | 22 | For training constraint networks together with control policies, we built on top of the [OpenAI Baselines framework](https://github.com/openai/baselines/). 23 | Install it with: 24 | ``` 25 | pip install baselines 26 | ``` 27 | We will maintain compatibility with the OpenAI Baselines ```master``` branch (last confirmed check on 2018-09-08: [commit](https://github.com/openai/baselines/commit/58b1021b28345a902ea20cb99ac0fe3914ee4171)), though feel free to create an [issue](https://github.com/IBM/constrained-rl/issues) if you notice something wrong. 28 | 29 | Quadratic program solving is performed using quadprog. 
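quadprog solves problems of the form: minimize 0.5 * x^T G x - a^T x subject to C^T x >= b (with optional equality constraints). Once it is installed as described below, a quick sanity check in Python could look like this (a minimal sketch, independent of the CERES code):
```
import numpy as np
import quadprog

G = np.eye(2)                   # objective: minimize 0.5 * x^T G x - a^T x
a = np.array([1., 1.])
C = np.eye(2)                   # constraints: C^T x >= b, i.e. x >= 0
b = np.zeros(2)
print(quadprog.solve_qp(G, a, C, b, 0)[0])   # expected: [1. 1.]
```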
30 | Install Cython first: 31 | ``` 32 | pip install Cython 33 | ``` 34 | Then: 35 | ``` 36 | pip install quadprog 37 | ``` 38 | 39 | Finally, clone this repository and install the local package with pip: 40 | ``` 41 | git clone git@github.com:IBM/constrained-rl.git 42 | cd constrained-rl 43 | pip install -e . 44 | ``` 45 | 46 | ## Examples 47 | Examples and reference data are provided in the [examples](examples) directory: 48 | 1. [Learning action space constraints from positive and negative demonstrations](examples/01-constraints-from-demonstrations.md): fixed maze 49 | 2. [Learning action space constraints from scratch](examples/02-constraints-from-scratch.md): random obstacles with position and force control 50 | 51 | ## License 52 | The Constrained Exploration and Recovery from Experience Shaping Project uses the [MIT](LICENSE) software license. 53 | 54 | ## Contributing to the project 55 | Full details of how to contribute to this project are documented in the [CONTRIBUTING.md](CONTRIBUTING.md) file. 56 | 57 | ## Maintainers 58 | The project's [maintainers](MAINTAINERS.txt) are responsible for reviewing and merging all pull requests, and they guide the overall technical direction of the project. 59 | 60 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | from ceres.tools.io.h5_helper import save_dict_as_h5, load_dict_from_h5 7 | 8 | class ConstraintConfig(object): 9 | ''' 10 | Constraint network configuration with save and restore functions 11 | ''' 12 | 13 | valid_param = ['mlp_hidden_layers', 14 | 'n_ineq', 15 | 'loss_weights', 16 | 'spherical_coordinates', 17 | 'normalize_ineq_mat', 18 | 'predict_interior_point', 19 | 'interior_point_margin_min', 20 | 'interior_point_margin_max', 21 | 'interior_point_max'] 22 | cnet_config_filename = 'cnet_config.h5' 23 | 24 | def __init__(self, **kwargs): 25 | self.set_default() 26 | self.set(**kwargs) 27 | 28 | def set_default(self): 29 | self.spherical_coordinates = False 30 | self.normalize_ineq_mat = False 31 | self.predict_interior_point = False 32 | self.interior_point_margin_min = 0. 33 | self.interior_point_margin_max = 0. 34 | self.interior_point_max = 0.
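        # mlp_hidden_layers and n_ineq have no defaults here: they are expected to be passed in
        # explicitly (e.g. via from_extra_args below), since they size the constraint network and the
        # number of predicted inequality constraints.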
35 | self.loss_weights = {} 36 | 37 | def set(self, **kwargs): 38 | for key, value in kwargs.items(): 39 | assert key in self.valid_param, 'Invalid parameter type {0}'.format(key) 40 | setattr(self, key, value) 41 | 42 | def save(self, path_save): 43 | d = self.__dict__ 44 | assert os.path.isdir(path_save), 'Config save function only takes a directory as input' 45 | path_save = os.path.join(path_save, self.cnet_config_filename) 46 | save_dict_as_h5(d, path_save, verbose=True) 47 | 48 | @classmethod 49 | def from_backup(cls, path_save): 50 | if os.path.isdir(path_save): 51 | path_cnet_dir = path_save 52 | else: 53 | path_cnet_dir = os.path.dirname(path_save) 54 | path_cnet_config = os.path.join(path_cnet_dir, cls.cnet_config_filename) 55 | d = load_dict_from_h5(path_cnet_config, verbose=False) 56 | cnet_config = cls(**d) 57 | return cnet_config 58 | 59 | @classmethod 60 | def from_extra_args(cls, args): 61 | cnet_config = cls(mlp_hidden_layers=args.cnet_hidden_layers, 62 | n_ineq=args.cnet_n_ineq, 63 | loss_weights=args.cnet_loss_weights, 64 | spherical_coordinates=args.cnet_spherical_coordinates, 65 | normalize_ineq_mat=args.cnet_normalize_ineq_mat, 66 | predict_interior_point=args.cnet_predict_interior_point, 67 | interior_point_margin_min=args.cnet_interior_point_margin_min, 68 | interior_point_margin_max=args.cnet_interior_point_margin_max, 69 | interior_point_max=args.cnet_interior_point_max, 70 | ) 71 | return cnet_config 72 | 73 | 74 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/nav2d_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from ceres.envs import CeresEnv 6 | from .obstacles import ObstacleSquare, ObstacleCircle 7 | from .nav2d_pos import Nav2dPos 8 | from .nav2d_force import Nav2dForce 9 | from .nav2d_rendering import Nav2dRendering 10 | from .nav2d_obstacles import Nav2dObstacles 11 | import numpy as np 12 | 13 | class FixedMaze(object): 14 | ''' 15 | Fixed square obstacles defining a maze 16 | ''' 17 | fixed_obstacles = [ 18 | ObstacleSquare(top_left_x=-0.70, top_left_y = 0.70, bottom_right_x=0.70, bottom_right_y=0.35), 19 | ObstacleSquare(top_left_x=-1.00, top_left_y = -0.35, bottom_right_x=-0.30, bottom_right_y=-0.70), 20 | ObstacleSquare(top_left_x=0.30, top_left_y = -0.35, bottom_right_x=1.00, bottom_right_y=-0.70), 21 | ObstacleSquare(top_left_x=-1.00, top_left_y = 0.05, bottom_right_x=-0.65, bottom_right_y=-0.05), 22 | ObstacleSquare(top_left_x=0.65, top_left_y = 0.05, bottom_right_x=1.00, bottom_right_y=-0.05), 23 | ObstacleSquare(top_left_x=-0.40, top_left_y = 0.05, bottom_right_x=0.40, bottom_right_y=-0.05), 24 | ObstacleSquare(top_left_x=-0.05, top_left_y = 0.40, bottom_right_x=0.05, bottom_right_y=-0.40), 25 | ObstacleSquare(top_left_x=-0.05, top_left_y = -0.70, bottom_right_x=0.05, bottom_right_y=-1.00), 26 | ] 27 | 28 | class RandomHoles(object): 29 | ''' 30 | Circle obstacles randomized for every episode 31 | ''' 32 | n_random_circle_obstacles = 10 33 | random_circle_obstacle_dim_range = [0.10, 0.25] 34 | is_state_target_rel_pos = True 35 | state_lidar_angles = np.linspace(0., 2.*np.pi, 8, endpoint=False) 36 | 37 | class Nav2dPosCeres(Nav2dRendering, Nav2dPos, CeresEnv): 38 | max_reference_trajectories = 1024 39 | max_recovery_steps = 5 40 | 41 | class 
Nav2dPosFixedMazeCeres(FixedMaze, Nav2dRendering, Nav2dObstacles, Nav2dPos, CeresEnv): 42 | max_reference_trajectories = 1024 43 | max_recovery_steps = 5 44 | 45 | class Nav2dPosFixedMazeCeres5N(Nav2dPosFixedMazeCeres): 46 | max_normalized_obs = 5. 47 | max_normalized_act = 5. 48 | 49 | class Nav2dPosRandomHolesCeres(RandomHoles, Nav2dRendering, Nav2dObstacles, Nav2dPos, CeresEnv): 50 | max_reference_trajectories = 1024 51 | max_recovery_steps = 5 52 | 53 | class Nav2dPosRandomHolesCeres5N(Nav2dPosRandomHolesCeres): 54 | max_normalized_obs = 5. 55 | max_normalized_act = 5. 56 | 57 | class Nav2dForceCeres(Nav2dRendering, Nav2dForce, CeresEnv): 58 | max_reference_trajectories = 1024 59 | max_recovery_steps = 10 60 | 61 | class Nav2dForceFixedMazeCeres(FixedMaze, Nav2dRendering, Nav2dObstacles, Nav2dForce, CeresEnv): 62 | max_reference_trajectories = 1024 63 | max_recovery_steps = 10 64 | 65 | class Nav2dForceRandomHolesCeres(RandomHoles, Nav2dRendering, Nav2dObstacles, Nav2dForce, CeresEnv): 66 | max_reference_trajectories = 1024 67 | max_recovery_steps = 10 68 | 69 | class Nav2dForceRandomHolesCeres5N(Nav2dForceRandomHolesCeres): 70 | max_normalized_obs = 5. 71 | max_normalized_act = 5. 72 | -------------------------------------------------------------------------------- /ceres/envs/constrained/constrained_env_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from ceres.constraints import ConstraintNetworkMLP, ConstraintConfig 8 | from .constrained_env import ConstrainedEnv 9 | 10 | class ConstrainedEnvNetwork(ConstrainedEnv): 11 | ''' 12 | Environment with constraints predicted by a constraint network. 13 | Make sure to call init_constraint_prediction before running it. 
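    Typical usage (a sketch; it assumes the chosen environment derives from this class, e.g. one of
    the Nav2d CERES environments, and the backup path is a placeholder):
        env = gym.make('Nav2dPosFixedMazeCeres-v0')
        env.unwrapped.init_constraint_prediction('/path/to/trained/cnet')
    After that, update_ineq_matrices(state) fills ineq_mat and ineq_vec from the network prediction.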
14 | ''' 15 | 16 | def __init__(self, *args, **kwargs): 17 | super(ConstrainedEnvNetwork, self).__init__(*args, **kwargs) 18 | self.ineq_mat_params, self.ineq_vec_params = None, None 19 | self.is_initialized_constraint_prediction = False 20 | 21 | def init_constraint_prediction(self, cnet, session=None): 22 | ''' 23 | Initialize constraint network, either from a ConstraintNetworkMLP object or a path to a trained network backup 24 | ''' 25 | if session is None: 26 | tf_config = tf.ConfigProto() 27 | tf_config.gpu_options.allow_growth = True 28 | self.cnet_session = tf.Session(config=tf_config) 29 | else: 30 | self.cnet_session = session 31 | 32 | if type(cnet) == str: # path to constraint network backup 33 | cnet_config = ConstraintConfig.from_backup(cnet) 34 | self.cnet = ConstraintNetworkMLP(self.observation_space, self.action_space, cnet_config) 35 | self.cnet.restore_model(cnet, session=self.cnet_session) 36 | else: 37 | assert isinstance(cnet, ConstraintNetworkMLP) 38 | self.cnet = cnet 39 | self.is_initialized_constraint_prediction = True 40 | 41 | def update_ineq_matrices(self, state): 42 | ''' 43 | Predict ineq matrices by passing an input state through the constraint network and compute auxiliary variables 44 | ''' 45 | assert self.is_initialized_constraint_prediction, 'Constraint prediction is not initialized: call init_constraint_prediction' 46 | feed_dict = {self.cnet.observation: np.expand_dims(state, axis=0)} 47 | cnet_outputs = [self.cnet.ineq_mat, self.cnet.ineq_vec, self.cnet.ineq_mat_params, self.cnet.ineq_vec_params, self.cnet.interior_point] 48 | ineq_outputs = self.cnet_session.run(cnet_outputs, feed_dict=feed_dict) 49 | self.ineq_mat, self.ineq_vec, self.ineq_mat_params, self.ineq_vec_params, self.ineq_interior_point = [_v[0] for _v in ineq_outputs] 50 | self.ineq_interior_point_flat = np.squeeze(self.ineq_interior_point) 51 | # Check constraint prediction validity 52 | if not (np.all(np.isfinite(self.ineq_mat)) and np.all(np.isfinite(self.ineq_vec))): 53 | error_str = 'Invalid inequality matrices: make sure you are using a recent version of Tensorflow' # In some versions, tf.cos and tf.sin can output infinity for large inputs 54 | print('Inequality parameters') 55 | print(self.ineq_mat_params) 56 | print(self.ineq_vec_params) 57 | print('Processed constraints') 58 | self.print_ineq() 59 | raise ValueError(error_str) 60 | -------------------------------------------------------------------------------- /ceres/tools/math/qpsolver_quadprog.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | import quadprog 7 | from .qpsolver import QPSolver 8 | 9 | class QPSolverQuadprog(QPSolver): 10 | ''' 11 | A class interfacing with the Quadprog QP solver 12 | ''' 13 | 14 | def __init__(self, n_var=None, verbose=False): 15 | super(QPSolverQuadprog, self).__init__(n_var=n_var, verbose=verbose) 16 | 17 | def update_solver_specific(self): 18 | self.obj_mat_quadprog = self.obj_mat 19 | self.obj_vec_quadprog = -np.squeeze(self.obj_vec, axis=1) 20 | self.obj_mat_quadprog = self.obj_mat_quadprog.astype(dtype=np.float64) 21 | self.obj_vec_quadprog = self.obj_vec_quadprog.astype(dtype=np.float64) 22 | has_eq_loc = self.eq_mat is not None 23 | has_ineq_loc = self.ineq_mat is not None 24 | if has_ineq_loc: 25 | if has_eq_loc: 26 | self.constraint_mat_quadprog = -np.vstack([self.eq_mat, self.ineq_mat]).transpose() 27 | self.constraint_vec_quadprog = -np.hstack([np.squeeze(self.eq_vec, axis=1), np.squeeze(self.ineq_vec, axis=1)]) 28 | else: 29 | self.constraint_mat_quadprog = -self.ineq_mat.transpose() 30 | self.constraint_vec_quadprog = -np.squeeze(self.ineq_vec, axis=1) 31 | else: 32 | if has_eq_loc: 33 | self.constraint_mat_quadprog = -self.eq_mat.transpose() 34 | self.constraint_vec_quadprog = -np.squeeze(self.eq_vec, axis=1) 35 | else: 36 | self.constraint_mat_quadprog = None 37 | self.constraint_vec_quadprog = None 38 | if has_eq_loc or has_ineq_loc: 39 | self.constraint_mat_quadprog = self.constraint_mat_quadprog.astype(dtype=np.float64) 40 | self.constraint_vec_quadprog = self.constraint_vec_quadprog.astype(dtype=np.float64) 41 | 42 | def solve(self): 43 | try: 44 | self.solver_out = quadprog.solve_qp(self.obj_mat_quadprog, self.obj_vec_quadprog, 45 | self.constraint_mat_quadprog, self.constraint_vec_quadprog, 46 | self.n_eq) 47 | self.optimum = self.solver_out[0] 48 | self.success = True 49 | except ValueError as e: 50 | print('WARNING: solver failed ({0})'.format(e)) 51 | self.optimum = np.zeros(self.n_var) 52 | self.success = False 53 | return self.optimum, self.success 54 | 55 | if __name__ == '__main__': 56 | ''' 57 | Implement example from cvxopt.org 58 | minimize 2 x1^2 + x2^2 + x1*x2 + x1 + x2 59 | subject to: 60 | x1 >= 0 61 | x2 >= 0 62 | x1 + x2 = 1 63 | ''' 64 | qp_solver = QPSolverQuadprog() 65 | Q = 2.*np.array([[2., 0.5], 66 | [0.5, 1.]]) 67 | p = np.array([[1.], 68 | [1.]]) 69 | G1 = np.array([[-1., 0.]]) 70 | h1 = np.array([[0.]]) 71 | G2 = np.array([[0., -1.]]) 72 | h2 = np.array([[0.]]) 73 | A = np.array([[1., 1.]]) 74 | b = np.array([[1.]]) 75 | qp_solver.add_obj(Q, p) 76 | qp_solver.add_eq(A, b) 77 | qp_solver.add_ineq(G1, h1) 78 | qp_solver.add_ineq(G2, h2) 79 | qp_solver.update() 80 | x_opt, success = qp_solver.solve() 81 | print(x_opt, success) 82 | 83 | if input('Enter debug mode? y/[N]\n').lower() == 'y': 84 | import ipdb; ipdb.set_trace() 85 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_network_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from .constraint_network import ConstraintNetwork 8 | from ceres.networks import NetworkSaverMLP 9 | 10 | class ConstraintNetworkMLP(ConstraintNetwork, NetworkSaverMLP): 11 | ''' 12 | Constraint network with MLP and save/restore functions 13 | ''' 14 | 15 | def __init__(self, observation_space, action_space, config): 16 | NetworkSaverMLP.__init__(self, network_id='cnet') 17 | ConstraintNetwork.__init__(self, observation_space, action_space, config) 18 | 19 | def build_model(self): 20 | return NetworkSaverMLP.build_model(self, self.observation, self.n_outputs, 21 | self.config.mlp_hidden_layers, 22 | self.initializer, 23 | self.activation_common) 24 | 25 | 26 | def play_cnet(): 27 | ''' 28 | Load a trained constrained network and print constraint predictions from random states 29 | ''' 30 | from ceres.tools import ExtraArgs 31 | from ceres.envs import ConstrainedEnv 32 | import gym 33 | extra_args = ExtraArgs(ignore_max_timesteps=True, ignore_max_iterations=True, ignore_max_episodes=True) 34 | assert len(extra_args.env_id) > 0, 'Required argument --env_id' 35 | env = gym.make(extra_args.env_id) 36 | assert isinstance(env.unwrapped, ConstrainedEnv), 'The chosen environment {0} does not support constraints'.format(extra_args.env_id) 37 | assert len(extra_args.trained_cnet) > 0, 'Required argument --trained_cnet' 38 | cnet_config = ConstraintConfig.from_backup(extra_args.trained_cnet) 39 | cnet = ConstraintNetworkMLP(env.observation_space, env.action_space, cnet_config) 40 | 41 | n_obs = env.observation_space.shape[0] 42 | def random_state(): 43 | pass 44 | 45 | cmd_str = '[r/Return]: random state, [q]: quit, otherwise input comma-separated state of length {0}\n'.format(n_obs) 46 | with tf.Session() as sess: 47 | def predict_constraints(state): 48 | observation = [state] 49 | ineq_mat, ineq_vec = sess.run([cnet.ineq_mat, cnet.ineq_vec], feed_dict={cnet.observation: observation}) 50 | ineq_mat = ineq_mat[0] 51 | ineq_vec = ineq_vec[0] 52 | return ineq_mat, ineq_vec 53 | 54 | def predict_and_print_constraints(state): 55 | print('Input state: {0}'.format(state)) 56 | ineq_mat, ineq_vec = predict_constraints(state) 57 | env.unwrapped.print_ineq(ineq_mat=ineq_mat, ineq_vec=ineq_vec) 58 | 59 | cnet.restore_model(extra_args.trained_cnet, session=sess) 60 | while True: 61 | cmd = input(cmd_str) 62 | if cmd == 'q': 63 | break 64 | elif (cmd == 'r') or (cmd == ''): 65 | state = np.random.rand(n_obs) 66 | predict_and_print_constraints(state) 67 | else: 68 | try: 69 | state = list(map(float, cmd.split(','))) 70 | assert len(state) == n_obs, 'input state {0} is of length {1}, expected {2}'.format(state, len(state), n_obs) 71 | predict_and_print_constraints(state) 72 | except Exception as e: 73 | print('Invalid command \'{0}\': {1}'.format(cmd, str(e))) 74 | 75 | 76 | if __name__ == '__main__': 77 | play_cnet() 78 | 79 | -------------------------------------------------------------------------------- /ceres/baselines/ppo1/pposgd_simple.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is the learn function from OpenAI's baselines.ppo1.pposgd_simple 3 | rewritten with individual functions in .pposgd_simple_helper 4 | OpenAI Baselines is licensed under the MIT License, see LICENSE 5 | ''' 6 | 7 | from baselines.common.mpi_moments import mpi_moments 8 | 
from baselines.ppo1.pposgd_simple import traj_segment_generator 9 | from baselines import logger 10 | import baselines.common.tf_util as U 11 | import tensorflow as tf, numpy as np 12 | from mpi4py import MPI 13 | from .pposgd_simple_helper import build_policy_training_vars, build_counters, adjust_policy_learning_rate, update_policy, log_iter_info, calc_end_training 14 | 15 | def learn(env, policy_fn, *, 16 | timesteps_per_actorbatch, # timesteps per actor per update 17 | clip_param, entcoeff, # clipping parameter epsilon, entropy coeff 18 | optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers 19 | gamma, lam, # advantage estimation 20 | max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint 21 | callback=None, # you can do anything in the callback, since it takes locals(), globals() 22 | adam_epsilon=1e-5, 23 | schedule='constant' # annealing for stepsize parameters (epsilon and adam) 24 | ): 25 | # Setup losses and stuff 26 | # ---------------------------------------- 27 | 28 | ob_space = env.observation_space 29 | ac_space = env.action_space 30 | pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy 31 | oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy 32 | 33 | loss_names, var_list, lossandgrad, adam, assign_old_eq_new, compute_losses = build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon) 34 | mpi_moments_fn = lambda losses: mpi_moments(losses, axis=0) 35 | allgather_fn = MPI.COMM_WORLD.allgather 36 | 37 | U.initialize() 38 | adam.sync() 39 | 40 | # Prepare for rollouts 41 | # ---------------------------------------- 42 | seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) 43 | 44 | iters_so_far, episodes_so_far, timesteps_so_far, tstart, lenbuffer, rewbuffer = build_counters() 45 | 46 | assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" 47 | 48 | while True: 49 | if callback: callback(locals(), globals()) 50 | 51 | if calc_end_training(max_timesteps, timesteps_so_far, 52 | max_episodes, episodes_so_far, 53 | max_iters, iters_so_far, 54 | max_seconds, tstart): 55 | break 56 | 57 | logger.log("********** Iteration %i ************"%iters_so_far) 58 | 59 | seg = seg_gen.__next__() 60 | 61 | cur_lrmult = adjust_policy_learning_rate(schedule, max_timesteps, timesteps_so_far, max_episodes, episodes_so_far, max_iters, iters_so_far) 62 | vpredbefore, tdlamret, optim_batchsize = update_policy(pi, seg, gamma, lam, 63 | logger, optim_epochs, optim_batchsize, optim_stepsize, cur_lrmult, 64 | loss_names, lossandgrad, adam, assign_old_eq_new, compute_losses, 65 | mpi_moments_fn, allgather_fn) 66 | 67 | episodes_so_far, timesteps_so_far = log_iter_info(lenbuffer, rewbuffer, tstart, 68 | vpredbefore, tdlamret, seg, 69 | episodes_so_far, timesteps_so_far, 70 | MPI.COMM_WORLD.Get_rank()==0) 71 | iters_so_far += 1 72 | 73 | return pi 74 | -------------------------------------------------------------------------------- /ceres/tools/math/spherical_coordinates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | class SphericalCoordinates(object): 9 | ''' 10 | Implement N-dimensional coordinates in Numpy and Tensorflow 11 | https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates 12 | ''' 13 | 14 | def __init__(self, dim, input_angles=None): 15 | assert dim >= 2 16 | self.dim = dim 17 | self.n_angles = dim - 1 18 | if input_angles is None: 19 | self.input_angles = tf.placeholder(tf.float32, shape=(None, self.n_angles)) 20 | else: 21 | self.input_angles = input_angles 22 | self.init_angles_to_unit_vec() 23 | 24 | def spherical_to_cartesian(self, angles, radius=1): 25 | assert angles.shape[-1] == self.n_angles 26 | shape_angles_in = angles.shape 27 | angles_batch = np.reshape(angles, [-1, self.n_angles]) 28 | vec_batch = [] 29 | for angles in angles_batch: 30 | previous = radius 31 | vec = [] 32 | for angle in angles[:-1]: 33 | coord = previous*np.cos(angle) 34 | vec.append(coord) 35 | previous *= np.sin(angle) 36 | angle = angles[-1] 37 | vec.append(previous*np.cos(angle)) 38 | vec.append(previous*np.sin(angle)) 39 | vec_batch.append(vec) 40 | shape_vec_out = list(shape_angles_in) 41 | shape_vec_out[-1] += 1 42 | vec_batch = np.array(vec_batch) 43 | vec_batch = np.reshape(vec_batch, shape_vec_out) 44 | return vec_batch 45 | 46 | def init_angles_to_unit_vec(self, radius=None): 47 | angles = self.input_angles 48 | shape_angles_in = tf.shape(angles) 49 | angles = tf.reshape(angles, [-1, shape_angles_in[-1]]) 50 | angles_cos = tf.cos(angles) 51 | angles_sin = tf.sin(angles) 52 | vec = [] 53 | previous = 1. 54 | for i_angle in range(self.n_angles-1): 55 | axis_cos = tf.slice(angles_cos, [0, i_angle], [-1, 1]) 56 | axis_sin = tf.slice(angles_sin, [0, i_angle], [-1, 1]) 57 | coord = tf.multiply(previous, axis_cos) 58 | previous = tf.multiply(previous, axis_sin) 59 | vec.append(coord) 60 | i_angle = self.n_angles-1 61 | axis_cos = tf.slice(angles_cos, [0, i_angle], [-1, 1]) 62 | axis_sin = tf.slice(angles_sin, [0, i_angle], [-1, 1]) 63 | vec.append(tf.multiply(previous, axis_cos)) 64 | vec.append(tf.multiply(previous, axis_sin)) 65 | vec = tf.concat(vec, axis=1) 66 | if radius is not None: 67 | radius = tf.reshape(radius, [-1, shape_angles_in[-1]]) 68 | vec = tf.multiply(radius, vec) 69 | shape_vec_last = tf.constant([self.dim], dtype=shape_angles_in.dtype) 70 | shape_vec_out = tf.concat([shape_angles_in[:-1], shape_vec_last], axis=0) 71 | vec = tf.reshape(vec, shape_vec_out) 72 | self.output_unit_vec = vec 73 | 74 | def main(): 75 | while True: 76 | input_str = input('Input list of angles, in degrees, comma-separated\n') 77 | if len(input_str) == 0: 78 | #angles_deg = np.array([[0.0], [90.]]) 79 | angles_deg = np.array([ 80 | [[0.], 81 | [45.]], 82 | [[90.], 83 | [135.]] 84 | ]) 85 | print('Use default example: {0}'.format(angles_deg)) 86 | else: 87 | angles_deg = np.array([float(e) for e in input_str.split(',')]) 88 | n_angles = angles_deg.shape[-1] 89 | dim = n_angles + 1 90 | angles_rad = np.radians(angles_deg) 91 | print('Degrees: {0}'.format(angles_deg)) 92 | print('Radians: {0}'.format(angles_rad)) 93 | sc = SphericalCoordinates(dim) 94 | vec_np = sc.spherical_to_cartesian(angles_rad) 95 | print('Unit vector') 96 | print(' Numpy: {0}'.format(vec_np)) 97 | with tf.Session() as sess: 98 | angles_rad_reshaped = np.reshape(angles_rad, [-1, n_angles]) 99 | vec_tf = 
sess.run([sc.output_unit_vec], feed_dict={sc.input_angles: angles_rad_reshaped}) 100 | print(' Tensorflow: {0}'.format(vec_tf)) 101 | 102 | if __name__ == '__main__': 103 | main() 104 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ### Welcome 2 | 3 | We welcome contributions to the Constrained Exploration and Recovery from Experience Shaping Project in many forms, and there's always plenty to do! 4 | 5 | First things first, please review the Constrained Exploration and Recovery from Experience Shaping Project's [Code of Conduct](CONDUCT.md) before participating. It is important that we keep things civil. 6 | 7 | ### Reporting bugs 8 | If you are a user and you find a bug, please submit an [issue](https://github.com/IBM/constrained-rl/issues). Please try to provide sufficient information for someone else to reproduce the issue. One of the project's maintainers should respond to your issue within 24 hours. If not, please bump the issue and request that it be reviewed. 9 | 10 | ### Fixing issues and working stories 11 | Review the [issues list](https://github.com/IBM/constrained-rl/issues) and find something that interests you. You could also check the ["help wanted"](https://github.com/IBM/constrained-rl/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) list. It is wise to start with something relatively straight forward and achievable. Usually there will be a comment in the issue that indicates whether someone has already self-assigned the issue. If no one has already taken it, then add a comment assigning the issue to yourself, eg.: ```I'll work on this issue.```. Please be considerate and rescind the offer in comments if you cannot finish in a reasonable time, or add a comment saying that you are still actively working the issue if you need a little more time. 12 | 13 | We are using the [GitHub Flow](https://guides.github.com/introduction/flow/) process to manage code contributions. If you are unfamiliar, please review that link before proceeding. 14 | 15 | To work on something, whether a new feature or a bugfix: 16 | 1. Create a [fork](https://help.github.com/articles/fork-a-repo/) (if you haven't already) 17 | 18 | 2. Clone it locally 19 | ``` 20 | git clone https://github.com/yourid/constrained-rl.git 21 | ``` 22 | 3. Add the upstream repository as a remote 23 | ``` 24 | git remote add upstream https://github.com/IBM/constrained-rl.git 25 | ``` 26 | 4. Create a branch 27 | 28 | Create a descriptively-named branch off of your cloned fork ([more detail here](https://help.github.com/articles/syncing-a-fork/)) 29 | ``` 30 | cd constrained-rl 31 | git checkout -b issue-nnnn 32 | ``` 33 | 5. Commit your code 34 | 35 | Commit to that branch locally, and regularly push your work to the same branch on the server. 36 | 37 | 6. Commit messages 38 | 39 | Commit messages must have a short description no longer than 50 characters followed by a blank line and a longer, more descriptive message that includes reference to issue(s) being addressed so that they will be automatically closed on a merge e.g. ```Closes #1234``` or ```Fixes #1234```. 40 | 41 | 7. Pull Request (PR) 42 | 43 | When you need feedback or help, or you think the branch is ready for merging, open a pull request (make sure you have first successfully built and tested your changes. 
44 | 45 | _Note: if your PR does not merge cleanly, use ```git rebase master``` in your feature branch to update your pull request rather than using ```git merge master```_. 46 | 47 | 8. Did we mention tests? All code changes should be accompanied by new or modified tests. 48 | 49 | 9. Any code changes that affect documentation should be accompanied by corresponding changes (or additions) to the documentation and tests. This will ensure that if the merged PR is reversed, all traces of the change will be reversed as well. 50 | 51 | After your Pull Request (PR) has been reviewed and signed off, a maintainer will merge it into the master branch. 52 | 53 | ## Coding guidelines 54 | 55 | ### Becoming a maintainer 56 | Projects or sub-projects will be lead by a set of maintainers. New projects can designate an initial set of maintainers that will be approved by the Technical Steering Committee when the project is first approved. The project's maintainers will, from time-to-time, consider adding a new maintainer. An existing maintainer will post a pull request to the [MAINTAINERS.txt](MAINTAINERS.txt) file. If a majority of the maintainers concur in the comments, the pull request is then merged and the individual becomes a maintainer. 57 | 58 | ### Legal stuff 59 | We have tried to make it as easy as possible to make contributions. This applies to how we handle the legal aspects of contribution. We use the same approach—the [Developer's Certificate of Origin 1.1 (DCO)](DCO1.1.txt)—that the Linux® Kernel [community](http://elinux.org/Developer_Certificate_Of_Origin) uses to manage code contributions. 60 | We simply ask that when submitting a pull request, the developer must include a sign-off statement in the pull request description. 61 | 62 | Here is an example Signed-off-by line, which indicates that the submitter accepts the DCO: 63 | 64 | ``` 65 | Signed-off-by: John Doe 66 | ``` 67 | -------------------------------------------------------------------------------- /ceres/scripts/play_policy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import sys 6 | import os 7 | import numpy as np 8 | import time 9 | 10 | from ceres.tools import ExtraArgs 11 | from baselines.common.cmd_util import make_mujoco_env 12 | import baselines.common.tf_util as U 13 | from ceres.envs import CeresEnv 14 | from ceres.baselines.ceres.pposgd_ceres import build_policy_observation_filter 15 | 16 | class DummyPolicy(object): 17 | ''' 18 | A dummy policy that outputs either zero or random actions in the size expected by the environment 19 | ''' 20 | 21 | def __init__(self, name, ob_space, ac_space): 22 | self.name = name 23 | self.ob_space = ob_space 24 | self.ac_space = ac_space 25 | 26 | self.ac_zero = np.zeros(self.ac_space.shape) 27 | self.vpred_zero = 0. 
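    # act() mirrors the (action, value-prediction) pair returned by the baselines MLP
    # policies, so main() below can transparently substitute DummyPolicy when no
    # --trained_policy checkpoint is supplied.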
28 | 29 | def act(self, stochastic, ob): 30 | if stochastic: 31 | return self.ac_space.sample(), self.vpred_zero 32 | else: 33 | return self.ac_zero, self.vpred_zero 34 | 35 | def main(): 36 | ''' 37 | Load and play trained policy 38 | ''' 39 | log_root = os.path.join(os.getcwd(), 'logs') 40 | extra_args = ExtraArgs(log_root=log_root) 41 | 42 | env = make_mujoco_env(extra_args.env_id, extra_args.seed) 43 | 44 | if isinstance(env.unwrapped, CeresEnv) and (len(extra_args.trained_cnet) > 0): 45 | env.unwrapped.init_ceres() 46 | env.unwrapped.init_constraint_prediction(extra_args.trained_cnet) 47 | 48 | episode_lengths = np.zeros(extra_args.max_episodes) 49 | episode_rewards = np.zeros(extra_args.max_episodes) 50 | ob = env.reset() 51 | 52 | do_save_render = extra_args.render and len(extra_args.save_render) > 0 53 | if do_save_render: 54 | os.makedirs(extra_args.save_render, exist_ok=True) 55 | 56 | def save_render(i_step, max_step=300, verbose=True): 57 | n_digits = len(str(max_step)) 58 | do_save_step = (max_step <= 0) or (i_step <= max_step) 59 | if do_save_render and do_save_step: 60 | path_save = os.path.join(extra_args.save_render, str(i_step).zfill(n_digits) + '.png') 61 | env.unwrapped.save_render(path_save, verbose=verbose) 62 | 63 | ob_space = env.unwrapped.observation_space 64 | ac_space = env.unwrapped.action_space 65 | ob_space, policy_observation_filter= build_policy_observation_filter(extra_args, ob_space) 66 | 67 | env.unwrapped.set_ineq_margin(extra_args.conservative_exploration) 68 | 69 | if len(extra_args.trained_policy) > 0: 70 | assert os.path.exists(extra_args.trained_policy), 'Invalid path to model: \'{0}\''.format(extra_args.trained_policy) 71 | from ceres.baselines.ceres.mlp_policy_saver import MlpPolicySaver 72 | from baselines.common import tf_util as U 73 | sess = U.single_threaded_session() 74 | sess.__enter__() 75 | 76 | def policy_fn(name, ob_space, ac_space): 77 | return MlpPolicySaver(name, ob_space=ob_space, ac_space=ac_space, 78 | hid_size=extra_args.policy_hidden_size, num_hid_layers=extra_args.policy_hidden_layers) 79 | pi = policy_fn('pi', ob_space, ac_space) 80 | 81 | U.initialize() 82 | pi.restore_model(extra_args.trained_policy, session=sess) 83 | else: 84 | print('Invalid model path \'{0}\', use dummy agent'.format(extra_args.trained_policy)) 85 | pi = DummyPolicy('pi', ob_space, ac_space) 86 | 87 | time_total = 0. 88 | n_steps_global = -1 89 | for i_episode in range(extra_args.max_episodes): 90 | print('Episode {0}'.format(i_episode)) 91 | time_episode_begin = time.time() 92 | ob = policy_observation_filter(ob) 93 | n_steps_global += 1 94 | if extra_args.render: 95 | env.render() 96 | save_render(n_steps_global) 97 | done = False 98 | ep_rew = 0. 
99 | i_step = 0 100 | time.sleep(extra_args.play_step_duration) 101 | 102 | while not done: 103 | action, vpred = pi.act(True, ob) 104 | ob, rew, done, info = env.step(action) 105 | ob = policy_observation_filter(ob) 106 | ep_rew += rew 107 | i_step += 1 108 | n_steps_global += 1 109 | if extra_args.render: 110 | env.render() 111 | save_render(n_steps_global) 112 | time.sleep(extra_args.play_step_duration) 113 | episode_lengths[i_episode] = i_step 114 | episode_rewards[i_episode] = ep_rew 115 | time_episode = time.time() - time_episode_begin 116 | time_total += time_episode 117 | print(' Episode length: {0} (average {1:.1f}), episode reward {2:.1f} (average {5:.1f}), duration {3:.1f} ms (average {4:.1f})'.format(i_step, np.average(episode_lengths[:i_episode+1]), ep_rew, 1000.*time_episode, 1000.*time_total/(i_episode+1), np.average(episode_rewards[:i_episode+1]))) 118 | ob = env.reset() 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /ceres/tools/math/qpsolver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | 7 | class QPSolver(object): 8 | ''' 9 | A base class to interface with QP solvers 10 | ''' 11 | 12 | def __init__(self, n_var=None, verbose=False): 13 | self.n_var = n_var 14 | self.verbose = verbose 15 | self.reset() 16 | 17 | def reset(self, do_reset_obj=True, do_reset_eq=True, do_reset_ineq=True): 18 | if do_reset_obj: 19 | self.reset_obj() 20 | if do_reset_eq: 21 | self.reset_eq() 22 | if do_reset_ineq: 23 | self.reset_ineq() 24 | 25 | def update(self): 26 | self.build_obj() 27 | self.build_eq() 28 | self.build_ineq() 29 | self.update_solver_specific() 30 | 31 | def reset_eq(self): 32 | self.eq_mat_list = [] 33 | self.eq_vec_list = [] 34 | self.eq_mat = None 35 | self.eq_vec = None 36 | self.n_eq = 0 37 | self.reset_eq_solver_specific() 38 | 39 | def reset_ineq(self): 40 | self.ineq_mat_list = [] 41 | self.ineq_vec_list = [] 42 | self.ineq_mat = None 43 | self.ineq_vec = None 44 | self.n_ineq = 0 45 | self.reset_ineq_solver_specific() 46 | 47 | def reset_obj(self): 48 | self.obj_mat_list = [] 49 | self.obj_vec_list = [] 50 | self.obj_mat = None 51 | self.obj_vec = None 52 | self.n_obj = 0 53 | self.reset_obj_solver_specific() 54 | 55 | def check_mat_vec(self, mat, vec): 56 | ''' 57 | Ensure that mat and vec are numpy arrays and of appropriate dimensions 58 | ''' 59 | mat = np.array(mat) 60 | vec = np.array(vec) 61 | if self.n_var is None: 62 | self.n_var = mat.shape[1] 63 | else: 64 | assert mat.shape[1] == self.n_var, 'Invalid constraint matrix size {0} for {1} variables'.format(mat.shape, self.n_var) 65 | assert mat.ndim == 2, 'Invalid constraint matrix dimensions: expected 2, got {0}'.format(mat.ndim) 66 | assert vec.ndim == 2, 'Invalid constraint vector dimensions: expected 2, got {0}'.format(vec.ndim) 67 | assert mat.shape[0] == vec.shape[0], 'Inconsistent constraint matrix and vector sizes' 68 | assert vec.shape[1] == 1, 'Invalid constraint vector size {0}, should have one column'.format(mat.shape) 69 | return mat, vec 70 | 71 | def add_obj(self, mat, vec, build=False): 72 | mat, vec = self.check_mat_vec(mat, vec) 73 | assert mat.shape[0] == mat.shape[1], 'Invalid objective matrix shape {0}, should be 
square'.format(mat.shape) 74 | self.obj_mat_list.append(mat) 75 | self.obj_vec_list.append(vec) 76 | if build: 77 | self.build_obj() 78 | 79 | def build_obj(self): 80 | self.n_obj = len(self.obj_mat_list) 81 | assert self.n_obj > 0 82 | self.obj_mat = sum(self.obj_mat_list) 83 | self.obj_vec = sum(self.obj_vec_list) 84 | self.build_obj_solver_specific() 85 | 86 | def add_eq(self, mat, vec, build=False): 87 | mat, vec = self.check_mat_vec(mat, vec) 88 | self.eq_mat_list.append(mat) 89 | self.eq_vec_list.append(vec) 90 | if build: 91 | self.build_eq() 92 | 93 | def build_eq(self): 94 | if len(self.eq_mat_list) > 0: 95 | self.eq_mat = np.concatenate(self.eq_mat_list, axis=0) 96 | self.eq_vec = np.concatenate(self.eq_vec_list, axis=0) 97 | self.n_eq = self.eq_mat.shape[0] 98 | else: 99 | self.eq_mat = None 100 | self.eq_vec = None 101 | self.n_eq = 0 102 | self.build_eq_solver_specific() 103 | 104 | def add_ineq(self, mat, vec, build=False): 105 | if (mat is None) or (vec is None): 106 | assert (mat is None) and (vec is None), 'Constraint incomplete: mat={0}, vec={1}'.format(mat, vec) 107 | return 108 | mat, vec = self.check_mat_vec(mat, vec) 109 | n_ineq_loc = mat.shape[0] 110 | if n_ineq_loc > 0: 111 | self.ineq_mat_list.append(mat) 112 | self.ineq_vec_list.append(vec) 113 | if build: 114 | self.build_ineq() 115 | 116 | def build_ineq(self): 117 | if len(self.ineq_mat_list) > 0: 118 | self.ineq_mat = np.concatenate(self.ineq_mat_list, axis=0) 119 | self.ineq_vec = np.concatenate(self.ineq_vec_list, axis=0) 120 | self.n_ineq = self.ineq_mat.shape[0] 121 | else: 122 | self.ineq_mat = None 123 | self.ineq_vec = None 124 | self.n_ineq = 0 125 | self.build_ineq_solver_specific() 126 | 127 | def reset_obj_solver_specific(self): 128 | pass 129 | 130 | def reset_eq_solver_specific(self): 131 | pass 132 | 133 | def reset_ineq_solver_specific(self): 134 | pass 135 | 136 | def build_obj_solver_specific(self): 137 | pass 138 | 139 | def build_eq_solver_specific(self): 140 | pass 141 | 142 | def build_ineq_solver_specific(self): 143 | pass 144 | 145 | def update_solver_specific(self): 146 | pass 147 | 148 | def solve(self): 149 | raise NotImplementedError() 150 | 151 | 152 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
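# Loss naming scheme: a command-line flag such as "--cnet_loss pvm:1." (see
# examples/01-constraints-from-demonstrations.md) corresponds to an entry {'pvm': 1.0}
# in config.loss_weights. init_losses() below turns each non-zero entry into a call to
# the matching method _loss_<name> via getattr, and divides every term by the batch
# size except those listed in no_normalization (currently only the L2 weight penalty).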
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | 7 | class ConstraintLoss(object): 8 | ''' 9 | Define constraint network loss terms 10 | ''' 11 | 12 | no_normalization = ['_loss_l2'] # add loss functions that do not require division by batch size 13 | 14 | def __init__(self, network): 15 | ''' 16 | Loss terms are defined from constraint network variables 17 | ''' 18 | self.network = network 19 | self.init_losses() 20 | self.init_total_loss() 21 | 22 | def init_losses(self): 23 | ''' 24 | Initialize individual losses from a dictionary of loss names and weights, 25 | e.g., loss_weights = {'l2': 0.0001, 'positive_violation_max': 1.0, 'negative_satisfaction_min': 1.0} 26 | will call loss functions '_loss_l2', '_loss_positive_violation_max' and '_loss_negative_satisfaction_min' 27 | ''' 28 | self.losses = {} 29 | for loss_name, loss_weight in self.network.config.loss_weights.items(): 30 | if loss_weight != 0.: 31 | loss_func_name = '_loss_{0}'.format(loss_name) 32 | assert hasattr(self, loss_func_name), 'Undefined loss function {0}'.format(loss_func_name) 33 | loss = getattr(self, loss_func_name)() 34 | if not loss_func_name in self.no_normalization: 35 | loss = loss / self.network.batch_size_float 36 | self.losses[loss_name] = loss_weight * loss 37 | 38 | def init_total_loss(self): 39 | ''' 40 | Sum up individual loss terms if available, otherwise zero 41 | ''' 42 | loss_list = [v for k, v in self.losses.items()] 43 | if len(loss_list) == 0: 44 | print('Warning: no CNet loss defined') 45 | self.total_loss = 0. 46 | else: 47 | self.total_loss = tf.add_n(loss_list) 48 | 49 | def _loss_l2(self): 50 | ''' 51 | L2 norm of the neural network weights 52 | ''' 53 | assert len(self.network.model_weights) > 0 54 | loss = tf.add_n([tf.nn.l2_loss(w) for _k, w in self.network.model_weights.items()]) 55 | return loss 56 | 57 | def _loss_positive_violation_max(self, order=1): 58 | ''' 59 | Maximum violation margin for positive demonstrations, supports squaring 60 | ''' 61 | loss = self.network.ineq_violation_margin 62 | loss = tf.reduce_max(loss, axis=1) 63 | if order == 2: 64 | loss = tf.square(loss) 65 | loss = tf.multiply(self.network.is_positive, loss) 66 | loss = tf.reduce_sum(loss) 67 | return loss 68 | 69 | def _loss_pvm(self, order=1): 70 | ''' 71 | Shortname for positive violation max 72 | ''' 73 | pvm_loss = self._loss_positive_violation_max(order=order) 74 | return pvm_loss 75 | 76 | def _loss_pvm_1d(self): 77 | ''' 78 | Positive violation max, 1st order 79 | ''' 80 | return self._loss_pvm(order=1) 81 | 82 | def _loss_pvm_2d(self): 83 | ''' 84 | Positive violation max, squared 85 | ''' 86 | return self._loss_pvm(order=2) 87 | 88 | def _loss_positive_violation_norm(self, order=1): 89 | ''' 90 | Since we're seeking to zero all violation margins, we can minimize the total norm (L1 or L2) 91 | ''' 92 | loss = self.network.ineq_violation_margin 93 | if order == 2: 94 | loss = tf.square(loss) 95 | else: 96 | assert order == 1, 'Only order 1 and 2 supported' 97 | loss = tf.reduce_sum(loss, axis=1) 98 | loss = tf.multiply(self.network.is_positive, loss) 99 | loss = tf.reduce_sum(loss) 100 | return loss 101 | 102 | def _loss_pvn(self, order=1): 103 | ''' 104 | Shortname for positive violation norm 105 | ''' 106 | pvn_loss = self._loss_positive_violation_norm(order=order) 107 | return pvn_loss 108 | 109 | def _loss_pvn_1d(self): 110 | ''' 111 | Positive violation norm, L1 
norm 112 | ''' 113 | return self._loss_pvn(order=1) 114 | 115 | def _loss_pvn_2d(self): 116 | ''' 117 | Positive violation norm, L2 norm 118 | ''' 119 | return self._loss_pvn(order=2) 120 | 121 | def _loss_negative_satisfaction_min(self, order=1): 122 | ''' 123 | Minimum satisfaction margin for negative demonstrations, supports squaring 124 | ''' 125 | loss = self.network.ineq_satisfaction_margin 126 | loss = tf.reduce_min(loss, axis=1) 127 | if order == 2: 128 | loss = tf.square(loss) 129 | loss = tf.multiply(self.network.is_negative, loss) 130 | loss = tf.reduce_sum(loss) 131 | return loss 132 | 133 | def _loss_nsm(self, order=1): 134 | ''' 135 | Shortname for negative satisfaction min 136 | ''' 137 | nsm_loss = self._loss_negative_satisfaction_min(order=order) 138 | return nsm_loss 139 | 140 | def _loss_nsm_1d(self): 141 | ''' 142 | Negative satisfaction min, 1st order 143 | ''' 144 | return self._loss_nsm(order=1) 145 | 146 | def _loss_nsm_2d(self): 147 | ''' 148 | Negative satisfaction min, squared 149 | ''' 150 | return self._loss_nsm(order=2) 151 | -------------------------------------------------------------------------------- /examples/01-constraints-from-demonstrations.md: -------------------------------------------------------------------------------- 1 | # Guiding exploration with constraints from demonstrations 2 | 3 | We consider an environment in which an agent navigates through a fixed maze to reach a target. 4 | The starting position of the agent and the position of the target are randomized at each episode. 5 | 6 | Visualize the environment with a random policy: 7 | ``` 8 | python3 -m ceres.scripts.play_policy --env_id Nav2dPosFixedMazeCeres-v0 \ 9 | --max_episodes 1000 --render 10 | ``` 11 | 12 | 13 | Arguments: 14 | * ```--env_id Nav2dPosFixMazeCeres-v0```: environment name (can load environments from modules other than ```ceres``` with the extended argument format ```:```) 15 | * ```--max_episodes 1000```: play a random policy for 1000 episodes 16 | * ```--render```: (optional) render to screen 17 | 18 | ## Learning constraints from demonstrations 19 | 20 | 500 positive trajectories with negative demonstrations were collected in ```data/Nav2dFixedMaze-500T.npz``` 21 | 22 | Visualize the demonstrations with: 23 | ``` 24 | python3 -m ceres.constraints.constraint_demonstration --env_id Nav2dPosFixedMazeCeres-v0 \ 25 | --constraint_demonstration_buffer data/Nav2dFixedMaze-500T.npz --render 26 | ``` 27 | Possible commands to replay demonstrations within the environment are described in the terminal. 28 | 29 | Train a constraint network using the ground truth data with: 30 | ``` 31 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosFixedMazeCeres-v0 \ 32 | --constraint_demonstration_buffer data/Nav2dFixedMaze-500T.npz \ 33 | --cnet_n_ineq 2 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 34 | --cnet_spherical_coordinates --cnet_predict_interior_point \ 35 | --cnet_training_epochs 1000 --cnet_decay_epochs 10 --early_stop_positive 0.99 --early_stop_negative 0.99 \ 36 | --max_iter 1 --only_train_constraints --output maze_cnet 37 | ``` 38 | 39 | Arguments: 40 | * ```--constraint_demonstration_buffer data/Nav2dFixedMaze-500T.npz```: use existing demonstration buffer 41 | * ```--cnet_n_ineq 2```: 2 inequality constraints 42 | * ```--cnet_loss pvm:1. --cnet_loss nsm:1. 
--cnet_loss l2:1e-6```: loss weights for positive violation max, negative satisfaction min, L2 regularization 43 | * ```--cnet_spherical_coordinates```: predict unit-norm constraints using spherical coordinates (alternatively, use ```--cnet_normalize_ineq_mat``` for post-normalization) 44 | * ```--cnet_predict_interior_point```: predict constraints such that there exists an interior point that satisfies them all 45 | * ```--cnet_training_epochs 1000```: train the constraint network over 1000 epochs 46 | * ```--cnet_decay_epochs 10```: halve constraint network learning rate every 10 epochs without loss reduction 47 | * ```--early_stop_positive 0.99 --early_stop_negative 0.99```: interrupt constraint network training if it reaches 99% separation accuracy 48 | * ```--max_iter 1```: run one iteration of CERES training 49 | * ```--only_train_constraints```: only train constraint network within CERES, not policy 50 | * ```--output maze_cnet```: save logs in ```logs/maze_cnet```. If the directory already exists, remove it manually or run the script with ```--overwrite``` 51 | 52 | The trained constraint network will be saved in ```logs/maze_cnet/worker_0_direct/constraints```. 53 | 54 | Visualize the constraints with a random policy: 55 | ``` 56 | python3 -m ceres.scripts.play_policy --env_id Nav2dPosFixedMazeCeres-v0 \ 57 | --trained_cnet logs/maze_cnet/worker_0_direct/constraints --max_episodes 1000 --render 58 | ``` 59 | 60 | 61 | ## Baseline PPO 62 | 63 | Within CERES, disabling constraints and setting the number of recovery policies to zero amounts to training with PPO: 64 | ``` 65 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosFixedMazeCeres-v0 \ 66 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 67 | --max_iter 5000 --output maze_ppo_unconstrained 68 | ``` 69 | Arguments: 70 | * ```--only_train_policy```: only train policy, not constraints 71 | * ```--constant_constraint_activation 0.```: set the constraint activation probability to zero throughout training 72 | * ```--n_recovery 0```: do not train recovery agents 73 | * ```--max_iter 5000```: do reinforcement learning for 5000 iterations 74 | * Optionally, run with ```--render``` to visualize exploration and constraints. 75 | 76 | The trained policy will be saved in ```logs/maze_ppo_unconstrained/worker_0_direct/policy``` 77 | 78 | ## Applying constraints to guide PPO 79 | 80 | Apply the trained constraint network to restrict the exploration range: 81 | ``` 82 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosFixedMazeCeres-v0 \ 83 | --only_train_policy --constant_constraint_activation 1. 
--n_recovery 0 \ 84 | --trained_cnet logs/maze_cnet/worker_0_direct/constraints \ 85 | --max_iter 5000 --output maze_ppo_constrained 86 | ``` 87 | Arguments: 88 | * ```--constant_constraint_activation 1.```: always enable constraints 89 | * ```--trained_cnet logs/maze_cnet/worker_0_direct/constraints```: use the constraint network trained previously 90 | 91 | The trained policy will be saved in ```logs/maze_ppo_constrained/worker_0_direct/policy``` 92 | 93 | ## Compare rewards with and without constraints 94 | 95 | Plot the rewards during training: 96 | ``` 97 | python3 -m ceres.scripts.plot_rewards \ 98 | --plot_path "Unconstrained PPO=logs/maze_ppo_unconstrained/worker_0_direct" \ 99 | --plot_path "Constrained PPO=logs/maze_ppo_constrained/worker_0_direct" 100 | ``` 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /ceres/envs/ceres_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from .resetter import ResetterEnvCeres 6 | from .constrained import ConstrainedEnvNetwork 7 | import numpy as np 8 | import gym 9 | 10 | class CeresEnv(ResetterEnvCeres, ConstrainedEnvNetwork): 11 | ''' 12 | Base class for CERES-compatible environments, with support for snapshotting and constraint prediction 13 | ''' 14 | 15 | # ResetterEnv parameters 16 | max_reference_trajectories = 1024 # -1 to load all available episodes 17 | 18 | ### Solver parameters 19 | return_zero_if_opt_fails = False 20 | has_ineq = True 21 | has_eq = False 22 | 23 | # Recovery parameters 24 | recovery_reward_alive = 1. 25 | info_key_constrained_action = 'constrained_action' 26 | 27 | is_ceres_initialized = False # by default, use base environment (no special reset or constraints) 28 | 29 | def init_ceres(self, is_recovery_mode=False): 30 | ''' 31 | Setup CERES-specific behavior. 32 | If this function is not called, function as the base environment, without constraints or recovery. 33 | ''' 34 | if not self.is_ceres_initialized: # avoid double initialization through main class super 35 | self.check_base_attributes() 36 | self.init_overloading() 37 | self.init_recovery() 38 | ResetterEnvCeres.__init__(self) 39 | ConstrainedEnvNetwork.__init__(self) 40 | self.is_ceres_initialized = True 41 | self.is_recovery_mode = is_recovery_mode 42 | assert self.max_reference_trajectories > 0 43 | self.enable_constraints = True 44 | self.set_constraint_activation_probability(1.) 45 | 46 | def set_constraint_activation_probability(self, val): 47 | self.constraint_activation_probability = val 48 | 49 | def check_base_attributes(self): 50 | # Old gym environments use _reset instead of reset directly 51 | if hasattr(self, 'reset'): 52 | self.reset_function_name = 'reset' 53 | else: 54 | assert hasattr(self, '_reset'), 'Could find neither \'reset\' nor \'_reset\' base function.' 55 | self.reset_function_name = '_reset' 56 | # Old gym environments use _step instead of step directly 57 | if hasattr(self, 'step'): 58 | self.step_function_name = 'step' 59 | else: 60 | assert hasattr(self, '_step'), 'Could find neither \'step\' nor \'_step\' base function.' 
61 | self.step_function_name = '_step' 62 | # The base environment also needs to be able to calculate snapshots and reset to given snapshots 63 | assert hasattr(self, 'calc_snapshot'), 'Could not find base calc_snapshot function' 64 | assert hasattr(self, 'reset_and_restore'), 'Could not find base reset_and_restore function' 65 | 66 | def init_overloading(self): 67 | ''' 68 | Replace base environment reset and step with CERES-specific functions 69 | ''' 70 | self.init_overloading_reset() 71 | self.init_overloading_step() 72 | 73 | def init_overloading_reset(self): 74 | self.reset_base = getattr(self, self.reset_function_name) 75 | setattr(self, self.reset_function_name, self.reset_ceres) 76 | 77 | def init_overloading_step(self): 78 | self.step_base = getattr(self, self.step_function_name) 79 | setattr(self, self.step_function_name, self.step_ceres) 80 | 81 | def step_ceres(self, action_raw): 82 | ''' 83 | Depending on the constraint activation probability, correct the input action, play the corrected action and update constraints. 84 | For recovery, change the reward and end condition. 85 | ''' 86 | do_enable_constraints_this_step = (self.constraint_activation_probability == 1.) or (np.random.rand() < self.constraint_activation_probability) 87 | if self.enable_constraints and do_enable_constraints_this_step: 88 | action_constrained, success, viol = self.correct_action(action_raw) 89 | else: 90 | action_constrained = action_raw 91 | state, reward, done, info = self.step_base(action_constrained) 92 | self.update_ineq_matrices(state) 93 | info[self.info_key_constrained_action] = action_constrained 94 | if self.is_recovery_mode: 95 | self.n_recovery_steps += 1 96 | is_max_recovery_steps = self.n_recovery_steps == self.max_recovery_steps 97 | if info[self.info_key_failure]: 98 | reward = self.recovery_reward_failure 99 | else: 100 | reward = self.recovery_reward_alive 101 | done = done or is_max_recovery_steps 102 | return state, reward, done, info 103 | 104 | def reset_ceres(self): 105 | ''' 106 | Restore a reference snapshot when available (e.g., in recovery mode), otherwise use the base environment reset, and predict new constraints 107 | ''' 108 | if len(self.reference_trajectories) == 0: 109 | state = self.reset_base() 110 | self.recovery_info = None 111 | else: 112 | snapshot, self.recovery_info = ResetterEnvCeres.get_random_reference_snapshot(self) 113 | state = self.reset_and_restore(snapshot=snapshot) 114 | self.update_ineq_matrices(state) 115 | self.n_recovery_steps = 0 116 | return state 117 | 118 | def init_recovery(self): 119 | required_base_env_attrs = ['max_recovery_steps', 'info_key_failure', 'info_key_success'] 120 | for _k in required_base_env_attrs: 121 | assert hasattr(self, _k), 'Undefined attribute {0} in base environment within {1}'.format(_k, type(self)) 122 | self.recovery_reward_failure = -self.max_recovery_steps 123 | 124 | -------------------------------------------------------------------------------- /ceres/networks/network_saver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
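# Typical lifecycle (a sketch of how the constraint network uses this class in
# ceres/baselines/ceres/run_continuous.py):
#   net.init_saver(backup_dir, session=sess, max_to_keep=1)   # once, after building the graph
#   net.save_model(global_step=step)                          # writes backup_dir/model-<step>.*
#   ...
#   net.restore_model(backup_dir, session=sess)               # loads the latest model-<step>
# restore_model() also accepts backup_network_id to load weights that were saved under a
# different variable-name prefix than network_id.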
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file as pticf 7 | import time 8 | import os 9 | 10 | class NetworkSaver(object): 11 | ''' 12 | A simple class implementing save and restore functions for neural networks in Tensorflow 13 | ''' 14 | 15 | model_basename = 'model' 16 | 17 | def __init__(self, network_id): 18 | self.network_id = network_id 19 | self.tf_var_prefix = '{0}/'.format(self.network_id) 20 | 21 | def get_var_name_mapping(self, backup_network_id=None): 22 | ''' 23 | Return a dict associating this network's variable names to trainable tensors. 24 | Optional argument backup_network_id allows loading weights that were saved under a different name from network_id 25 | ''' 26 | var_name_mapping = {} 27 | for v in tf.trainable_variables(): 28 | if self.tf_var_prefix == v.name[:len(self.tf_var_prefix)]: 29 | v_name_train = v.name 30 | if backup_network_id is not None: 31 | v_name_train = backup_network_id.join(v_name_train.split(self.network_id)) 32 | v_name_train = v_name_train.split(':')[0] 33 | var_name_mapping[v_name_train] = v 34 | return var_name_mapping 35 | 36 | def restore_model(self, path_restore, session=None, backup_network_id=None, verbose=True): 37 | ''' 38 | Restore trained weights 39 | ''' 40 | if session is None: 41 | assert hasattr(self, 'session'), 'Either pass session as argument or set during saver initialization' 42 | session = self.session 43 | var_name_mapping = self.get_var_name_mapping(backup_network_id=backup_network_id) 44 | saver = tf.train.Saver(var_name_mapping) 45 | path_model = self.get_latest_model(path_restore, model_basename=self.model_basename) 46 | try: 47 | saver.restore(session, path_model) 48 | print('Restored network: {0}'.format(path_model)) 49 | except Exception as e: 50 | print('Could not restore {0} from checkpoint: {1}'.format(type(self), path_model)) 51 | print('This is the content of the checkpoint file:') 52 | pticf(file_name=path_model, tensor_name='', all_tensors=False) 53 | raise e 54 | 55 | def init_saver(self, path_backup_dir, session=None, max_to_keep=1): 56 | ''' 57 | Build backup path for future use 58 | ''' 59 | if session is not None: 60 | self.session = session 61 | os.makedirs(path_backup_dir, exist_ok=True) 62 | self.path_model = os.path.join(path_backup_dir, self.model_basename) 63 | var_name_mapping = self.get_var_name_mapping() 64 | var_to_save = [_e for _k, _e in var_name_mapping.items()] 65 | self.saver = tf.train.Saver(var_to_save, max_to_keep=max_to_keep) 66 | 67 | def save_model(self, global_step=None, verbose=True, path_model=None, session=None): 68 | ''' 69 | Save model from given session to given path if specified, 70 | else take these from previous init_saver call 71 | ''' 72 | if path_model is None: 73 | assert hasattr(self, 'path_model'), 'Specify path_model or set it at initialization' 74 | path_model = self.path_model 75 | if session is None: 76 | assert hasattr(self, 'path_model'), 'Specify session or set it at initialization' 77 | session = self.session 78 | if global_step is None: 79 | self.saver.save(session, self.path_model) 80 | else: 81 | self.saver.save(session, self.path_model, global_step=global_step) 82 | if verbose: 83 | print('Save network: {0}'.format(path_model)) 84 | 85 | @classmethod 86 | def get_latest_model(cls, path_model, model_basename='model'): 87 | ''' 88 | Check 
for files of the form - and return the most recent 89 | ''' 90 | model_index_extension = '.index' 91 | 92 | if os.path.isdir(path_model): 93 | path_model = os.path.join(path_model, model_basename) 94 | path_model_full = path_model + model_index_extension 95 | 96 | if not os.path.isfile(path_model_full): 97 | # Check for files of the form model-1000.index 98 | path_model_dirname = os.path.dirname(path_model_full) 99 | model_basename = os.path.basename(path_model) 100 | files_in_dir = os.listdir(path_model_dirname) 101 | path_model_candidates = [] 102 | model_iter_numbers = [] 103 | for _f in files_in_dir: 104 | if _f[:len(model_basename)] != model_basename: 105 | continue 106 | if _f[-len(model_index_extension):] != model_index_extension: 107 | continue 108 | _f_base = _f[:-len(model_index_extension)] 109 | _f_base_split = _f_base.split('-') 110 | assert (_f_base_split[0] == model_basename) and (len(_f_base_split) == 2), 'Invalid file {0}, expected {1}-{2}'.format(_f, model_basename, model_index_extension) 111 | i_iter = int(_f_base_split[1]) 112 | model_iter_numbers.append(i_iter) 113 | assert len(model_iter_numbers) > 0, 'Cannot find any model candidate in directory {0}'.format(path_model_dirname) 114 | model_basename = '{0}-{1}'.format(model_basename, max(model_iter_numbers)) 115 | path_model = os.path.join(path_model_dirname, model_basename) 116 | path_model_full = path_model + model_index_extension 117 | assert os.path.isfile(path_model_full), 'Model backup file does not exist: {0}'.format(path_model_full) 118 | return path_model 119 | 120 | def build_model(self, *args, **kwargs): 121 | raise NotImplementedError('Implement build_model in child classes') 122 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/run_continuous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | from mpi4py import MPI 7 | from baselines.common.mpi_fork import mpi_fork 8 | from baselines.common import tf_util as U 9 | from baselines import logger 10 | from . import pposgd_ceres 11 | 12 | from baselines.common.cmd_util import make_mujoco_env 13 | from .mlp_policy_saver import MlpPolicySaver 14 | from ceres.envs import CeresEnv 15 | from ceres import ConstraintNetworkMLP, ConstraintConfig 16 | from ceres import ConstraintDemonstrationBuffer 17 | 18 | def build_log_dirs(path_xp, rank, is_direct_policy): 19 | worker_name = 'worker_{0}_{1}'.format(rank, 'direct' if is_direct_policy else 'recovery') 20 | worker_dir = os.path.join(path_xp, worker_name) 21 | worker_policy_dir = os.path.join(worker_dir, 'policy') 22 | worker_constraints_dir = os.path.join(worker_dir, 'constraints') 23 | return worker_name, worker_dir, worker_policy_dir, worker_constraints_dir 24 | 25 | def main(): 26 | ''' 27 | Initialize CERES environment and launch policy and constraint learning 28 | or restart from a previous training session. 
29 | ''' 30 | from ceres.tools import ExtraArgs 31 | log_root = os.path.join(os.getcwd(), 'logs') 32 | extra_args = ExtraArgs(log_root=log_root) 33 | 34 | n_agents_total = extra_args.n_direct + extra_args.n_recovery 35 | whoami = mpi_fork(n_agents_total) 36 | if whoami == "parent": 37 | return 38 | sess = U.single_threaded_session() 39 | sess.__enter__() 40 | 41 | # Synchronize log directory between agents 42 | rank = MPI.COMM_WORLD.Get_rank() 43 | is_direct_policy = rank < extra_args.n_direct 44 | if rank == 0: 45 | path_xp = extra_args.path_xp 46 | for dest_rank in range(n_agents_total): 47 | send_buffer = [path_xp] 48 | MPI.COMM_WORLD.send(send_buffer, dest=dest_rank, tag=rank) 49 | else: 50 | recv_buffer = MPI.COMM_WORLD.recv(source=0, tag=0) 51 | path_xp = recv_buffer[0] 52 | 53 | # Find root processes for direct and recovery 54 | if is_direct_policy: 55 | root_rank = 0 56 | else: 57 | root_rank = extra_args.n_direct 58 | 59 | worker_name, worker_dir, worker_policy_dir, worker_constraints_dir = build_log_dirs(path_xp, rank, is_direct_policy) 60 | logger.configure(dir=worker_dir) 61 | 62 | if not rank == 0: # only log first direct 63 | logger.set_level(logger.DISABLED) 64 | 65 | workerseed = extra_args.seed + 10000 * rank 66 | assert len(extra_args.env_id) > 0, 'Missing argument --env_id' 67 | env = make_mujoco_env(extra_args.env_id, workerseed) 68 | assert isinstance(env.unwrapped, CeresEnv), 'Env {0} should be an instance of CeresEnv'.format(type(env)) 69 | env.unwrapped.init_ceres(is_recovery_mode=(not is_direct_policy)) 70 | 71 | # Setup restoration parameters from previous logs 72 | if len(extra_args.continue_ceres_training) > 0: 73 | assert os.path.isdir(extra_args.continue_ceres_training), 'Could not find log directory: {0}'.format(extra_args.continue_ceres_training) 74 | # All direct share one policy, all recovery share another 75 | _, _, extra_args.trained_policy, _ = build_log_dirs(extra_args.continue_ceres_training, root_rank, is_direct_policy) 76 | # All agents share a single constraint network 77 | _, _, _, extra_args.trained_cnet = build_log_dirs(extra_args.continue_ceres_training, 0, True) 78 | # All agents have separate demonstration buffers 79 | _, _, _, extra_args.constraint_demonstration_buffer = build_log_dirs(extra_args.continue_ceres_training, rank, is_direct_policy) 80 | 81 | def policy_fn(name, ob_space, ac_space): 82 | policy = MlpPolicySaver(name, ob_space=ob_space, ac_space=ac_space, 83 | hid_size=extra_args.policy_hidden_size, num_hid_layers=extra_args.policy_hidden_layers) 84 | policy.init_saver(worker_policy_dir, session=sess, max_to_keep=extra_args.backup_keep) 85 | return policy 86 | 87 | # Initialize backup directories 88 | os.makedirs(worker_constraints_dir, exist_ok=True) 89 | if len(extra_args.trained_cnet) > 0: 90 | cnet_config = ConstraintConfig.from_backup(extra_args.trained_cnet) 91 | else: 92 | cnet_config = ConstraintConfig.from_extra_args(extra_args) 93 | if rank == 0: 94 | cnet_config.save(worker_constraints_dir) 95 | cnet = ConstraintNetworkMLP(env.observation_space, env.action_space, cnet_config) 96 | cnet.init_saver(worker_constraints_dir, session=sess, max_to_keep=extra_args.backup_keep) 97 | env.unwrapped.init_constraint_prediction(cnet, session=sess) 98 | 99 | constraint_demonstration_buffer = ConstraintDemonstrationBuffer(extra_args.constraint_demonstration_buffer_size) 100 | constraint_demonstration_buffer.init_saver(worker_constraints_dir) 101 | 102 | # Check end criterion 103 | possible_end_criteria = ['max_iterations', 
'max_timesteps', 'max_episodes', 'max_seconds'] 104 | active_end_criteria = [_k for _k in possible_end_criteria if getattr(extra_args, _k) > 0] 105 | n_end_criteria = len(active_end_criteria) 106 | if extra_args.max_iterations == 0: 107 | raise ValueError('Specify one end criterion out of {0}'.format(possible_end_criteria)) 108 | else: 109 | assert n_end_criteria == 1, 'Only one time constraint permitted but {0} specified: {1}'.format(n_end_criteria, active_end_criteria) 110 | 111 | # Start training! 112 | pposgd_ceres.learn(env, policy_fn, 113 | max_timesteps=extra_args.max_timesteps, 114 | max_iters=extra_args.max_iterations, 115 | max_episodes=extra_args.max_episodes, 116 | max_seconds=extra_args.max_seconds, 117 | timesteps_per_actorbatch=extra_args.timesteps_per_actorbatch, 118 | clip_param=0.2, entcoeff=extra_args.policy_entcoeff, 119 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, 120 | gamma=0.99, lam=0.95, schedule=extra_args.policy_learning_rate_schedule, 121 | extra_args=extra_args, cnet=cnet, constraint_demonstration_buffer=constraint_demonstration_buffer, 122 | ) 123 | env.close() 124 | 125 | if rank == 0: 126 | print('Done! Logs are located in {0}'.format(path_xp)) 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /ceres/envs/nav2d/obstacles.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | 7 | class Obstacle(object): 8 | ''' 9 | Base class for obstacles, with export/import functions 10 | ''' 11 | required_parameters = [] 12 | def __init__(self, **kwargs): 13 | for _k in self.required_parameters: 14 | setattr(self, _k, kwargs[_k]) 15 | self.check_parameters() 16 | 17 | def to_array(self): 18 | params = [getattr(self, _k) for _k in self.required_parameters] 19 | return params 20 | 21 | @classmethod 22 | def FromArray(cls, params): 23 | assert len(params) == len(cls.required_parameters) 24 | params_as_dict = {_k: _v for _k, _v in zip(cls.required_parameters, params)} 25 | obstacle = cls(**params_as_dict) 26 | return obstacle 27 | 28 | class ObstacleSquare(Obstacle): 29 | ''' 30 | Square obstacle, initialized from the 2D location of its top-left and bottom-right corners 31 | ''' 32 | required_parameters = ['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y'] 33 | 34 | def check_parameters(self): 35 | assert self.top_left_x < self.bottom_right_x 36 | assert self.top_left_y > self.bottom_right_y 37 | self.bottom_left_x = self.top_left_x 38 | self.bottom_left_y = self.bottom_right_y 39 | self.top_right_x = self.bottom_right_x 40 | self.top_right_y = self.top_left_y 41 | 42 | def test_collision(self, x, y, conservative=False, min_distance=0.): 43 | # Set conservative=True to count border as collision 44 | x_proj, y_proj, is_strictly_inside = self.project(x, y) 45 | if is_strictly_inside: 46 | return True 47 | else: 48 | dist = np.linalg.norm(np.array([x, y]) - np.array([x_proj, y_proj])) 49 | if conservative: 50 | is_collision = dist <= min_distance 51 | else: 52 | is_collision = dist < min_distance 53 | return is_collision 54 | 55 | def to_polygon(self): 56 | path_closed = [] 57 | path_closed.append((self.top_left_x, self.top_left_y)) 58 | path_closed.append((self.bottom_left_x, self.bottom_left_y)) 59 | 
path_closed.append((self.bottom_right_x, self.bottom_right_y)) 60 | path_closed.append((self.top_right_x, self.top_right_y)) 61 | return path_closed 62 | 63 | def project(self, x, y): 64 | strict_inside_x = False 65 | strict_inside_y = False 66 | if x >= self.bottom_right_x: 67 | x_proj = self.bottom_right_x 68 | elif x <= self.bottom_left_x: 69 | x_proj = self.bottom_left_x 70 | else: 71 | x_proj = x 72 | strict_inside_x = True 73 | if y >= self.top_left_y: 74 | y_proj = self.top_left_y 75 | elif y <= self.bottom_left_y: 76 | y_proj = self.bottom_left_y 77 | else: 78 | y_proj = y 79 | strict_inside_y = True 80 | is_strictly_inside = strict_inside_x and strict_inside_y 81 | return x_proj, y_proj, is_strictly_inside 82 | 83 | def intersection_with_line(self, p1, p2): 84 | raise NotImplementedError('Intersection between square and line not implemented') 85 | 86 | 87 | class ObstacleCircle(Obstacle): 88 | ''' 89 | Circle obstacle, initialized from the 2D location of its center and its radius 90 | ''' 91 | required_parameters = ['center_x', 'center_y', 'radius'] 92 | 93 | def check_parameters(self): 94 | assert self.radius > 0. 95 | self.center_xy = np.array([self.center_x, self.center_y]) 96 | self.intersection_line_shift = np.dot(self.center_xy, self.center_xy) - self.radius**2 # use this when computing intersection with line 97 | 98 | def test_collision(self, x, y, conservative=False, min_distance=0.): 99 | # Set conservative=True to count border as collision 100 | x_proj, y_proj, is_strictly_inside = self.project(x, y) 101 | if is_strictly_inside: 102 | return True 103 | else: 104 | dist = np.linalg.norm(np.array([x, y]) - np.array([x_proj, y_proj])) 105 | if conservative: 106 | is_collision = dist <= min_distance 107 | else: 108 | is_collision = dist < min_distance 109 | return is_collision 110 | 111 | def to_polygon(self): 112 | raise NotImplementedError('Use circle drawing function') 113 | 114 | def project(self, x, y): 115 | center_to_point = np.array([x - self.center_x, y - self.center_y]) 116 | dist_from_center = np.linalg.norm(center_to_point) 117 | is_strictly_inside = dist_from_center < self.radius 118 | if is_strictly_inside: 119 | x_proj, y_proj = x, y 120 | else: 121 | center_to_surface = center_to_point / dist_from_center * self.radius 122 | x_proj = self.center_x + center_to_surface[0] 123 | y_proj = self.center_y + center_to_surface[1] 124 | return x_proj, y_proj, is_strictly_inside 125 | 126 | def intersection_with_line(self, p1, p2): 127 | ''' 128 | Solve quadratic equation a x^2 + b x + c = 0 129 | with a = np.dot(v, v) with v unit vector between p1 and p2, 130 | b = 2 np.dot(v, p1 - center) 131 | c = np.dot(p1, p1) + np.dot(center, center) - 2 np.dot(p1, center) - radius^2 132 | ''' 133 | p1 = np.array(p1) 134 | p2 = np.array(p2) 135 | unit_vec = p2 - p1 136 | dist = np.linalg.norm(unit_vec) 137 | assert dist > 0. 138 | unit_vec /= dist 139 | a = np.dot(unit_vec, unit_vec) 140 | b = 2. * np.dot(unit_vec, p1 - self.center_xy) 141 | c = np.dot(p1, p1) - 2. * np.dot(p1, self.center_xy) + self.intersection_line_shift 142 | 143 | delta = b**2 - 4.* a * c 144 | if delta < 0: 145 | return False, None 146 | delta_sqrt = np.sqrt(delta) 147 | # Two solutions: x1, x2 148 | x1 = (-b - delta_sqrt) / (2. * a) 149 | x2 = (-b + delta_sqrt) / (2. * a) 150 | if 0. <= x1 <= dist: 151 | x_min = x1 152 | elif 0. 
<= x2 <= dist: 153 | x_min = x2 154 | else: 155 | #x_min = min(x1, x2) 156 | return False, None 157 | closest = p1 + x_min * unit_vec 158 | return True, closest 159 | 160 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/pposgd_ceres_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from baselines import logger 6 | import baselines.common.tf_util as U 7 | import numpy as np 8 | import time 9 | from mpi4py import MPI 10 | import gym 11 | 12 | def update_constraint_activation_probability(env, extra_args, logger, is_direct_policy, do_train_cnet, 13 | activation_probability_before, activation_probability_after): 14 | ''' 15 | Update environment constraint activation probability using constraint accuracy before or after training 16 | ''' 17 | activation_probability = extra_args.constant_constraint_activation 18 | if len(extra_args.adaptive_constraint_activation) > 0: 19 | do_use_prior_accuracy_as_activation_probability = 'prior' in extra_args.adaptive_constraint_activation 20 | if do_use_prior_accuracy_as_activation_probability or (not do_train_cnet): 21 | activation_probability = activation_probability_before 22 | else: 23 | activation_probability = activation_probability_after 24 | if (not is_direct_policy) and extra_args.unconstrained_recovery: 25 | activation_probability = 0. 26 | if activation_probability is not None: 27 | logger.log('Set constraint activation probability to {0:.1f} %'.format(activation_probability * 100.)) 28 | env.unwrapped.set_constraint_activation_probability(activation_probability) 29 | 30 | def check_time_between_backups(extra_args, last_backup_time=None): 31 | ''' 32 | Only write backups every min_time_between_backups 33 | ''' 34 | time_now = time.time() 35 | if last_backup_time is not None: 36 | time_since_last = time_now - last_backup_time 37 | do_save_backup = time_since_last > extra_args.min_time_between_backups 38 | else: 39 | do_save_backup = True 40 | if do_save_backup: 41 | last_backup_time = time_now 42 | return do_save_backup, last_backup_time 43 | 44 | def build_policy_observation_filter(extra_args, ob_space): 45 | ''' 46 | If extra_args.policy_observation_filter is a string of the form "1:3:6", only provide the policy with observations number 1, 3 and 6 47 | ''' 48 | if len(extra_args.policy_observation_filter) == 0: 49 | observation_filter = lambda ob: ob 50 | ob_space_filtered = ob_space 51 | else: 52 | indices = [int(_v) for _v in extra_args.policy_observation_filter.split(':')] 53 | observation_filter = lambda ob: np.array([ob[_i] for _i in indices], dtype=ob.dtype) 54 | low_filtered = observation_filter(ob_space.low) 55 | high_filtered = observation_filter(ob_space.high) 56 | ob_space_filtered = gym.spaces.Box(low=low_filtered, high=high_filtered, dtype=ob_space.dtype) 57 | return ob_space_filtered, observation_filter 58 | 59 | def build_mpi_vars(extra_args): 60 | ''' 61 | Initialize process indices across direct and recovery agents 62 | ''' 63 | mpi_comm = MPI.COMM_WORLD 64 | mpi_rank = mpi_comm.Get_rank() 65 | is_direct_policy = mpi_rank < extra_args.n_direct 66 | 67 | mpi_root_direct = 0 68 | mpi_group_direct = list(range(extra_args.n_direct)) 69 | mpi_root_recovery = extra_args.n_direct 70 | mpi_group_recovery = 
list(range(extra_args.n_direct, extra_args.n_direct + extra_args.n_recovery)) 71 | if is_direct_policy: 72 | mpi_root = mpi_root_direct 73 | mpi_group = mpi_group_direct 74 | else: 75 | mpi_root = mpi_root_recovery 76 | mpi_group = mpi_group_recovery 77 | mpi_destinations = [_e for _e in mpi_group if _e != mpi_root] 78 | mpi_n_processes = extra_args.n_direct + extra_args.n_recovery 79 | is_root = mpi_rank == mpi_root 80 | 81 | if extra_args.n_recovery > 0: 82 | # Correspondences between direct and recovery agents for CNet data exchange 83 | cnet_exchange_ids = {_i: [] for _i in mpi_group_direct + mpi_group_recovery} 84 | for _i in range(max(len(mpi_group_direct), len(mpi_group_recovery))): 85 | _i_direct = mpi_group_direct[_i % len(mpi_group_direct)] 86 | _i_recovery = mpi_group_recovery[_i % len(mpi_group_recovery)] 87 | if not (_i_recovery in cnet_exchange_ids[_i_direct]): 88 | cnet_exchange_ids[_i_direct].append(_i_recovery) 89 | if not (_i_direct in cnet_exchange_ids[_i_recovery]): 90 | cnet_exchange_ids[_i_recovery].append(_i_direct) 91 | 92 | # Also get the index of each recovery process within those associated to the corresponding direct process (re-read this several times) 93 | cnet_recovery_id_in_direct_exchange_ids = {_i: {} for _i in mpi_group_recovery} 94 | for _i_recovery in mpi_group_recovery: 95 | for _i_direct in cnet_exchange_ids[_i_recovery]: 96 | cnet_recovery_id_in_direct_exchange_ids[_i_recovery][_i_direct] = cnet_exchange_ids[_i_direct].index(_i_recovery) 97 | n_exchange_processes = len(cnet_exchange_ids[mpi_rank]) 98 | else: 99 | cnet_exchange_ids = None 100 | cnet_recovery_id_in_direct_exchange_ids = None 101 | n_exchange_processes = None 102 | 103 | return mpi_comm, mpi_rank, is_direct_policy, mpi_root, mpi_group, mpi_destinations, mpi_n_processes, is_root, cnet_recovery_id_in_direct_exchange_ids, cnet_exchange_ids, n_exchange_processes 104 | 105 | def save_models_and_data(extra_args, iters_so_far, end_training, last_backup_time, 106 | is_root, mpi_rank, pi, cnet, constraint_demonstration_buffer): 107 | ''' 108 | Save policy network, constraint network and constraint demonstration buffer 109 | ''' 110 | do_save_at_all = extra_args.backup_frequency > 0 111 | do_save_this_iter = (((iters_so_far - 1) % extra_args.backup_frequency) == 0) or end_training 112 | do_save_this_time, last_backup_time = check_time_between_backups(extra_args, last_backup_time) 113 | do_save_policy = not extra_args.only_train_constraints 114 | do_save_constraints = not extra_args.only_train_policy 115 | do_save_buffer = not (extra_args.only_train_policy or extra_args.only_train_constraints) 116 | if do_save_at_all and do_save_this_iter and do_save_this_time: 117 | if do_save_policy and is_root: 118 | # save direct and recovery policies separatery 119 | pi.save_model(global_step=(iters_so_far-1), verbose=True) 120 | if do_save_constraints and (mpi_rank == 0): 121 | # same CNet for all agents 122 | cnet.save_model(global_step=(iters_so_far-1), verbose=True) 123 | if do_save_buffer: 124 | # different buffers for all agents 125 | constraint_demonstration_buffer.write(verbose=is_root) 126 | return last_backup_time 127 | -------------------------------------------------------------------------------- /ceres/envs/resetter/resetter_env_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
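# How this fits into CERES (see ceres/envs/ceres_env.py above): whenever reference
# trajectories are available (e.g., in recovery mode), CeresEnv.reset_ceres() asks this
# class for a random reference snapshot; get_random_reference_index() deliberately returns
# trajectory midpoints, so such episodes restart from states halfway along previously
# recorded trajectories rather than from the environment's default initial state.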
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import numpy as np 6 | from .resetter_env import ResetterEnv 7 | 8 | class ResetterEnvCeres(ResetterEnv): 9 | ''' 10 | Resetter base environment implementing functions required by the CERES logic, 11 | e.g., reset from trajectory midpoints, remove identified demonstrations, etc. 12 | ''' 13 | max_reference_trajectories = -1 # Set this to a non negative value in a child class 14 | 15 | def _init_reference_parameters(self): 16 | ''' 17 | The maximum number of trajectories to keep is application-specific 18 | ''' 19 | assert self.max_reference_trajectories >= 0, 'max_reference_trajectories must be non negative: set it in class {0}'.format(type(self)) 20 | 21 | def _init_reference_trajectories(self): 22 | ''' 23 | Setup empty reference trajectory list 24 | ''' 25 | self.reference_trajectories = [] 26 | self.reset_count_per_trajectory = [] # these values are incremented outside the environment 27 | self.filter_reset_per_trajectory = [] # check for difference before incrementing 28 | 29 | def add_reference_trajectory(self, trajectory): 30 | ''' 31 | Add a new reference trajectory and generate new metadata 32 | ''' 33 | if len(self.reference_trajectories) < self.max_reference_trajectories: 34 | self.reference_trajectories.append(trajectory) 35 | self.reset_count_per_trajectory.append(0) 36 | self.filter_reset_per_trajectory.append(None) 37 | else: 38 | pass # Skip if full, add other behaviors in the future 39 | 40 | def get_random_reference_index(self): 41 | ''' 42 | Return trajectory midpoints 43 | ''' 44 | assert len(self.reference_trajectories) > 0, 'No active trajectory' 45 | i_traj = np.random.randint(0, len(self.reference_trajectories)) 46 | i_state = self.get_reference_trajectory_midpoint(i_traj) 47 | return i_traj, i_state 48 | 49 | def get_reference_trajectory(self, i_traj): 50 | return self.reference_trajectories[i_traj] 51 | 52 | def get_reference_trajectory_midpoint(self, i_traj): 53 | i_state = self.reference_trajectories[i_traj].get_midpoint() 54 | return i_state 55 | 56 | def remove_empty_trajectories(self): 57 | ''' 58 | Remove trajectories that have no active state 59 | ''' 60 | i_traj_active = [] 61 | for (i, traj) in enumerate(self.reference_trajectories): 62 | if traj.length_active > 0: 63 | i_traj_active.append(i) 64 | n_remove = len(self.reference_trajectories) - len(i_traj_active) 65 | self.reference_trajectories = [self.reference_trajectories[i] for i in i_traj_active] 66 | self.reset_count_per_trajectory = [self.reset_count_per_trajectory[i] for i in i_traj_active] 67 | self.filter_reset_per_trajectory = [self.filter_reset_per_trajectory[i] for i in i_traj_active] 68 | return n_remove 69 | 70 | def check_remove_traj(self, traj): 71 | ''' 72 | Check if the trajectory can be removed based on the number of active snapshots 73 | ''' 74 | do_remove_traj = traj.length_active == 0 75 | if traj.length_active == 1: # remove also if the only demonstration left is already classified 76 | demonstration = traj.get_demonstration(traj.active_demonstrations[0]) 77 | do_remove_traj = demonstration.test_is_classified() 78 | if do_remove_traj: 79 | #print('Final demonstration is already classified as {0}'.format(demonstration.action_indicator)) 80 | pass 81 | else: 82 | traj.do_reset_after_last_active = True 83 | return do_remove_traj 84 | 85 | def update_reference_trajectory(self, i_traj, is_resized, remove_if_emptied=False): 86 | ''' 87 
| Reset trajectory metadata and remove if applicable 88 | ''' 89 | traj = self.reference_trajectories[i_traj] 90 | if is_resized: 91 | self.reset_count_per_trajectory[i_traj] = 0 92 | self.filter_reset_per_trajectory[i_traj] = None 93 | if remove_if_emptied: 94 | if self.check_remove_traj(traj): 95 | self.reference_trajectories.pop(i_traj) 96 | self.reset_count_per_trajectory.pop(i_traj) 97 | self.filter_reset_per_trajectory.pop(i_traj) 98 | 99 | def get_reference_trajectory_active_demonstrations_from(self, i_traj, begin, remove_demonstrations=False, return_copy=True, remove_if_emptied=False): 100 | ''' 101 | Get a sub-trajectory starting from a given active demonstration and update metadata 102 | ''' 103 | traj = self.reference_trajectories[i_traj] 104 | subtraj, is_resized = traj.get_active_demonstrations_from(begin, remove_demonstrations=remove_demonstrations, return_copy=return_copy) 105 | self.update_reference_trajectory(i_traj, is_resized, remove_if_emptied=remove_if_emptied) 106 | return subtraj 107 | 108 | def get_reference_trajectory_active_demonstrations_to(self, i_traj, end, remove_demonstrations=False, return_copy=True, remove_if_emptied=False): 109 | ''' 110 | Get a sub-trajectory up to a given active demonstration and update metadata 111 | ''' 112 | traj = self.reference_trajectories[i_traj] 113 | subtraj, is_resized = traj.get_active_demonstrations_to(end, remove_demonstrations=remove_demonstrations, return_copy=return_copy) 114 | self.update_reference_trajectory(i_traj, is_resized, remove_if_emptied=remove_if_emptied) 115 | return subtraj 116 | 117 | def get_reference_trajectory_demonstration(self, i_traj, i_state, return_copy=True): 118 | ''' 119 | Get a chosen demonstration within a chosen trajectory, or copy thereof for separate processing 120 | ''' 121 | traj = self.reference_trajectories[i_traj] 122 | demonstration = traj.get_demonstration(i_state, return_copy=return_copy) 123 | return demonstration 124 | 125 | def increment_trajectory_reset_count(self, i_traj, increment=1, increment_reset_count_on_change=None): 126 | ''' 127 | Increment the number of times a trajectory has been reset too, unless a reset criterion is set 128 | ''' 129 | if increment_reset_count_on_change is not None: 130 | if increment_reset_count_on_change == self.filter_reset_per_trajectory[i_traj]: 131 | return 132 | else: 133 | self.filter_reset_per_trajectory[i_traj] = increment_reset_count_on_change 134 | self.reset_count_per_trajectory[i_traj] += increment 135 | 136 | def get_trajectory_reset_count(self, i_traj): 137 | return self.reset_count_per_trajectory[i_traj] 138 | 139 | -------------------------------------------------------------------------------- /ceres/envs/constrained/constrained_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import gym 6 | from ceres.tools.math import QPSolverQuadprog 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | class ConstrainedEnv(gym.Env): 11 | ''' 12 | Base class for constrained environments, with action correction prior to playing 13 | ''' 14 | 15 | return_zero_if_opt_fails = True 16 | constraint_violation_factor = 1. 17 | ineq_vec_margin = 0. 
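    # A minimal sketch of the quadratic-programming correction that correct_action() performs
    # below (projecting the policy action onto the feasible set), written here against the
    # quadprog package directly rather than the project's QPSolverQuadprog wrapper; a_target
    # and ineq_vec are assumed to be flat float arrays:
    #
    #     import numpy as np
    #     import quadprog
    #
    #     def project_action(a_target, ineq_mat, ineq_vec, margin=0.):
    #         # minimize 1/2 ||x - a_target||^2  subject to  ineq_mat @ x <= ineq_vec - margin,
    #         # i.e. objective matrices P = identity, q = -a_target as in update_obj_matrices()
    #         P = np.eye(a_target.size)
    #         # quadprog solves: min 1/2 x^T P x - a^T x  subject to  C^T x >= b
    #         C = -ineq_mat.T
    #         b = -(ineq_vec - margin)
    #         return quadprog.solve_qp(P, a_target, C, b)[0]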
18 | has_ineq = True 19 | has_eq = True 20 | 21 | def __init__(self, *args, **kwargs): 22 | self.init_solver() 23 | self.check_instance() 24 | self.ineq_mat = None 25 | self.ineq_vec = None 26 | 27 | def init_solver(self): 28 | ''' 29 | Correct actions with quadratic programming, define functions to implement within child classes 30 | ''' 31 | self.correct_in_env = True 32 | self.required_functions = ['update_ineq_matrices'] 33 | if self.has_ineq: 34 | self.required_functions.append('update_ineq_matrices') 35 | if self.has_eq: 36 | self.required_functions.append('update_eq_matrices') 37 | self.solver = QPSolverQuadprog() 38 | 39 | def check_instance(self): 40 | ''' 41 | Check necessary attributes from parent and child classes 42 | ''' 43 | for f_name in self.required_functions: 44 | assert hasattr(self, f_name), 'Required function {0} is not implemented in {1}'.format(f_name, type(self)) 45 | for attr_name in ['observation_space', 'action_space']: 46 | assert hasattr(self, attr_name), 'Undefined attribute {0}: make sure the base environment is initialized'.format(attr_name) 47 | n_obs = self.observation_space.shape[0] 48 | if hasattr(self, 'n_obs'): 49 | assert self.n_obs == n_obs, 'Found two different values of n_obs: {0} and {1}'.format(self.n_obs, n_obs) 50 | else: 51 | self.n_obs = n_obs 52 | n_act = self.action_space.shape[0] 53 | if hasattr(self, 'n_act'): 54 | assert self.n_act == n_act, 'Found two different values of n_act: {0} and {1}'.format(self.n_act, n_act) 55 | else: 56 | self.n_act = n_act 57 | 58 | def set_ineq_margin(self, margin_param, relative=True): 59 | ''' 60 | Define a margin that corrected actions must preserve w.r.t. constraints, that is, solve G x <= h - margin 61 | ''' 62 | self.ineq_vec_margin = margin_param 63 | if relative: 64 | ac_space_pm = [0.5*(high-low) for low, high in zip(self.action_space.low, self.action_space.high)] 65 | self.ineq_vec_margin *= min(ac_space_pm) 66 | assert self.ineq_vec_margin >= 0, 'Negative margin not supported, but you can disable this check to allow constraint violation' 67 | 68 | 69 | def update_solver(self, do_update_eq=True, do_update_ineq=True, do_update_obj=True): 70 | ''' 71 | Update QP solver parameters 72 | ''' 73 | self.solver.reset(do_reset_eq=do_update_eq, do_reset_ineq=do_update_ineq, do_reset_obj=do_update_obj) 74 | self.solver.add_obj(self.obj_mat, self.obj_vec) 75 | if self.has_ineq: 76 | ineq_vec_solve = self.ineq_vec - self.ineq_vec_margin # account for conservative margin 77 | self.solver.add_ineq(self.ineq_mat, ineq_vec_solve) 78 | if self.has_eq: 79 | self.solver.add_eq(self.eq_mat, self.eq_vec) 80 | self.solver.update() 81 | 82 | def correct_action(self, target_action, do_update_eq=True, do_update_ineq=True, do_update_obj=True): 83 | ''' 84 | Correct action by solving the QP and compute how much the uncorrected action violates the constraints 85 | ''' 86 | # Only rebuild objective function matrices since inequality matrices are already rebuilt at the end of each step 87 | if do_update_obj: 88 | self.update_obj_matrices(target_action) 89 | self.update_solver(do_update_eq=do_update_eq, do_update_ineq=do_update_ineq, do_update_obj=do_update_obj) 90 | corrected_action, success = self.solver.solve() 91 | if success: 92 | corrected_action = np.reshape(corrected_action, target_action.shape) 93 | else: 94 | if self.return_zero_if_opt_fails: 95 | corrected_action = np.zeros(target_action.shape) 96 | else: 97 | corrected_action = target_action 98 | viol = self.calc_constraint_violation(target_action) 99 | return 
corrected_action, success, viol 100 | 101 | def print_ineq(self, ineq_mat=None, ineq_vec=None): 102 | ''' 103 | Print inequality constraints in a human-readable format 104 | ''' 105 | print(self.ineq_to_str(ineq_mat=ineq_mat, ineq_vec=ineq_vec)) 106 | 107 | def ineq_to_str(self, ineq_mat=None, ineq_vec=None): 108 | ''' 109 | Build a human-readable string for inequality constraints 110 | ''' 111 | if ineq_mat is None: 112 | ineq_mat = self.ineq_mat 113 | if ineq_vec is None: 114 | ineq_vec = self.ineq_vec 115 | ineq_mat_str = str(ineq_mat) 116 | ineq_mat_lines_str = ineq_mat_str.split('\n') 117 | ineq_vec_str = str(ineq_vec) 118 | ineq_vec_lines_str = ineq_vec_str.split('\n') 119 | n_ineq = len(ineq_mat_lines_str) 120 | n_digits_max = len(str(n_ineq)) 121 | opt_var_str = [['X{0}'.format(str(_i).zfill(n_digits_max))] for _i in range(n_ineq)] 122 | opt_var_lines_str = str(np.array(opt_var_str)).split('\n') 123 | ineq_str_lines = [] 124 | for _i, (ineq_mat_line_str, opt_var_line_str, ineq_vec_line_str) in enumerate(zip(ineq_mat_lines_str, opt_var_lines_str, ineq_vec_lines_str)): 125 | ineq_str_line = '{0}{3} {1} {3}<= {2}{3}'.format(ineq_mat_line_str, opt_var_line_str, ineq_vec_line_str, ' ' if _i < n_ineq-1 else '') 126 | ineq_str_lines.append(ineq_str_line) 127 | ineq_str = '\n'.join(ineq_str_lines) 128 | return ineq_str 129 | 130 | def calc_constraint_violation(self, raw_action): 131 | ''' 132 | Compute the L2 norm of the constraint violation margin for the uncorrected action 133 | ''' 134 | if self.ineq_mat is not None: 135 | a = np.reshape(raw_action, (self.n_act, 1)) 136 | ineq_diff = np.dot(self.ineq_mat, a) - self.ineq_vec 137 | ineq_val = np.maximum(ineq_diff, 0.) 138 | ineq_viol = np.linalg.norm(ineq_val) 139 | else: 140 | ineq_viol = 0. 141 | return ineq_viol 142 | 143 | def update_obj_matrices(self, target_action): 144 | ''' 145 | Build objective function matrices of the form 1/2 xT P x + qT x, 146 | to minimize the distance between optimal and uncorrected (target) action, 1/2 || target - x || ^2, 147 | hence P = identity and q = -target 148 | ''' 149 | self.obj_mat = np.eye(self.n_act) 150 | self.obj_vec = -np.reshape(target_action, (self.n_act, 1)) 151 | 152 | -------------------------------------------------------------------------------- /ceres/baselines/ppo1/pposgd_simple_helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | These are components of OpenAI's baselines.ppo1.pposgd_simple.learn 3 | cut into individual functions for re-use in CERES 4 | OpenAI Baselines is licensed under the MIT License, see LICENSE 5 | ''' 6 | 7 | from baselines.ppo1.pposgd_simple import add_vtarg_and_adv 8 | from baselines.common import Dataset, explained_variance, fmt_row, zipsame 9 | from baselines import logger 10 | import baselines.common.tf_util as U 11 | import tensorflow as tf, numpy as np 12 | import time 13 | from baselines.common.mpi_adam import MpiAdam 14 | from collections import deque 15 | from mpi4py import MPI 16 | 17 | def calc_end_training(max_timesteps, timesteps_so_far, 18 | max_episodes, episodes_so_far, 19 | max_iters, iters_so_far, 20 | max_seconds, tstart): 21 | if max_timesteps and timesteps_so_far >= max_timesteps: 22 | return True 23 | elif max_episodes and episodes_so_far >= max_episodes: 24 | return True 25 | elif max_iters and iters_so_far >= max_iters: 26 | return True 27 | elif max_seconds and time.time() - tstart >= max_seconds: 28 | return True 29 | else: 30 | return False 31 | 32 | def 
build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon): 33 | atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) 34 | ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return 35 | 36 | lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule 37 | clip_param = clip_param * lrmult # Annealed cliping parameter epislon 38 | 39 | ob = U.get_placeholder_cached(name="ob") 40 | ac = pi.pdtype.sample_placeholder([None]) 41 | 42 | kloldnew = oldpi.pd.kl(pi.pd) 43 | ent = pi.pd.entropy() 44 | meankl = tf.reduce_mean(kloldnew) 45 | meanent = tf.reduce_mean(ent) 46 | pol_entpen = (-entcoeff) * meanent 47 | 48 | ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold 49 | surr1 = ratio * atarg # surrogate from conservative policy iteration 50 | surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # 51 | pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) 52 | vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) 53 | total_loss = pol_surr + pol_entpen + vf_loss 54 | losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] 55 | loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] 56 | 57 | var_list = pi.get_trainable_variables() 58 | lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) 59 | adam = MpiAdam(var_list, epsilon=adam_epsilon) 60 | 61 | assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) 62 | for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) 63 | compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) 64 | 65 | return loss_names, var_list, lossandgrad, adam, assign_old_eq_new, compute_losses 66 | 67 | def adjust_policy_learning_rate(schedule, 68 | max_timesteps, timesteps_so_far, 69 | max_episodes, episodes_so_far, 70 | max_iters, iters_so_far): 71 | if schedule == 'constant': 72 | cur_lrmult = 1.0 73 | elif schedule == 'linear': 74 | if max_timesteps > 0: 75 | cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) 76 | elif max_episodes > 0: 77 | cur_lrmult = max(1.0 - float(episodes_so_far) / max_episodes, 0) 78 | elif max_iters > 0: 79 | cur_lrmult = max(1.0 - float(iters_so_far) / max_iters, 0) 80 | else: 81 | raise NotImplementedError 82 | else: 83 | raise NotImplementedError 84 | return cur_lrmult 85 | 86 | def update_policy(pi, seg, gamma, lam, 87 | logger, optim_epochs, optim_batchsize, optim_stepsize, cur_lrmult, 88 | loss_names, lossandgrad, adam, assign_old_eq_new, compute_losses, 89 | mpi_moments_fn): 90 | 91 | add_vtarg_and_adv(seg, gamma, lam) 92 | 93 | # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) 94 | ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] 95 | vpredbefore = seg["vpred"] # predicted value function before udpate 96 | atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate 97 | d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) 98 | optim_batchsize = optim_batchsize or ob.shape[0] 99 | 100 | if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy 101 | 102 | assign_old_eq_new() # set old parameter values to new parameter values 103 | logger.log("Optimizing...") 104 | logger.log(fmt_row(13, loss_names)) 105 | # Here we do a bunch of optimization epochs over the data 106 | for _ in 
range(optim_epochs): 107 | losses = [] # list of tuples, each of which gives the loss for a minibatch 108 | for batch in d.iterate_once(optim_batchsize): 109 | *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) 110 | adam.update(g, optim_stepsize * cur_lrmult) 111 | losses.append(newlosses) 112 | logger.log(fmt_row(13, np.mean(losses, axis=0))) 113 | 114 | logger.log("Evaluating losses...") 115 | losses = [] 116 | for batch in d.iterate_once(optim_batchsize): 117 | newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) 118 | losses.append(newlosses) 119 | meanlosses,_,_ = mpi_moments_fn(losses) 120 | logger.log(fmt_row(13, meanlosses)) 121 | for (lossval, name) in zipsame(meanlosses, loss_names): 122 | logger.record_tabular("loss_"+name, lossval) 123 | return vpredbefore, tdlamret, optim_batchsize 124 | 125 | def log_iter_info(lenbuffer, rewbuffer, tstart, 126 | vpredbefore, tdlamret, seg, 127 | episodes_so_far, timesteps_so_far, 128 | do_dump_tabular, allgather_fn): 129 | logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) 130 | lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values 131 | listoflrpairs = allgather_fn(lrlocal) # list of tuples 132 | lens, rews = map(flatten_lists, zip(*listoflrpairs)) 133 | lenbuffer.extend(lens) 134 | rewbuffer.extend(rews) 135 | logger.record_tabular("EpLenMean", np.mean(lenbuffer)) 136 | logger.record_tabular("EpRewMean", np.mean(rewbuffer)) 137 | logger.record_tabular("EpThisIter", len(lens)) 138 | episodes_so_far += len(lens) 139 | timesteps_so_far += sum(lens) 140 | logger.record_tabular("EpisodesSoFar", episodes_so_far) 141 | logger.record_tabular("TimestepsSoFar", timesteps_so_far) 142 | logger.record_tabular("TimeElapsed", time.time() - tstart) 143 | if do_dump_tabular: 144 | logger.dump_tabular() 145 | return episodes_so_far, timesteps_so_far 146 | 147 | def build_counters(): 148 | episodes_so_far = 0 149 | timesteps_so_far = 0 150 | iters_so_far = 0 151 | tstart = time.time() 152 | lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths 153 | rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards 154 | return iters_so_far, episodes_so_far, timesteps_so_far, tstart, lenbuffer, rewbuffer 155 | 156 | def flatten_lists(listoflists): 157 | return [el for list_ in listoflists for el in list_] 158 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | An open-source and open community project is one in which participants choose to work together, and in that process experience differences in language, location, nationality, and experience. In such a diverse environment, misunderstandings and disagreements happen, which in most cases can be resolved informally. In rare cases, however, behavior can intimidate, harass, or otherwise disrupt one or more people in the community, which this project will not tolerate. 2 | 3 | A **Code of Conduct** is useful to define accepted and acceptable behaviors and to promote high standards of professional practice. It also provides a benchmark for self evaluation and acts as a vehicle for better identity of the organization. 4 | 5 | This code (**CoC**) applies to any participant in this project's community – developers, participants in meetings, teleconferences, mailing lists, conferences or functions, etc. 
Note that this code complements rather than replaces legal rights and obligations pertaining to any particular situation. 6 | 7 | ## Statement of Intent 8 | 9 | This project is committed to maintain a **positive** [work environment](#work-environment). This commitment calls for a workplace where [participants](#participant) at all levels behave according to the rules of the following code. A foundational concept of this code is that we all share responsibility for our work environment. 10 | ## Code 11 | 1. Treat each other with [respect](#respect), professionalism, fairness, and sensitivity to our many differences and strengths, including in situations of high pressure and urgency. 12 | 1. Never [harass](#harassment) or [bully](#workplace-bullying) anyone verbally, physically or [sexually](#sexual-harassment). 13 | 1. Never [discriminate](#discrimination) on the basis of personal characteristics or group membership. 14 | 1. Communicate constructively and avoid [demeaning](#demeaning-behavior) or [insulting](#insulting-behavior) behavior or language. 15 | 1. Seek, accept, and offer objective work criticism, and [acknowledge](#acknowledgement] properly the contributions of others. 16 | 1. Be honest about your own qualifications, and about any circumstances that might lead to conflicts of interest. 17 | 1. Respect the privacy of others and the confidentiality of data you access. 18 | 1. With respect to cultural differences, be conservative in what you do and liberal in what you accept from others, but not to the point of accepting disrespectful, unprofessional or unfair or [unwelcome behavior](#unwelcome-behavior) or [advances](#unwelcome-sexual-advance). 19 | 1. Promote the rules of this Code and take action (especially if you are in a [leadership position](#leadership-position)) to bring the discussion back to a more civil level whenever inappropriate behaviors are observed. 20 | 1. Stay on topic: Make sure that you are posting to the correct channel and avoid off-topic discussions. Remember when you update an issue or respond to an email you are potentially sending to a large number of people. 21 | 1. Step down considerately: Members of every project come and go, and the Hyperledger Project is no different. When you leave or disengage from the project, in whole or in part, we ask that you do so in a way that minimizes disruption to the project. This means you should tell people you are leaving and take the proper steps to ensure that others can pick up where you left off. 22 | 23 | ## Glossary 24 | #### Demeaning behavior 25 | is acting in a way that reduces another person's dignity, sense of self-worth or respect within the community. 26 | 27 | #### Discrimination 28 | is the prejudicial treatment of an individual based on criteria such as: physical appearance, race, ethnic origin, genetic differences, national or social origin, name, religion, gender, sexual orientation, family or health situation, pregnancy, disability, age, education, wealth, domicile, political view, morals, employment, or union activity. 29 | 30 | #### Insulting behavior 31 | is treating another person with scorn or disrespect. 32 | 33 | #### Acknowledgement 34 | is a record of the origin(s) and author(s) of a contribution. 35 | 36 | #### Harassment 37 | is any conduct, verbal or physical, that has the intent or effect of interfering with an individual, or that creates an intimidating, hostile, or offensive environment. 
38 | 39 | #### Leadership position 40 | includes group Chairs, project maintainers, staff members, and Board members. 41 | 42 | #### Participant 43 | includes the following persons: 44 | * Developers 45 | * Anyone from the Public partaking in this project's work environment (e.g. contribute code, comment on our code or specs, email us, attend our conferences, functions, etc) 46 | 47 | #### Respect 48 | is the genuine consideration you have for someone (if only because of their status as participant in Hyperledger Project, like yourself), and that you show by treating them in a polite and kind way. 49 | 50 | #### Sexual harassment 51 | includes visual displays of degrading sexual images, sexually suggestive conduct, offensive remarks of a sexual nature, requests for sexual favors, unwelcome physical contact, and sexual assault. 52 | 53 | #### Unwelcome behavior 54 | Hard to define? Some questions to ask yourself are: 55 | * how would I feel if I were in the position of the recipient? 56 | * would my spouse, parent, child, sibling or friend like to be treated this way? 57 | * would I like an account of my behavior published in the organization's newsletter? 58 | * could my behavior offend or hurt other members of the work group? 59 | * could someone misinterpret my behavior as intentionally harmful or harassing? 60 | * would I treat my boss or a person I admire at work like that ? 61 | 62 | _Summary_: if you are unsure whether something might be welcome or unwelcome, don't do it. 63 | 64 | #### Unwelcome sexual advance 65 | includes requests for sexual favors, and other verbal or physical conduct of a sexual nature, where: 66 | * submission to such conduct is made either explicitly or implicitly a term or condition of an individual's employment, 67 | * submission to or rejection of such conduct by an individual is used as a basis for employment decisions affecting the individual, 68 | * such conduct has the purpose or effect of unreasonably interfering with an individual's work performance or creating an intimidating hostile or offensive working environment. 69 | 70 | #### Workplace Bullying 71 | is a tendency of individuals or groups to use persistent aggressive or unreasonable behavior (e.g. verbal or written abuse, offensive conduct or any interference which undermines or impedes work) against a co-worker or any professional relations. 72 | 73 | #### Work Environment 74 | is the set of all available means of collaboration, including, but not limited to messages to mailing lists, private correspondence, Web pages, chat channels, phone and video teleconferences, and any kind of face-to-face meetings or discussions. 75 | 76 | ## Incident Procedure 77 | 78 | To report incidents or to appeal reports of incidents, contact the Project maintainers. Please include any available relevant information, including links to any publicly accessible material relating to the matter. Every effort will be taken to ensure a safe and collegial environment in which to collaborate on matters relating to the Project. In order to protect the community, the Project reserves the right to take appropriate action, potentially including the removal of an individual from any and all participation in the project. The Project will work towards an equitable resolution in the event of a misunderstanding. 
79 | 80 | ## Credits 81 | 82 | This code is based on the [Hyperledger Project's CoC](https://github.com/hyperledger/hyperledger/wiki/Hyperledger-Project-Code-of-Conduct), [W3C’s Code of Ethics and Professional Conduct](https://www.w3.org/Consortium/cepc) with some additions from the [Cloud Foundry](https://www.cloudfoundry.org/)‘s Code of Conduct. 83 | -------------------------------------------------------------------------------- /examples/02-constraints-from-scratch.md: -------------------------------------------------------------------------------- 1 | # Training policy and constraints from scratch 2 | 3 | We consider an environment in which an agent navigates through random obstacles to reach a target by position commands. 4 | The starting position of the agent, the position of the target, the obstacle positions and sizes are randomized at each episode. 5 | 6 | Visualize the environment with a random policy: 7 | ``` 8 | python3 -m ceres.scripts.play_policy --env_id Nav2dPosRandomHolesCeres-v0 \ 9 | --max_episodes 1000 --render 10 | ``` 11 | 12 | 13 | Arguments: 14 | * ```--env_id Nav2dPosRandomHolesCeres-v0```: environment name (can load environments from modules other than ```ceres``` with the extended argument format ```:```) 15 | * ```--max_episodes 1000```: play a random policy for 1000 episodes 16 | * ```--render```: (optional) render to screen 17 | 18 | ## Baseline PPO 19 | 20 | Within CERES, disabling constraints and setting the number of recovery policies to zero amounts to training with PPO: 21 | ``` 22 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 23 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 24 | --max_iter 1000 --output random_pos_ppo_full 25 | ``` 26 | Arguments: 27 | * ```--only_train_policy```: only train policy, not constraints 28 | * ```--constant_constraint_activation 0.```: set the constraint activation probability to zero throughout training 29 | * ```--n_recovery 0```: do not train recovery agents 30 | * ```--max_iter 1000```: do reinforcement learning for 1000 iterations 31 | * ```--output random_pos_ppo_full```: save logs in ```logs/random_pos_ppo_full```. If the directory already exists, remove it manually or run the script with ```--overwrite``` 32 | * Optionally, run with ```--render``` to visualize exploration and constraints. 33 | 34 | Optionally, we can train the control policy using only a selection of available observations: 35 | ``` 36 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 37 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 38 | --policy_observation_filter 0:1:2:3 --max_iter 1000 --output random_pos_ppo_partial 39 | ``` 40 | Argument: 41 | * ```--policy_observation_filter 0:1:2:3```: only provide the policy with state elements 0, 1, 2 and 3 (agent and target 2D locations), not 4 and up (distances to surrounding obstacles) 42 | 43 | ## Learning constraints through exploration and recovery 44 | 45 | Train direct and recovery policies with CERES: 46 | ``` 47 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 48 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. 
--cnet_loss l2:1e-6 \ 49 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 50 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 51 | --max_iter 1000 --output random_pos_ceres_full 52 | ``` 53 | 54 | Arguments: 55 | * ```--cnet_n_ineq 4```: 4 inequality constraints 56 | * ```--cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6```: loss weights for positive violation max, negative satisfaction min, L2 regularization 57 | * ```--cnet_spherical_coordinates```: predict unit-norm constraints using spherical coordinates (alternatively, use ```--cnet_normalize_ineq_mat``` for post-normalization) 58 | * ```--cnet_predict_interior_point```: predict constraints such that there exists an interior point that satisfies them all 59 | * ```--unconstrained_recovery```: only apply constraints to the direct agent, not recovery 60 | * ```--adaptive_constraint_activation prior_min```: adjust the constraint activation probability based on their accuracy at this iteration before training 61 | * ```--interrupt_constraint_training prior_accuracy:0.95:5:0.90:1```: stop training constraints if their accuracy before training (prior_accuracy) exceeds 95% (0.95) for at least 5 iterations (5). Re-enable training if constraint accuracy falls below 90% (0.90) for at least 1 iteration (1) 62 | * If omitted, the ```--n_recovery``` argument is set to be equal to the number of direct agents ```--n_direct``` (1 by default) 63 | 64 | Similarly to the baseline PPO, we can train the policy from agent and target positions only and let the constraint network deal with obstacle avoidance: 65 | ``` 66 | python3 -m ceres.scripts.train_ceres --env_id Nav2dPosRandomHolesCeres-v0 \ 67 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 68 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 69 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 70 | --policy_observation_filter 0:1:2:3 --max_iter 1000 --output random_pos_ceres_partial 71 | ``` 72 | 73 | ## Compare rewards with and without constrained exploration 74 | 75 | Plot the rewards during training: 76 | ``` 77 | python3 -m ceres.scripts.plot_rewards \ 78 | --plot_path "PPO full state=logs/random_pos_ppo_full/worker_0_direct" \ 79 | --plot_path "PPO partial state=logs/random_pos_ppo_partial/worker_0_direct" \ 80 | --plot_path "CERES full state=logs/random_pos_ceres_full/worker_0_direct" \ 81 | --plot_path "CERES partial state=logs/random_pos_ceres_partial/worker_0_direct" 82 | ``` 83 | 84 | 85 | 86 | ## Random obstacles with force control 87 | 88 | We can apply the same method to the case where the agent is controlled with force commands. 89 | 90 | Visualize the environment with a random policy: 91 | ``` 92 | python3 -m ceres.scripts.play_policy --env_id Nav2dForceRandomHolesCeres-v0 \ 93 | --max_episodes 1000 --render 94 | ``` 95 | 96 | 97 | Baseline PPO from full state: 98 | ``` 99 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 100 | --only_train_policy --constant_constraint_activation 0. --n_recovery 0 \ 101 | --max_iter 1000 --output random_force_ppo_full 102 | ``` 103 | 104 | Baseline PPO from filtered state (0 to 5: agent position and velocity, target position): 105 | ``` 106 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 107 | --only_train_policy --constant_constraint_activation 0. 
--n_recovery 0 \ 108 | --policy_observation_filter 0:1:2:3:4:5 --max_iter 1000 --output random_force_ppo_partial 109 | ``` 110 | 111 | CERES from full state: 112 | ``` 113 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 114 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 115 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 116 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 117 | --max_iter 1000 --output random_force_ceres_full 118 | ``` 119 | 120 | CERES from filtered state: 121 | ``` 122 | python3 -m ceres.scripts.train_ceres --env_id Nav2dForceRandomHolesCeres-v0 \ 123 | --cnet_n_ineq 4 --cnet_loss pvm:1. --cnet_loss nsm:1. --cnet_loss l2:1e-6 \ 124 | --cnet_spherical_coordinates --cnet_predict_interior_point --unconstrained_recovery \ 125 | --adaptive_constraint_activation prior_min --interrupt_constraint_training prior_accuracy:0.95:5:0.90:1 \ 126 | --policy_observation_filter 0:1:2:3:4:5 --max_iter 1000 --output random_force_ceres_partial 127 | ``` 128 | 129 | Plot the rewards during training: 130 | ``` 131 | python3 -m ceres.scripts.plot_rewards \ 132 | --plot_path "PPO full state=logs/random_force_ppo_full/worker_0_direct" \ 133 | --plot_path "PPO partial state=logs/random_force_ppo_partial/worker_0_direct" \ 134 | --plot_path "CERES full state=logs/random_force_ceres_full/worker_0_direct" \ 135 | --plot_path "CERES partial state=logs/random_force_ceres_partial/worker_0_direct" 136 | ``` 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /ceres/tools/plot/plot_logs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
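# A minimal sketch of the observation filtering behind the --policy_observation_filter option
# used in the examples above, mirroring build_policy_observation_filter() in
# ceres/baselines/ceres/pposgd_ceres_helper.py (the observation values below are illustrative):
#
#     import numpy as np
#
#     spec = '0:1:2:3'  # keep only the agent and target 2D positions
#     indices = [int(v) for v in spec.split(':')]
#     observation_filter = lambda ob: np.array([ob[i] for i in indices], dtype=ob.dtype)
#     ob_full = np.array([0.1, -0.2, 0.7, 0.5, 0.9, 1.3])  # full state, incl. obstacle distances
#     ob_policy = observation_filter(ob_full)              # array([ 0.1, -0.2,  0.7,  0.5])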
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import os 8 | 9 | class PlotLogs(object): 10 | ''' 11 | Base class for plotting reward plots, provided input logs 12 | ''' 13 | 14 | default_seed_value = 'N/A' 15 | plot_as_rows = True 16 | 17 | def __init__(self, plot_config): 18 | self.plot_config = plot_config 19 | self.plots = [] 20 | self.n_plots = 0 21 | super().__init__() 22 | 23 | def add_plot(self, title='', paths='', label='', color='k', skip_error=False): 24 | if not type(paths) == list: 25 | assert type(paths) == str 26 | paths = [paths] 27 | for i_path, path in enumerate(paths): 28 | assert os.path.exists(path), 'Experiment path does not exist: {0}'.format(path) 29 | if path[-1] == '/': 30 | paths[i_path] = path[:-1] 31 | try: 32 | path_sessions, suffix_sessions = self.load_paths(paths) 33 | plot_info = { 34 | 'title': title, 35 | 'label': label, 36 | 'color': color, 37 | 'path_sessions': path_sessions, 38 | 'suffix_sessions': suffix_sessions, 39 | } 40 | self.plots.append(plot_info) 41 | print('Found logs for {0}: {1}'.format(' '.join(title.split('\n')), plot_info['path_sessions'])) 42 | success = True 43 | self.n_plots += 1 44 | except Exception as e: 45 | print('Could not find logs for {0}'.format(' '.join(title.split('\n')))) 46 | success = False 47 | if skip_error: 48 | print('Skip') 49 | else: 50 | raise(e) 51 | return success 52 | 53 | 54 | def load_paths(self, paths=None): 55 | raise NotImplementedError('Implement this into your child class') 56 | 57 | def calc_plots(self): 58 | raise NotImplementedError('Implement this in your own child class') 59 | 60 | def plot(self, show=True): 61 | self.calc_plots() 62 | plt.figure() 63 | n_cols = self.n_plots 64 | max_elements = min([len(plot_info['t']) for plot_info in self.plots]) 65 | reward_min = min([min(plot_info['rewards']) for plot_info in self.plots]) 66 | reward_max = max([max(plot_info['rewards']) for plot_info in self.plots]) 67 | y_lim = [reward_min - 0.10*(reward_max - reward_min), 68 | reward_max + 0.10*(reward_max - reward_min)] 69 | y_lim = np.array(y_lim) 70 | x_vec_average_list = [] 71 | y_vec_average_list = [] 72 | for i_plot, plot_info in enumerate(self.plots): 73 | t_averaged = plot_info['t'] 74 | rew_averaged = plot_info['rewards'] 75 | if self.plot_as_rows: 76 | ax = plt.subplot(n_cols, 1, i_plot+1) 77 | else: 78 | ax = plt.subplot(1, n_cols, i_plot+1) 79 | x_vec = np.array(t_averaged)/self.plot_config.timesteps_per_iteration 80 | label_x = self.plot_config.label_x_iterations 81 | y_vec = np.array(rew_averaged) 82 | # Individual rewards 83 | x_plot = x_vec 84 | y_plot = y_vec 85 | plt.plot(x_plot, y_plot, self.plot_config.color_rewards_ind, alpha=0.5, label='Episode reward') 86 | # Compute moving average 87 | x_vec_select = x_vec 88 | y_vec_average = self.calc_moving_average(y_vec, n=self.plot_config.n_average) 89 | x_vec_average_list.append(x_vec_select) 90 | y_vec_average_list.append(y_vec_average) 91 | # Standard deviation 92 | y_vec_std = self.calc_moving_std(y_vec, n=self.plot_config.n_average) 93 | y_vec_average_minus_std = y_vec_average - y_vec_std 94 | y_vec_average_plus_std = y_vec_average + y_vec_std 95 | x_plot = x_vec_select 96 | y_plot_minus = y_vec_average_minus_std 97 | y_plot_plus = y_vec_average_plus_std 98 | plt.fill_between(x_plot, y_plot_minus, y_plot_plus, facecolor=self.plot_config.color_rewards_std, alpha=1., 
label='Standard deviation') 99 | # Average rewards 100 | x_plot = x_vec_select 101 | y_plot = y_vec_average 102 | plt.plot(x_plot, y_plot, self.plot_config.color_rewards_avg, alpha=1., label='Average reward') 103 | if self.plot_as_rows: 104 | plt.ylabel(self.plot_config.label_y) 105 | if i_plot == len(self.plots)-1: 106 | plt.xlabel(label_x) 107 | else: 108 | pass 109 | else: 110 | plt.xlabel(label_x) 111 | if i_plot == 0: 112 | plt.ylabel(self.plot_config.label_y) 113 | else: 114 | pass 115 | title_loc = plot_info['title'] 116 | plt.title(title_loc) 117 | plt.ylim(y_lim) 118 | # Legends 119 | bottom_legend_artists = [] 120 | bottom_legend_labels = [] 121 | # Individual rewards 122 | reward_ind_artist = plt.Line2D((0, 1), (0, 0), alpha=0.5, color=self.plot_config.color_rewards_ind) 123 | bottom_legend_artists.append(reward_ind_artist) 124 | bottom_legend_labels.append('Episode reward') 125 | # Average rewards 126 | reward_avg_artist = plt.Line2D((0, 1), (0, 0), color=self.plot_config.color_rewards_avg) 127 | bottom_legend_artists.append(reward_avg_artist) 128 | bottom_legend_labels.append('Reward average') 129 | # Standard deviations 130 | reward_std_artist = plt.Line2D((0, 1), (0, 0), color=self.plot_config.color_rewards_std) 131 | bottom_legend_artists.append(reward_std_artist) 132 | bottom_legend_labels.append('Reward std. dev.') 133 | ax.legend(bottom_legend_artists, 134 | bottom_legend_labels, 135 | loc='lower center', 136 | fancybox=True, 137 | ncol=3) 138 | 139 | # Plot average rewards in the same graph 140 | plt.figure() 141 | for i_plot, plot_info in enumerate(self.plots): 142 | x_vec = x_vec_average_list[i_plot] 143 | y_vec = y_vec_average_list[i_plot] 144 | plt.plot(x_vec, y_vec, label=plot_info['label'], color=plot_info['color']) 145 | plt.xlabel(label_x) 146 | plt.ylabel(self.plot_config.label_y) 147 | plt.title('Average rewards') 148 | plt.legend() 149 | 150 | if show: 151 | plt.show() 152 | 153 | def calc_moving_average(self, a, n=3, fill=True) : 154 | assert n % 2 == 1, 'Number of samples to average must be odd' 155 | assert len(a) >= n, 'Not enough samples to average: {0} vs {1}'.format(len(a), n) 156 | if fill: 157 | assert len(a) > 1.5*n, 'Not enough samples to fill' 158 | if n == 1: 159 | return a 160 | ret = np.cumsum(a, dtype=float) 161 | ret[n:] = ret[n:] - ret[:-n] 162 | res = ret[n - 1:] / n 163 | if fill: 164 | add_el = len(a) - len(res) 165 | add_left = int((n-1)/2) 166 | add_right = add_left 167 | el_left = [] 168 | for _i in range(add_left): 169 | subvec = a[_i:_i + n] 170 | assert len(subvec) > 0 171 | el_left.append(np.mean(subvec)) 172 | el_right = [] 173 | for _i in range(len(a)-add_right, len(a)): 174 | subvec = a[-n+_i:_i] 175 | assert len(subvec) > 0 176 | el_right.append(np.mean(subvec)) 177 | res = el_left + list(res) + el_right 178 | res = np.array(res) 179 | assert len(res) == len(a) 180 | return res 181 | 182 | 183 | def calc_moving_std(self, a, n=3, fill=True) : 184 | assert n % 2 == 1, 'Number of samples to average must be odd' 185 | n_total = len(a) 186 | res = np.zeros(n_total) 187 | if n == 1: 188 | return res 189 | n_side = int((n-1)/2) 190 | assert n_side > 0 191 | for i_center in range(n_total): 192 | i_left = max(0, i_center - n_side) 193 | i_right = min(n_total-1, i_center + n_side) 194 | res[i_center] = np.std(a[i_left:(i_right+1)]) 195 | return res 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /ceres/tools/io/extra_args.py: 
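# A small numeric illustration (assumed input values) of the cumulative-sum trick used by
# PlotLogs.calc_moving_average() above to compute a centered moving average in linear time:
#
#     import numpy as np
#
#     a, n = np.array([1., 2., 3., 4., 5.]), 3
#     ret = np.cumsum(a)            # [ 1.,  3.,  6., 10., 15.]
#     ret[n:] = ret[n:] - ret[:-n]  # [ 1.,  3.,  6.,  9., 12.]
#     core = ret[n - 1:] / n        # [2., 3., 4.], the means of [1,2,3], [2,3,4], [3,4,5]
#     # calc_moving_average(fill=True) then pads both ends so the output keeps len(a) samples.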
-------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import os 6 | import datetime 7 | import argparse 8 | import shutil 9 | import numpy as np 10 | 11 | class ExtraArgs(object): 12 | ''' 13 | A simple class to parse and check command-line arguments for CERES 14 | ''' 15 | 16 | def __init__(self, log_root='/tmp', args=None, 17 | **kwargs): 18 | if args is not None: 19 | self.args = args 20 | else: 21 | self.args = self.parse_args() 22 | for _k, _v in kwargs.items(): 23 | assert hasattr(self.args, _k) 24 | setattr(self.args, _k, _v) 25 | self.check_args() 26 | self.import_env_module(module_id=self.args.module_id) 27 | self.timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 28 | self.log_root = log_root 29 | self.path_xp = self.build_path() 30 | 31 | def __getattr__(self, _k): 32 | return getattr(self.args, _k) 33 | 34 | def build_path(self): 35 | if len(self.args.output) > 0: 36 | xp_dirname = self.args.output 37 | else: 38 | xp_dirname = self.timestamp 39 | path_xp = os.path.join(self.log_root, xp_dirname) 40 | if os.path.exists(path_xp): 41 | exists_str = 'Log path already exists: {0}'.format(path_xp) 42 | if self.args.overwrite: 43 | path_move = path_xp + '_moved_{0}'.format(self.timestamp) 44 | exists_str += '\n Moved existing logs to: {0}'.format(path_move) 45 | else: 46 | exists_str += '\n Remove dir manually or run with --overwrite' 47 | raise ValueError(exists_str) 48 | shutil.move(path_xp, path_move) 49 | print(exists_str) 50 | return path_xp 51 | 52 | @staticmethod 53 | def parse_env_module(env_id): 54 | if ':' in env_id: 55 | module_id, env_id = env_id.split(':') 56 | else: 57 | module_id = '' 58 | return env_id, module_id 59 | 60 | def check_args(self): 61 | self.args.env_id, self.args.module_id = self.parse_env_module(self.args.env_id) 62 | self.args.cnet_hidden_layers = list(map(int, self.args.cnet_hidden_layers.split(','))) 63 | 64 | self.args.cnet_loss_weights = {} 65 | for loss_weight_arg_str in self.args.cnet_loss: 66 | try: 67 | loss_name, loss_weight = loss_weight_arg_str.split(':') 68 | loss_weight = float(loss_weight) 69 | except: 70 | raise ValueError('Invalid --loss_weight argument {0}, excepted format :'.format(loss_weight_arg_str)) 71 | self.args.cnet_loss_weights[loss_name] = loss_weight 72 | assert not (self.args.cnet_spherical_coordinates and self.args.cnet_normalize_ineq_mat), 'Cannot have simultaneously --cnet_spherical_coordinates and --cnet_normalize_ineq_mat' 73 | assert self.args.n_direct > 0, 'Set at least one direct agent' 74 | if self.args.n_recovery is None: # By default, set equal number of direct and recovery agents 75 | self.args.n_recovery = self.args.n_direct 76 | 77 | assert (self.args.constant_constraint_activation is None) or (len(self.args.adaptive_constraint_activation) == 0), 'Cannot set both constant and adaptive constraint activation probability' 78 | # Plot 79 | if (self.args.plot_average % 2) == 0: self.args.plot_average += 1 # make it odd 80 | 81 | @staticmethod 82 | def import_env_module(env_id=None, module_id=None): 83 | if module_id is None: 84 | assert env_id is not None 85 | env_id, module_id = ExtraArgs.parse_env_module(env_id) 86 | if len(module_id) > 0: 87 | import importlib 88 | print('Import module {0}'.format(module_id)) 89 | importlib.import_module(module_id) 90 | 91 | def 
parse_args(self): 92 | parser = argparse.ArgumentParser() 93 | args, unprocessed_args = parser.parse_known_args() 94 | 95 | # Base reinforcement learning parameters 96 | parser.add_argument('-e', '--env_id', default='', help='Environment name') 97 | parser.add_argument('--timesteps_per_actorbatch', default=1024, type=int, help='timesteps per actor per training batch') 98 | parser.add_argument('--max_iterations', default=0, type=int, help='maximum total number of training iterations') 99 | parser.add_argument('--max_episodes', default=0, type=int, help='maximum total number of episodes') 100 | parser.add_argument('--max_timesteps', default=0, type=int, help='maximum total number of timesteps') 101 | parser.add_argument('--max_seconds', default=0, type=int, help='maximum training time in seconds') 102 | parser.add_argument('--seed', default=0, type=int, help='random seed') 103 | parser.add_argument('--backup_frequency', default=1, type=int, help='save every n iterations') 104 | parser.add_argument('--backup_keep', default=1, type=int, help='number of backups to keep, set to 0 to keep all') 105 | parser.add_argument('--min_time_between_backups', default=60., type=float, help='minimum time in seconds between model backups') 106 | parser.add_argument('--continue_ceres_training', default='', help='root directory for CERES logs') 107 | parser.add_argument('--output', default='', help='output log dir') 108 | parser.add_argument('--render', action='store_true', help='render') 109 | parser.add_argument('--save_render', default='', help='directory to save render') 110 | parser.add_argument('--policy_hidden_size', type=int, default=64) 111 | parser.add_argument('--policy_hidden_layers', type=int, default=2) 112 | parser.add_argument('--policy_entcoeff', help='entropy coefficiency of policy', type=float, default=0.) 
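        # A minimal sketch of how the repeated --cnet_loss options declared below are converted
        # into a weight dictionary by check_args() above; the example values are the ones used
        # in examples/02-constraints-from-scratch.md:
        #
        #     cnet_loss = ['pvm:1.', 'nsm:1.', 'l2:1e-6']  # one entry per --cnet_loss flag
        #     cnet_loss_weights = {}
        #     for spec in cnet_loss:
        #         loss_name, loss_weight = spec.split(':')
        #         cnet_loss_weights[loss_name] = float(loss_weight)
        #     # cnet_loss_weights == {'pvm': 1.0, 'nsm': 1.0, 'l2': 1e-06}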
113 | parser.add_argument('--policy_learning_rate_schedule', default='linear', choices=['constant', 'linear'], help='policy learning rate schedule',) 114 | 115 | # Constraint network architecture 116 | parser.add_argument('--cnet_n_ineq', default=2, type=int, help='Number of inequality constraints') 117 | parser.add_argument('--cnet_batch_size', default=64, type=int, help='Batch size') 118 | parser.add_argument('--cnet_hidden_layers', default='64,64', help='Comma-separated constraint network hidden layer sizes, e.g. 64,64') 119 | parser.add_argument('--cnet_spherical_coordinates', action='store_true', help='Inequality matrix is first predicted as (n-1)-dim spherical coordinates') 120 | parser.add_argument('--cnet_normalize_ineq_mat', action='store_true', help='Normalize each row of the inequality matrices') 121 | parser.add_argument('--cnet_predict_interior_point', action='store_true', help='Predict one point satisfying all constraints') 122 | parser.add_argument('--cnet_interior_point_margin_min', default=0.1, type=float, help='minimum distance to interior point') 123 | parser.add_argument('--cnet_interior_point_margin_max', default=1., type=float, help='maximum distance to interior point') 124 | parser.add_argument('--cnet_interior_point_max', default=1., type=float, help='maximum value for the interior point per action component') 125 | parser.add_argument('--cnet_loss', default=[], action='append', help='loss weights') 126 | 127 | # CERES 128 | parser.add_argument('-n', '--n_direct', default=1, type=int, help='number of agents for direct reinforcement learning') 129 | parser.add_argument('--n_recovery', default=None, type=int, help='number of agents learning recovery') 130 | parser.add_argument('--constant_constraint_activation', default=None, type=float, help='constant constraint activation probability') 131 | parser.add_argument('--adaptive_constraint_activation', type=str, choices=['average', 'positive', 'negative', 'prior_average', 'prior_positive', 'prior_negative', 'prior_min'], default='', help='which constraint accuracy to use, if not empty') 132 | parser.add_argument('--interrupt_constraint_training', default='', help='condition for stopping CNet training') 133 | parser.add_argument('--policy_observation_filter', default='', help='use only these state elements') 134 | parser.add_argument('--only_train_constraints', action='store_true', help='only train constraint network in CERES') 135 | parser.add_argument('--only_train_policy', action='store_true', help='only train policy in CERES') 136 | parser.add_argument('--constraint_demonstration_buffer', default='', help='path to constraint demonstration buffer to restore') 137 | parser.add_argument('--constraint_demonstration_buffer_size', default=2048, type=int, help='Constraint demonstration buffer size') 138 | parser.add_argument('--cnet_decay_epochs', default=0, type=int, help='decay CNet learning rate every N epochs without improvement') 139 | parser.add_argument('--cnet_decay_max', default=0.01, type=float, help='Keep learning rate >= this') 140 | parser.add_argument('--early_stop_positive', default=1.0, type=float) 141 | parser.add_argument('--early_stop_negative', default=1.0, type=float) 142 | parser.add_argument('--conservative_exploration', default=0.09, type=float, help='margin subtracted from the inequality vector for conservative exploration') 143 | parser.add_argument('--max_recovery_attempts', default=10, type=int, help='number of recovery attempts per reference trajectory') 144 | parser.add_argument('--unconstrained_recovery', action='store_true', help='do not constrain recovery agent') 145 |
parser.add_argument('--cnet_training_epochs', default=10, type=int, help='Number of training epochs') 146 | parser.add_argument('--cnet_training_batches', default=0, type=int, help='If > 0, maximum number of batches per epoch') 147 | parser.add_argument('--cnet_learning_rate', default=1.e-3, type=float) 148 | parser.add_argument('--cnet_improvement_metric', type=str, default='total_loss', choices=['mean_accuracy', 'min_accuracy', 'total_loss', 'mean_loss', 'max_loss'], help='Improvement metric for LR annealing') 149 | 150 | # Write, restore and replay 151 | parser.add_argument('--play_step_duration', default=0., type=float, help='wait duration in seconds when replaying baselines') 152 | parser.add_argument('--trained_policy', default='', help='load policy model backup') 153 | parser.add_argument('--trained_cnet', default='', help='Path to constraint network configuration') 154 | parser.add_argument('--overwrite', action='store_true', help='automatically moves log dir if it already exists') 155 | 156 | # Plot 157 | parser.add_argument('--plot_average', default=401, type=int, help='Moving average over N episodes') 158 | parser.add_argument('--plot_path', default=[], action='append', help='Path to logs') 159 | 160 | args = parser.parse_args(unprocessed_args) 161 | return args 162 | -------------------------------------------------------------------------------- /ceres/baselines/common/plot_logs_baselines.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import os 8 | import ast 9 | from ceres.tools.plot import PlotConfig, PlotLogs 10 | 11 | class PlotLogsBaselines(PlotLogs): 12 | ''' 13 | Load (multiple) baselines logs and plot rewards together with useful statistics. 14 | Supports logs distributed across multiple processes and seeds.
15 | ''' 16 | 17 | key_ep_reward = 'r' 18 | key_ep_length = 'l' 19 | key_ep_time = 't' 20 | keys_ep = [key_ep_time, key_ep_length, key_ep_reward] 21 | 22 | def load_paths(self, paths=None): 23 | ''' 24 | Find log paths by looking for directories of the form "worker_*" 25 | ''' 26 | assert type(paths) == list 27 | suffix_sessions = {} 28 | target_dirname_0 = {} 29 | target_worker_dir_default = 'worker_0' 30 | for _i, _d in enumerate(paths): 31 | _d_basename = os.path.basename(_d) 32 | if 'worker' in _d_basename: 33 | target_worker_dir = '_'.join(_d_basename.split('_')[:2]) 34 | else: 35 | target_worker_dir = target_worker_dir_default 36 | if _d_basename[:len(target_worker_dir)] == target_worker_dir: 37 | target_dirname_0[_i] = _d_basename 38 | suffix_sessions[_i] = _d_basename[len(target_worker_dir):] 39 | else: 40 | target_dirname_0[_i] = target_worker_dir 41 | suffix_sessions[_i] = '' 42 | 43 | path_sessions = {} 44 | for i_path, path_loc in enumerate(paths): 45 | path_sessions[i_path] = path_loc 46 | # For each directory in path_sessions, look for most recent worker_0 directory 47 | for _i, _d in path_sessions.items(): 48 | target_dirname = target_dirname_0[_i] 49 | if os.path.basename(_d) == target_dirname: 50 | continue 51 | subdirs = [subdir for subdir in os.listdir(_d) if os.path.isdir(os.path.join(_d, subdir))] 52 | assert len(subdirs) > 0, 'Could not find any subdirectory in {0}'.format(_d) 53 | if target_dirname in subdirs: 54 | path_sessions[_i] = os.path.join(_d, target_dirname) 55 | else: # Take the most recent directory 56 | subdirs.sort() 57 | found_target_dir = False 58 | for subdir in reversed(subdirs): 59 | path_xp = os.path.join(_d, subdir, target_dirname) 60 | if os.path.exists(path_xp): 61 | path_sessions[_i] = path_xp 62 | found_target_dir = True 63 | break 64 | assert found_target_dir, 'Could not find {0} directory in {1}'.format(target_dirname, path_xp) 65 | # Finally, return folder that contains worker_0 66 | for _i, _d in path_sessions.items(): 67 | assert os.path.basename(_d) == target_dirname_0[_i] 68 | path_sessions[_i] = os.path.join(_d, os.pardir) 69 | return path_sessions, suffix_sessions 70 | 71 | def calc_plots(self): 72 | ''' 73 | Build plots from session paths 74 | ''' 75 | for i_plot, plot_info in enumerate(self.plots): 76 | path_sessions = plot_info['path_sessions'] 77 | suffix_sessions = plot_info['suffix_sessions'] 78 | plot_info['t'], plot_info['rewards'] = self.calc_plot_sessions(path_sessions, suffix_sessions) 79 | 80 | def calc_plot_sessions(self, path_sessions, suffix_sessions): 81 | ''' 82 | Calculate reward plots across multiple sessions (e.g., seeds) 83 | ''' 84 | t_sessions = [] 85 | rewards_sessions = [] 86 | lengths_sessions = [] 87 | for i_session, (seed, path_session) in enumerate(path_sessions.items()): 88 | t_session, rewards_session, lengths_session = self.calc_plot_workers(path_session, suffix_session=suffix_sessions[seed]) 89 | t_sessions.append(t_session) 90 | rewards_sessions.append(rewards_session) 91 | lengths_sessions.append(lengths_session) 92 | if i_session == 0: 93 | len_min = len(t_session) 94 | else: 95 | len_min = min(len_min, len(t_session)) 96 | assert len_min > 0 97 | n_steps_sessions = [np.cumsum(x) for x in lengths_sessions] 98 | t_averaged = list(n_steps_sessions[0]) 99 | rewards_averaged = list(rewards_sessions[0]) 100 | for t_worker, rewards_worker in zip(n_steps_sessions[1:], rewards_sessions[1:]): 101 | t_averaged, [rewards_averaged] = self.combine_logs_xy(t_averaged, [rewards_averaged], t_worker, 
[rewards_worker]) 102 | return t_averaged, rewards_averaged 103 | 104 | def parse_worker_monitor_csv(self, path_monitor, n_workers=1): 105 | ''' 106 | Parse baselines monitor files 107 | ''' 108 | with open(path_monitor, 'r') as f: 109 | lines = f.read().splitlines() 110 | header = lines[0] 111 | labels = lines[1] 112 | labels = labels.split(',') 113 | i_key = {_k: labels.index(_k) for _k in self.keys_ep} 114 | lines = lines[2:] 115 | res = {_k: [] for _k in self.keys_ep} 116 | n_ep = len(lines) 117 | for i_episode in range(n_ep): 118 | line = lines[i_episode] 119 | ep_info = line.split(',') 120 | res[self.key_ep_reward].append(float(ep_info[i_key[self.key_ep_reward]])) 121 | res[self.key_ep_time].append(float(ep_info[i_key[self.key_ep_time]])) 122 | res[self.key_ep_length].append(int(ep_info[i_key[self.key_ep_length]])) 123 | continue_index = 0 124 | continue_path = path_monitor + '.continue{0}'.format(continue_index) 125 | while os.path.exists(continue_path): 126 | raise NotImplementedError() 127 | t = res[self.key_ep_time] 128 | rewards = res[self.key_ep_reward] 129 | lengths = res[self.key_ep_length] 130 | return t, rewards, lengths 131 | 132 | def combine_logs_xy(self, x1, y1_list, x2, y2_list): 133 | ''' 134 | Combine (x1, y1) with (x2, y2) sorting on increasing elements of (x1, x2) 135 | ''' 136 | n_x1 = len(x1) 137 | n_x2 = len(x2) 138 | x = np.zeros(n_x1 + n_x2) 139 | y_list = [np.zeros(n_x1 + n_x2) for _ in y1_list] 140 | i_x1 = 0 141 | i_x2 = 0 142 | i_x = 0 143 | while (i_x1 < n_x1) and (i_x2 < n_x2): 144 | if x1[i_x1] < x2[i_x2]: 145 | x[i_x] = x1[i_x1] 146 | for (y, y1) in zip(y_list, y1_list): 147 | y[i_x] = y1[i_x1] 148 | i_x1 += 1 149 | else: 150 | x[i_x] = x2[i_x2] 151 | for (y, y2) in zip(y_list, y2_list): 152 | y[i_x] = y2[i_x2] 153 | i_x2 += 1 154 | i_x += 1 155 | for _i, _x in enumerate(x1[i_x1:]): # append the remaining tail of (x1, y1_list) 156 | x[i_x + _i] = _x 157 | for (y, y1) in zip(y_list, y1_list): 158 | y[i_x + _i] = y1[i_x1 + _i] 159 | for _i, _x in enumerate(x2[i_x2:]): # append the remaining tail of (x2, y2_list) 160 | x[i_x + _i] = _x 161 | for (y, y2) in zip(y_list, y2_list): 162 | y[i_x + _i] = y2[i_x2 + _i] 163 | return x, y_list 164 | 165 | def combine_logs_xyz(self, x1, y1, z1, x2, y2, z2): 166 | ''' 167 | Combine (x1, y1, z1) with (x2, y2, z2) sorting on increasing elements of (x1, x2) 168 | ''' 169 | x, [y, z] = self.combine_logs_xy(x1, [y1, z1], x2, [y2, z2]) 170 | return x, y, z 171 | 172 | def combine_workers(self, t_workers, rewards_workers, lengths_workers): 173 | ''' 174 | Combine individual worker reward sequences into a single reward sequence 175 | ''' 176 | t = t_workers[0] 177 | rewards = rewards_workers[0] 178 | lengths = lengths_workers[0] 179 | for t_worker, rewards_worker, lengths_worker in zip(t_workers[1:], rewards_workers[1:], lengths_workers[1:]): 180 | t, rewards, lengths = self.combine_logs_xyz(t, rewards, lengths, t_worker, rewards_worker, lengths_worker) 181 | return t, rewards, lengths 182 | 183 | def calc_plot_workers(self, path_session, suffix_session=''): 184 | ''' 185 | Parse logs across workers and build a reward sequence 186 | ''' 187 | # Load rewards across workers 188 | path_worker_logs = [] 189 | print('Processing session {0}'.format(path_session)) 190 | for _d in os.listdir(path_session): 191 | if 'worker' in _d: 192 | worker_str_suffix = _d[-len(suffix_session):] 193 | if len(suffix_session) > 0: 194 | if worker_str_suffix != suffix_session: 195 | print(' (ignore path {0}: does not contain suffix {1})'.format(_d, suffix_session)) 196 | continue 197 | worker_monitor_dir =
os.path.join(path_session, _d) 198 | worker_monitor_path = os.path.join(worker_monitor_dir, 'monitor.csv') 199 | assert os.path.isfile(worker_monitor_path), 'Could not find logs at path: {0}'.format(worker_monitor_path) 200 | path_worker_logs.append(worker_monitor_path) 201 | n_workers = len(path_worker_logs) 202 | t_workers = [] 203 | rewards_workers = [] 204 | lengths_workers = [] 205 | for path_monitor_json in path_worker_logs: 206 | print(' {0}'.format(path_monitor_json)) 207 | t_worker, rewards_worker, lengths_worker = self.parse_worker_monitor_csv(path_monitor_json, n_workers=n_workers) 208 | t_workers.append(t_worker) 209 | rewards_workers.append(rewards_worker) 210 | lengths_workers.append(lengths_worker) 211 | t, rewards, lengths = self.combine_workers(t_workers, rewards_workers, lengths_workers) 212 | return t, rewards, lengths 213 | 214 | 215 | def main(): 216 | from ceres.tools.io import ExtraArgs 217 | extra_args = ExtraArgs() 218 | plot_config = PlotConfig.from_extra_args(extra_args) 219 | plotter = PlotLogsBaselines(plot_config) 220 | 221 | assert len(extra_args.plot_path) > 0 222 | color_list = ['g', 'b', 'r', 'k'] 223 | path_info_dict = {} 224 | path_list = [] 225 | title_list = [] 226 | for i_path, path_info in enumerate(extra_args.plot_path): 227 | if '=' in path_info: 228 | title, path_loc = path_info.split('=') 229 | else: 230 | path_loc = path_info 231 | title = '{0}:{1}'.format(i_path, os.path.basename(path_loc)) 232 | if not (title in title_list): 233 | title_list.append(title) 234 | path_info_dict[title] = [] 235 | path_info_dict[title].append(path_loc) 236 | if len(color_list) < len(title_list): 237 | for _ in range(len(title_list) - len(color_list)): 238 | color_random = np.random.rand(3) 239 | color_list.append(color_random) 240 | for i_plot, title in enumerate(title_list): 241 | plotter.add_plot(title=title, 242 | paths=path_info_dict[title], 243 | label=title, 244 | color=color_list[i_plot], 245 | skip_error=False) 246 | 247 | plotter.plot(show=True) 248 | 249 | 250 | 251 | if __name__ == '__main__': 252 | main() 253 | -------------------------------------------------------------------------------- /ceres/constraints/constraint_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 
2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from .constraint_loss import ConstraintLoss 8 | from .constraint_config import ConstraintConfig 9 | from ceres.tools.math.spherical_coordinates import SphericalCoordinates 10 | 11 | class ConstraintNetwork(object): 12 | ''' 13 | Learn and predict state-dependent constraints on actions 14 | ''' 15 | 16 | def __init__(self, observation_space, action_space, config): 17 | self.observation_space = observation_space 18 | self.action_space = action_space 19 | self.action_space_pm = np.array([0.5*(high - low) for (low, high) in zip(self.action_space.low, self.action_space.high)]) 20 | self.action_space_mid = np.array([0.5*(high + low) for (low, high) in zip(self.action_space.low, self.action_space.high)]) 21 | self.n_obs = self.observation_space.shape[0] 22 | self.n_act = self.action_space.shape[0] 23 | self.config = config 24 | self.n_ineq = self.config.n_ineq 25 | self.init_default() 26 | 27 | # Constraints are of the form G x <= h, with G of size n_ineq x n_act and h of size n_ineq x 1 28 | self.init_prediction_size() 29 | 30 | # Build model 31 | self.observation = tf.placeholder(dtype=tf.float32, shape=(None, self.n_obs), name='observation') 32 | self.batch_size = tf.shape(self.observation)[0] 33 | self.batch_size_float = tf.cast(self.batch_size, dtype=tf.float32) 34 | self.ones_batch = tf.ones([self.batch_size], dtype=tf.float32) 35 | self.zeros_batch = tf.zeros([self.batch_size], dtype=tf.float32) 36 | 37 | self.input_layer = self.observation 38 | self.output_layer = self.build_model() 39 | self.init_regularization() 40 | 41 | # Transform output layer into constraint matrices G, h 42 | self.ineq_mat_params, self.ineq_vec_params = self.split_predictions(self.output_layer) 43 | self.ineq_mat, self.ineq_vec = self.build_ineq(self.ineq_mat_params, self.ineq_vec_params) 44 | 45 | ################################## 46 | # Individual actions with labels # 47 | ################################## 48 | 49 | # Reference action and action indicator for training 50 | self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.n_act), name='action') 51 | self.action_indicator = tf.placeholder(dtype=tf.float32, shape=(None), name='action_indicator') 52 | self.action_tensor = tf.expand_dims(self.action, -1) 53 | 54 | # Count positive and negative demonstrations 55 | self.is_positive = self.action_indicator 56 | self.is_negative = 1. 
- self.is_positive 57 | self.n_positive = tf.cast(tf.reduce_sum(self.is_positive), dtype=tf.int32) 58 | self.n_negative = tf.cast(tf.reduce_sum(self.is_negative), dtype=tf.int32) 59 | 60 | # Build constraint margins and scores 61 | self.ineq_diff, self.ineq_satisfaction_margin, self.ineq_violation_margin = self.build_ineq_margins(self.ineq_mat, self.ineq_vec, self.action_tensor) 62 | self.n_positive_satisfied, self.n_positive_violated, self.n_negative_satisfied, self.n_negative_violated = self.calc_constraint_score() 63 | 64 | # Construct training losses 65 | self.loss, self.losses = self.init_training_loss() 66 | 67 | def init_prediction_size(self): 68 | ''' 69 | Initialize the number of output variables, depending on: number of constraints; spherical coordinates; interior point prediction 70 | ''' 71 | if self.config.spherical_coordinates: 72 | assert self.n_act >= 2 73 | self.n_ineq_mat_params = self.n_ineq * (self.n_act - 1) 74 | else: 75 | self.n_ineq_mat_params = self.n_ineq * self.n_act 76 | self.n_ineq_vec_params = self.n_ineq 77 | if self.config.predict_interior_point: 78 | # Add an action x0 to the constraint predictions 79 | self.n_ineq_vec_params += self.n_act 80 | self.n_outputs = self.n_ineq_mat_params + self.n_ineq_vec_params 81 | 82 | def init_training_loss(self): 83 | ''' 84 | Defer the construction of constraint loss terms to a ConstraintLoss object 85 | ''' 86 | constraint_loss = ConstraintLoss(self) 87 | total_loss = constraint_loss.total_loss 88 | losses = constraint_loss.losses 89 | return total_loss, losses 90 | 91 | def calc_constraint_score(self): 92 | ''' 93 | Count the number of positive / negative demonstrations that satisfy / violate the constraint 94 | ''' 95 | positive_loss = tf.reduce_max(self.ineq_violation_margin, axis=1) 96 | positive_loss = tf.multiply(self.action_indicator, positive_loss) 97 | n_positive_violated = tf.where(positive_loss > 0., self.ones_batch, self.zeros_batch) 98 | n_positive_violated = tf.cast(tf.reduce_sum(n_positive_violated), dtype=tf.int32) 99 | n_positive_satisfied = self.n_positive - n_positive_violated 100 | negative_loss = tf.reduce_min(self.ineq_satisfaction_margin, axis=1) 101 | negative_loss = tf.multiply(self.is_negative, negative_loss) 102 | n_negative_satisfied = tf.where(negative_loss > 0., self.ones_batch, self.zeros_batch) 103 | n_negative_satisfied = tf.cast(tf.reduce_sum(n_negative_satisfied), dtype=tf.int32) 104 | n_negative_violated = self.n_negative - n_negative_satisfied 105 | return n_positive_satisfied, n_positive_violated, \ 106 | n_negative_satisfied, n_negative_violated 107 | 108 | def init_default(self): 109 | ''' 110 | Default initialization and nonlinearity parameters 111 | ''' 112 | self.initializer = tf.random_normal_initializer(mean=0., stddev=0.1) 113 | self.activation_common = tf.nn.relu 114 | 115 | def split_predictions(self, output_layer): 116 | ''' 117 | Split the output layer into inequality matrix and vector parameters 118 | ''' 119 | # First n_ineq * self.n_act are G, remaining n_ineq are h 120 | ineq_mat_params = tf.slice(output_layer, [0, 0], [-1, self.n_ineq_mat_params]) 121 | ineq_vec_params = tf.slice(output_layer, [0, self.n_ineq_mat_params], [-1, self.n_ineq_vec_params]) 122 | return ineq_mat_params, ineq_vec_params 123 | 124 | def build_ineq(self, ineq_mat_params, ineq_vec_params): 125 | ''' 126 | Transform ineq predicted parameters into solver-compatible matrices 127 | ''' 128 | ineq_mat = self.build_ineq_mat(ineq_mat_params) 129 | ineq_vec = self.build_ineq_vec(ineq_vec_params, 
ineq_mat) 130 | return ineq_mat, ineq_vec 131 | 132 | def build_ineq_mat(self, ineq_mat_params): 133 | ''' 134 | Reshape predicted parameters and optionally ensure ineq matrix normalization 135 | ''' 136 | ineq_mat_epsilon = 1.e-7 137 | assert not (self.config.spherical_coordinates and self.config.normalize_ineq_mat), 'Cannot have simultaneously spherical coordinates and ineq mat normalization' 138 | if self.config.spherical_coordinates: 139 | ineq_mat = tf.reshape(ineq_mat_params, [-1, self.n_ineq, self.n_act-1]) 140 | self.sc = SphericalCoordinates(self.n_act, input_angles=ineq_mat) 141 | ineq_mat = self.sc.output_unit_vec 142 | else: 143 | ineq_mat = tf.reshape(ineq_mat_params, [-1, self.n_ineq, self.n_act]) 144 | if self.config.normalize_ineq_mat: 145 | # Make each line of G of unit norm 146 | norm_row = tf.norm(ineq_mat, ord='euclidean', axis=-1, keep_dims=True) 147 | norm_row += ineq_mat_epsilon 148 | norm_mat = tf.tile(norm_row, [1, 1, self.n_act]) 149 | ineq_mat = tf.divide(ineq_mat, norm_mat) 150 | return ineq_mat 151 | 152 | def build_ineq_vec(self, ineq_vec_params, ineq_mat): 153 | ''' 154 | Reshape predicted parameters and optionally build an interior point that satisfies all constraints 155 | ''' 156 | ineq_vec = ineq_vec_params 157 | if self.config.predict_interior_point: 158 | # In this case, we split h_pred into x0 and h+ 159 | # with x0 an action and h+ non negative. 160 | # Then: h = g*x0 + h+ 161 | interior_point = tf.slice(ineq_vec, [0, 0], [-1, self.n_act]) 162 | action_space_mid_tensor = tf.convert_to_tensor(self.action_space_mid, dtype=ineq_vec.dtype) 163 | action_space_mid_tensor = tf.expand_dims(action_space_mid_tensor, axis=0) 164 | action_space_mid_tensor = tf.tile(action_space_mid_tensor, [tf.shape(ineq_vec)[0], 1]) 165 | action_space_pm_tensor = tf.convert_to_tensor(self.action_space_pm, dtype=ineq_vec.dtype) 166 | action_space_pm_tensor = tf.expand_dims(action_space_pm_tensor, axis=0) 167 | action_space_pm_tensor = tf.tile(action_space_pm_tensor, [tf.shape(ineq_vec)[0], 1]) 168 | if self.config.interior_point_max >= 0.: 169 | # force the interior point to be in a given domain within the action space 170 | interior_point_low = action_space_mid_tensor - self.config.interior_point_max*action_space_pm_tensor 171 | interior_point_high = action_space_mid_tensor + self.config.interior_point_max*action_space_pm_tensor 172 | interior_point = tf.clip_by_value(interior_point, interior_point_low, interior_point_high) 173 | interior_point = tf.expand_dims(interior_point, axis=-1) 174 | ineq_vec_plus = tf.slice(ineq_vec, [0, self.n_act], [-1, -1]) 175 | ineq_vec_plus = tf.nn.relu(ineq_vec_plus) 176 | zeros_like_ineq_vec_plus = tf.zeros_like(ineq_vec_plus) 177 | if self.config.interior_point_margin_min != 0.: 178 | # each row of h is a scalar, so we use the min action range as basis for the margin 179 | ineq_vec_plus_min_val = self.config.interior_point_margin_min * np.min(self.action_space_pm) 180 | ineq_vec_plus += ineq_vec_plus_min_val 181 | if self.config.interior_point_margin_max > 0.: 182 | # each row of h is a scalar, so we use the min action range as basis for the margin 183 | ineq_vec_plus_max_val = self.config.interior_point_margin_max * np.min(self.action_space_pm) 184 | ineq_vec_plus = tf.clip_by_value(ineq_vec_plus, 0., ineq_vec_plus_max_val) 185 | ineq_vec_interior_point = tf.matmul(ineq_mat, interior_point) 186 | ineq_vec_interior_point = tf.squeeze(ineq_vec_interior_point, axis=-1) 187 | ineq_vec = ineq_vec_interior_point + ineq_vec_plus 188 | else: 189 | 
interior_point = tf.zeros([1, self.n_act, 1], dtype=ineq_vec.dtype) 190 | interior_point = tf.tile(interior_point, [tf.shape(ineq_vec)[0], 1, 1]) 191 | # Reshape into matrices 192 | self.interior_point = interior_point 193 | ineq_vec = tf.expand_dims(ineq_vec, axis=-1) 194 | return ineq_vec 195 | 196 | def build_ineq_margins(self, ineq_mat, ineq_vec, action_tensor, do_squeeze=True): 197 | ''' 198 | Constraints are satisfied if Gx <= h, hence satisfaction margin = max(0, h - Gx) 199 | Constraints are violated if Gx > h, hence violation margin = max(0, Gx - h) 200 | ''' 201 | ineq_diff = ineq_vec - tf.matmul(ineq_mat, action_tensor) 202 | ineq_satisfaction_margin = tf.nn.relu(ineq_diff) 203 | ineq_violation_margin = tf.nn.relu(-ineq_diff) 204 | if do_squeeze: 205 | ineq_diff = tf.squeeze(ineq_diff, axis=-1) 206 | ineq_satisfaction_margin = tf.squeeze(ineq_satisfaction_margin, axis=-1) 207 | ineq_violation_margin = tf.squeeze(ineq_violation_margin, axis=-1) 208 | return ineq_diff, ineq_satisfaction_margin, ineq_violation_margin 209 | 210 | def init_regularization(self): 211 | ''' 212 | Find model weights for regularization 213 | ''' 214 | gr = tf.get_default_graph() 215 | self.model_weights = {name: gr.get_tensor_by_name('{0}/kernel:0'.format(name)) for name in self.layer_names} 216 | 217 | def build_model(self): 218 | raise NotImplementedError('Implement build_model within child classes') 219 | 220 | -------------------------------------------------------------------------------- /ceres/baselines/ceres/pposgd_ceres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IBM Corp. 2018. All Rights Reserved. 2 | # Project name: Constrained Exploration and Recovery from Experience Shaping 3 | # This project is licensed under the MIT License, see LICENSE 4 | 5 | from baselines import logger 6 | import baselines.common.tf_util as U 7 | import numpy as np 8 | from ceres.baselines.common import mpi_select 9 | from ceres.baselines.common.mpi_adam_select import MpiAdamSelect 10 | from ceres.baselines.common.mpi_moments_select import mpi_moments_select 11 | from ceres import ConstraintDemonstration, ConstraintDemonstrationTrajectory, ConstraintDemonstrationBuffer 12 | from .ceres_logic import CeresLogic 13 | from .constraint_trainer import ConstraintTrainer 14 | 15 | from ceres.baselines.ppo1.pposgd_simple_helper import build_policy_training_vars, build_counters, adjust_policy_learning_rate, update_policy, log_iter_info, calc_end_training 16 | from .pposgd_ceres_helper import update_constraint_activation_probability, build_policy_observation_filter, build_mpi_vars, save_models_and_data 17 | 18 | def traj_segment_generator(pi, env, horizon, 19 | ceres_logic, is_direct_policy, policy_observation_filter, 20 | stochastic=True, render=False): 21 | ''' 22 | Sample trajectories and collect positive/negative/uncertain demonstrations for constraint learning 23 | ''' 24 | t = 0 25 | ac = env.action_space.sample() # not used, just so we have the datatype 26 | new = True # marks if we're on first timestep of an episode 27 | ob = env.reset() 28 | policy_ob = policy_observation_filter(ob) 29 | if render: 30 | env.render() 31 | i_iteration = 0 32 | 33 | cur_ep_ret = 0 # return in current episode 34 | cur_ep_len = 0 # len of current episode 35 | ep_rets = [] # returns of completed episodes in this segment 36 | ep_lens = [] # lengths of ... 
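Editor's note: the inequality margins and the interior-point construction implemented in constraint_network.py above can be checked with a few lines of NumPy. The sketch below is illustrative only (it is not part of the repository and all numbers are made up); it mirrors the h = G*x0 + h+ construction of build_ineq_vec and the margin computation of build_ineq_margins for a single 2-D action.

import numpy as np

# One constraint set G x <= h with n_ineq = 3, n_act = 2
G = np.array([[ 1.0,  0.0],
              [ 0.0,  1.0],
              [-1.0, -1.0]])

# Interior-point construction: h = G x0 + h_plus with h_plus >= 0,
# which guarantees G x0 <= h, i.e. the predicted set is never empty.
x0 = np.array([0.1, 0.2])
h_plus = np.array([0.5, 0.5, 0.3])
h = G @ x0 + h_plus                              # -> [0.6, 0.7, 0.0]

# Margins as in build_ineq_margins: diff = h - G a
for a in (np.array([0.4, 0.1]), np.array([0.9, 0.0])):
    diff = h - G @ a
    satisfaction_margin = np.maximum(diff, 0.0)  # max(0, h - G a)
    violation_margin = np.maximum(-diff, 0.0)    # max(0, G a - h)
    print(a, satisfaction_margin, violation_margin)

The first action leaves every violation margin at zero while the second violates the first row; this is the per-row information that calc_constraint_score aggregates when counting satisfied and violated positive/negative demonstrations.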
37 | 38 | # Initialize history arrays 39 | obs = np.array([policy_ob for _ in range(horizon)]) 40 | rews = np.zeros(horizon, 'float32') 41 | vpreds = np.zeros(horizon, 'float32') 42 | news = np.zeros(horizon, 'int32') 43 | acs = np.array([ac for _ in range(horizon)]) 44 | prevacs = acs.copy() 45 | 46 | demonstration_trajectory = [] # store sampled demonstrations here 47 | uncertain_demonstration_trajectories = [] # demonstrations that cannot yet be identified as positive or negative will go here 48 | snapshot = env.unwrapped.calc_snapshot() 49 | recovery_info = env.unwrapped.recovery_info 50 | 51 | while True: 52 | prevac = ac 53 | policy_ob = policy_observation_filter(ob) 54 | ac, vpred = pi.act(stochastic, policy_ob) 55 | # Slight weirdness here because we need value function at time T 56 | # before returning segment [0, T-1] so we get the correct 57 | # terminal value 58 | if t > 0 and t % horizon == 0: 59 | res = {'ob' : obs, 'rew' : rews, 'vpred' : vpreds, 'new' : news, 60 | 'ac' : acs, 'prevac' : prevacs, 'nextvpred': vpred * (1 - new), 61 | 'ep_rets' : ep_rets, 'ep_lens' : ep_lens, 62 | } 63 | res['uncertain_demonstration_trajectories'] = uncertain_demonstration_trajectories 64 | yield res 65 | # Be careful!!! if you change the downstream algorithm to aggregate 66 | # several of these batches, then be sure to do a deepcopy 67 | ep_rets = [] 68 | ep_lens = [] 69 | uncertain_demonstration_trajectories = [] 70 | i_iteration += 1 71 | i = t % horizon 72 | obs[i] = policy_ob 73 | vpreds[i] = vpred 74 | news[i] = new 75 | acs[i] = ac 76 | prevacs[i] = prevac 77 | 78 | ob_new, rew, new, info = env.step(ac) 79 | if render: 80 | env.render() 81 | rews[i] = rew 82 | ac_constrained = info[env.unwrapped.info_key_constrained_action] 83 | 84 | ceres_demonstration = ConstraintDemonstration(state=ob, snapshot=snapshot, action=ac_constrained) 85 | demonstration_trajectory.append(ceres_demonstration) 86 | ob = ob_new 87 | snapshot = env.unwrapped.calc_snapshot() 88 | 89 | cur_ep_ret += rew 90 | cur_ep_len += 1 91 | if new: 92 | # Add final demonstration as terminal (state without action) 93 | demonstration_trajectory.append(ConstraintDemonstration(state=ob, snapshot=snapshot, is_terminal=True)) 94 | # Sort demonstrations into positive and negative 95 | ceres_logic.process_trajectory(ConstraintDemonstrationTrajectory(demonstration_trajectory), 96 | info[env.unwrapped.info_key_failure], info[env.unwrapped.info_key_success], 97 | uncertain_demonstration_trajectories, 98 | env.unwrapped.recovery_info, is_direct_policy, 99 | remove_reference_trajectory_if_emptied=True, 100 | increment_reset_count_on_change=i_iteration) 101 | ep_rets.append(cur_ep_ret) 102 | ep_lens.append(cur_ep_len) 103 | # Reset trajectory parameters 104 | cur_ep_ret = 0 105 | cur_ep_len = 0 106 | ob = env.reset() 107 | snapshot = env.unwrapped.calc_snapshot() 108 | if render: 109 | env.render() 110 | demonstration_trajectory = [] 111 | t += 1 112 | 113 | def learn(env, policy_fn, *, 114 | timesteps_per_actorbatch, # timesteps per actor per update 115 | clip_param, entcoeff, # clipping parameter epsilon, entropy coeff 116 | optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers 117 | gamma, lam, # advantage estimation 118 | extra_args, cnet, constraint_demonstration_buffer, 119 | max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint 120 | callback=None, # you can do anything in the callback, since it takes locals(), globals() 121 | adam_epsilon=1e-5, 122 | schedule='constant' # annealing for 
stepsize parameters (epsilon and adam) 123 | ): 124 | 125 | assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, 'Only one time constraint permitted' 126 | 127 | # Train different networks across processes 128 | mpi_comm, mpi_rank, is_direct_policy, mpi_root, mpi_group, mpi_destinations, mpi_n_processes, is_root, cnet_recovery_id_in_direct_exchange_ids, cnet_exchange_ids, n_exchange_processes = build_mpi_vars(extra_args) 129 | 130 | # Setup observation filtering (use partial state information) 131 | policy_ob_space = env.observation_space 132 | policy_ac_space = env.action_space 133 | policy_ob_space, policy_observation_filter = build_policy_observation_filter(extra_args, policy_ob_space) 134 | pi = policy_fn('pi', policy_ob_space, policy_ac_space) # Construct network for new policy 135 | oldpi = policy_fn('oldpi', policy_ob_space, policy_ac_space) # Network for old policy 136 | 137 | # Create policy optimizers 138 | policy_loss_names, policy_var_list, policy_lossandgrad, policy_adam, policy_assign_old_eq_new, policy_compute_losses = build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon) 139 | # Use rank-selective Adam to train direct and recovery with separate data 140 | policy_adam = MpiAdamSelect(mpi_rank, mpi_root, mpi_group, policy_var_list, epsilon=adam_epsilon) 141 | mpi_moments_fn = lambda losses: mpi_moments_select(losses, mpi_rank, mpi_root, mpi_destinations, mpi_n_processes, axis=0) 142 | allgather_fn = lambda x: mpi_select.allgather_select(mpi_comm, mpi_rank, mpi_root, mpi_destinations, x, tag=mpi_root) 143 | 144 | # Constraints 145 | cnet = env.unwrapped.cnet 146 | last_backup_time = None 147 | 148 | constraint_trainer = ConstraintTrainer(extra_args, logger, 149 | cnet, constraint_demonstration_buffer, 150 | mpi_comm, mpi_rank, is_direct_policy, 151 | cnet_recovery_id_in_direct_exchange_ids, 152 | cnet_exchange_ids, n_exchange_processes, 153 | adam_epsilon=adam_epsilon) 154 | 155 | # Enable conservative exploration (force margin w.r.t. constraints) 156 | env.unwrapped.set_ineq_margin(extra_args.conservative_exploration) 157 | # Set initial constraint activation probability to zero 158 | update_constraint_activation_probability(env, extra_args, logger, is_direct_policy, True, 0., 0.) 
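Editor's note: at this point the environment wrapper (not shown in this file, but visible through info_key_constrained_action in the trajectory generator above) is responsible for turning the constraint network's (G, h) prediction into a constrained action. One plausible reading of set_ineq_margin, given the "conservative exploration" comment, is that the bound h is tightened by a fixed margin before the raw policy action is projected onto the feasible set. The sketch below is an independent illustration with made-up values, not the repository's own solver wrapper (ceres/tools/math/qpsolver_quadprog.py); it calls the quadprog package directly.

import numpy as np
import quadprog

def project_action(a_raw, G, h, margin=0.0):
    # Project a_raw onto {x : G x <= h - margin} by solving
    #   minimize 0.5 * ||x - a_raw||^2  subject to  G x <= h - margin.
    # quadprog.solve_qp minimizes 0.5 x^T P x - q^T x s.t. C^T x >= b,
    # so pass P = I, q = a_raw, C = -G^T, b = -(h - margin).
    n_act = a_raw.shape[0]
    P = np.eye(n_act)
    C = -G.T.astype(np.float64)
    b = -(h - margin).astype(np.float64)
    x, *_ = quadprog.solve_qp(P, a_raw.astype(np.float64), C, b)
    return x

G = np.array([[1., 0.], [0., 1.], [-1., -1.]])
h = np.array([0.6, 0.7, 0.0])
a_raw = np.array([0.9, 0.9])              # raw policy action, violates G a <= h
print(project_action(a_raw, G, h))        # projected onto the feasible set
print(project_action(a_raw, G, h, 0.05))  # larger margin = more conservative

Solving this small QP at each step keeps the executed action as close as possible to the policy output while satisfying every predicted inequality, provided the predicted set is non-empty, which the interior-point construction in constraint_network.py ensures.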
159 | 160 | U.initialize() 161 | if len(extra_args.trained_policy) > 0: 162 | pi.restore_model(extra_args.trained_policy) 163 | oldpi.restore_model(extra_args.trained_policy, backup_network_id='pi') 164 | if len(extra_args.trained_cnet) > 0: 165 | cnet.restore_model(extra_args.trained_cnet) 166 | if len(extra_args.constraint_demonstration_buffer) > 0: 167 | constraint_demonstration_buffer.restore_buffer(extra_args.constraint_demonstration_buffer, keep_size=False, verbose=True) 168 | 169 | policy_adam.sync() 170 | constraint_trainer.init() 171 | 172 | # Prepare for rollouts 173 | ceres_logic = CeresLogic(env, constraint_demonstration_buffer, extra_args) 174 | seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, 175 | ceres_logic, 176 | is_direct_policy, 177 | policy_observation_filter, 178 | stochastic=True, 179 | render=(extra_args.render and (mpi_rank == 0))) 180 | 181 | iters_so_far, episodes_so_far, timesteps_so_far, tstart, lenbuffer, rewbuffer = build_counters() 182 | 183 | do_train_policy = True 184 | do_sync_recovery = True 185 | do_train_cnet = True 186 | 187 | if extra_args.n_recovery == 0: 188 | do_sync_recovery = False 189 | do_train_cnet = False # turned back on below when only training constraints 190 | 191 | if extra_args.only_train_constraints: 192 | do_train_policy = False 193 | do_sync_recovery = False 194 | do_train_cnet = True 195 | assert len(extra_args.constraint_demonstration_buffer) > 0, 'A constraint demonstration buffer is required' 196 | max_iters, max_timesteps, max_episodes, max_seconds = 1, 0, 0, 0 197 | 198 | if extra_args.only_train_policy: 199 | do_train_policy = True 200 | do_sync_recovery = False 201 | do_train_cnet = False 202 | 203 | while True: 204 | if callback: callback(locals(), globals()) 205 | 206 | logger.log('********** Begin iteration {0} ************'.format(iters_so_far)) 207 | 208 | n_reference_trajectories_before_sampling = len(env.unwrapped.reference_trajectories) 209 | 210 | if do_train_policy: 211 | # Collect new trajectories and update policy 212 | seg = seg_gen.__next__() 213 | policy_cur_lrmult = adjust_policy_learning_rate(schedule, max_timesteps, timesteps_so_far, max_episodes, episodes_so_far, max_iters, iters_so_far) 214 | vpredbefore, tdlamret, optim_batchsize = update_policy(pi, seg, gamma, lam, 215 | logger, optim_epochs, optim_batchsize, optim_stepsize, policy_cur_lrmult, 216 | policy_loss_names, policy_lossandgrad, policy_adam, policy_assign_old_eq_new, policy_compute_losses, 217 | mpi_moments_fn) 218 | 219 | if do_train_policy and do_sync_recovery: 220 | # Transfer uncertain demonstrations from direct to recovery agents 221 | constraint_trainer.synchronize_recovery_trajectories(env, seg, n_reference_trajectories_before_sampling) 222 | 223 | # Compute constraint losses on the newly collected data, prior to training 224 | do_train_cnet, activation_probability_before = constraint_trainer.prepare_constraint_update(do_train_cnet, iters_so_far) 225 | 226 | # Train constraints on the new data and return final losses 227 | do_train_cnet, activation_probability_after = constraint_trainer.update_constraint_network(do_train_cnet) 228 | 229 | # Change the environment constraint activation probability depending on the constraint prediction accuracy 230 | update_constraint_activation_probability(env, extra_args, logger, is_direct_policy, do_train_cnet, 231 | activation_probability_before, activation_probability_after) 232 | 233 | # Log iteration results 234 | if do_train_policy: 235 | episodes_so_far, timesteps_so_far =
log_iter_info(lenbuffer, rewbuffer, tstart, 236 | vpredbefore, tdlamret, seg, 237 | episodes_so_far, timesteps_so_far, 238 | is_root, allgather_fn) 239 | iters_so_far += 1 240 | end_training = calc_end_training(max_timesteps, timesteps_so_far, 241 | max_episodes, episodes_so_far, 242 | max_iters, iters_so_far, 243 | max_seconds, tstart) 244 | 245 | # Save models and data 246 | last_backup_time = save_models_and_data(extra_args, iters_so_far, end_training, last_backup_time, 247 | is_root, mpi_rank, pi, cnet, constraint_demonstration_buffer) 248 | 249 | if end_training: 250 | break 251 | --------------------------------------------------------------------------------
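Editor's note: tying the training loop back to the plotting utilities earlier in this document, plot_logs_baselines.py reads one monitor.csv per worker (columns r = episode reward, l = episode length, t = wall-clock time), converts episode lengths into cumulative environment steps, and merges the per-worker curves by increasing step count. The snippet below is a self-contained illustration of that merge with made-up numbers; it does not call the PlotLogsBaselines class itself.

import numpy as np

# Per-worker episode logs, as parse_worker_monitor_csv would return them
lengths_w0 = [10, 12, 9]
rewards_w0 = [1.0, 2.0, 3.0]
lengths_w1 = [11, 8, 15]
rewards_w1 = [0.5, 1.5, 2.5]

# Cumulative steps give each episode an x-coordinate, as in calc_plot_sessions
x0 = np.cumsum(lengths_w0)   # [10, 22, 31]
x1 = np.cumsum(lengths_w1)   # [11, 19, 34]

# Merge the two curves on increasing x, the interleaving combine_logs_xy builds
x = np.concatenate([x0, x1])
r = np.concatenate([rewards_w0, rewards_w1])
order = np.argsort(x, kind='stable')
x_merged, r_merged = x[order], r[order]
print(x_merged)   # [10 11 19 22 31 34]
print(r_merged)   # [1.  0.5 1.5 2.  3.  2.5]

The merged reward sequence is presumably what the --plot_average option then smooths with a moving average before plotting.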