├── .DS_Store ├── .gitignore ├── .idea ├── ioc.iml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── control ├── .benchmark_pattern ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── baselines │ ├── __init__.py │ ├── bench │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ └── monitor.py │ ├── common │ │ ├── __init__.py │ │ ├── atari_wrappers.py │ │ ├── cg.py │ │ ├── cmd_util.py │ │ ├── console_util.py │ │ ├── dataset.py │ │ ├── distributions.py │ │ ├── input.py │ │ ├── math_util.py │ │ ├── misc_util.py │ │ ├── models.py │ │ ├── mpi_adam.py │ │ ├── mpi_adam_optimizer.py │ │ ├── mpi_fork.py │ │ ├── mpi_moments.py │ │ ├── mpi_running_mean_std.py │ │ ├── mpi_util.py │ │ ├── plot_util.py │ │ ├── policies.py │ │ ├── retro_wrappers.py │ │ ├── runners.py │ │ ├── running_mean_std.py │ │ ├── schedules.py │ │ ├── segment_tree.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── envs │ │ │ │ ├── __init__.py │ │ │ │ ├── fixed_sequence_env.py │ │ │ │ ├── identity_env.py │ │ │ │ └── mnist_env.py │ │ │ ├── test_cartpole.py │ │ │ ├── test_doc_examples.py │ │ │ ├── test_env_after_learn.py │ │ │ ├── test_fetchreach.py │ │ │ ├── test_fixed_sequence.py │ │ │ ├── test_identity.py │ │ │ ├── test_mnist.py │ │ │ ├── test_schedules.py │ │ │ ├── test_segment_tree.py │ │ │ ├── test_serialization.py │ │ │ ├── test_tf_util.py │ │ │ └── util.py │ │ ├── tf_util.py │ │ ├── tile_images.py │ │ └── vec_env │ │ │ ├── __init__.py │ │ │ ├── dummy_vec_env.py │ │ │ ├── shmem_vec_env.py │ │ │ ├── subproc_vec_env.py │ │ │ ├── test_vec_env.py │ │ │ ├── test_video_recorder.py │ │ │ ├── util.py │ │ │ ├── vec_frame_stack.py │ │ │ ├── vec_monitor.py │ │ │ ├── vec_normalize.py │ │ │ └── vec_video_recorder.py │ ├── logger.py │ ├── ppoc_int │ │ ├── __init__.py │ │ ├── assets │ │ │ ├── half_cheetah.xml │ │ │ └── twod_tmaze.xml │ │ ├── half_cheetah.py │ │ ├── mlp_policy.py │ │ ├── normalized_env.py │ │ ├── plot_res.py │ │ ├── pposgd_simple.py │ │ ├── run_mujoco.py │ │ ├── seeding.py │ │ └── twod_tmaze.py │ ├── results_plotter.py │ └── run.py ├── benchmarks_atari10M.htm ├── benchmarks_mujoco1M.htm ├── data │ ├── cartpole.gif │ ├── fetchPickAndPlaceContrast.png │ └── logo.jpg ├── docs │ └── viz │ │ └── viz.ipynb ├── setup.cfg └── setup.py ├── launcher_miniworld.sh ├── launcher_mujoco.sh ├── miniworld ├── .benchmark_pattern ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── baselines │ ├── __init__.py │ ├── bench │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ └── monitor.py │ ├── common │ │ ├── __init__.py │ │ ├── atari_wrappers.py │ │ ├── cg.py │ │ ├── cmd_util.py │ │ ├── console_util.py │ │ ├── dataset.py │ │ ├── distributions.py │ │ ├── input.py │ │ ├── math_util.py │ │ ├── misc_util.py │ │ ├── models.py │ │ ├── mpi_adam.py │ │ ├── mpi_adam_optimizer.py │ │ ├── mpi_fork.py │ │ ├── mpi_moments.py │ │ ├── mpi_running_mean_std.py │ │ ├── mpi_util.py │ │ ├── plot_util.py │ │ ├── policies.py │ │ ├── retro_wrappers.py │ │ ├── runners.py │ │ ├── running_mean_std.py │ │ ├── schedules.py │ │ ├── segment_tree.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── envs │ │ │ │ ├── __init__.py │ │ │ │ ├── fixed_sequence_env.py │ │ │ │ ├── identity_env.py │ │ │ │ └── mnist_env.py │ │ │ ├── test_cartpole.py │ │ │ ├── test_doc_examples.py │ │ │ ├── test_env_after_learn.py │ │ │ ├── test_fetchreach.py │ │ │ ├── test_fixed_sequence.py │ │ │ ├── test_identity.py │ │ │ ├── test_mnist.py │ │ │ ├── test_schedules.py │ │ │ ├── test_segment_tree.py │ │ │ ├── test_serialization.py │ │ │ ├── test_tf_util.py │ │ │ └── util.py │ │ ├── tf_util.py │ │ ├── tile_images.py │ │ └── vec_env │ │ │ ├── __init__.py │ │ │ ├── dummy_vec_env.py │ │ │ ├── shmem_vec_env.py │ │ │ ├── subproc_vec_env.py │ │ │ ├── test_vec_env.py │ │ │ ├── test_video_recorder.py │ │ │ ├── util.py │ │ │ ├── vec_frame_stack.py │ │ │ ├── vec_monitor.py │ │ │ ├── vec_normalize.py │ │ │ └── vec_video_recorder.py │ ├── logger.py │ ├── ppoc_int │ │ ├── README.md │ │ ├── __init__.py │ │ ├── cnn_policy.py │ │ ├── mlp_policy.py │ │ ├── muj.py │ │ ├── oneroom.py │ │ ├── plot_res.py │ │ ├── pposgd_simple.py │ │ ├── run_miniw.py │ │ └── run_mujoco.py │ ├── results_plotter.py │ └── run.py ├── data │ ├── cartpole.gif │ ├── fetchPickAndPlaceContrast.png │ └── logo.jpg ├── docs │ └── viz │ │ └── viz.ipynb ├── setup.cfg └── setup.py └── tabular ├── .DS_Store ├── .ipynb_checkpoints ├── fr_analysis_heatmaps-checkpoint.ipynb ├── fr_analysis_performance-checkpoint.ipynb └── fr_env_plots-checkpoint.ipynb ├── FR_Return_10Runs_Smooth_MisspecifiedPiO.pdf ├── FR_Steps_10Runs_Smooth_MisspecifiedPiO.pdf ├── GoalG62.png ├── InterestOptionCritic └── Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200 │ ├── History.npy │ ├── IOC_Task1_IntraOptionPolicy_Opt_0.png │ ├── IOC_Task1_IntraOptionPolicy_Opt_1.png │ ├── IOC_Task1_IntraOptionPolicy_Opt_2.png │ ├── IOC_Task1_IntraOptionPolicy_Opt_3.png │ ├── Params.txt │ ├── StateFreq.npy │ ├── Weights_ActionValueFunction.npy │ ├── Weights_InterestFunction.npy │ ├── Weights_IntraOption.npy │ ├── Weights_OptionValueFunction.npy │ ├── Weights_Policy.npy │ └── Weights_Termination.npy ├── OptionCritic └── Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200 │ ├── History.npy │ ├── Params.txt │ ├── StateFreq.npy │ ├── Weights_ActionValueFunction.npy │ ├── Weights_IntraOption.npy │ ├── Weights_OptionValueFunction.npy │ └── Weights_Termination.npy ├── TransferVisual.png ├── __pycache__ └── fourrooms.cpython-36.pyc ├── fourrooms.py ├── fr_analysis_heatmaps.ipynb ├── fr_analysis_performance.ipynb ├── fr_env_plots.ipynb ├── interestoptioncritic_tabular_fr.py └── optioncritic_tabular_fr.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/workspace.xml 2 | .idea/tasks.xml 3 | -------------------------------------------------------------------------------- /.idea/ioc.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /control/.benchmark_pattern: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /control/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | 9 | # Setuptools distribution and build folders. 10 | /dist/ 11 | /build 12 | keys/ 13 | 14 | # Virtualenv 15 | /env 16 | 17 | 18 | *.sublime-project 19 | *.sublime-workspace 20 | 21 | .idea 22 | 23 | logs/ 24 | 25 | .ipynb_checkpoints 26 | ghostdriver.log 27 | 28 | htmlcov 29 | 30 | junk 31 | src 32 | 33 | *.egg-info 34 | .cache 35 | 36 | MUJOCO_LOG.TXT 37 | -------------------------------------------------------------------------------- /control/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | services: 6 | - docker 7 | 8 | install: 9 | - pip install flake8 10 | - docker build . -t baselines-test 11 | 12 | script: 13 | - flake8 . --show-source --statistics 14 | - docker run baselines-test pytest -v --forked . 15 | -------------------------------------------------------------------------------- /control/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN apt-get -y update && apt-get -y install ffmpeg 4 | # RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv 5 | 6 | ENV CODE_DIR /root/code 7 | 8 | COPY . $CODE_DIR/baselines 9 | WORKDIR $CODE_DIR/baselines 10 | 11 | # Clean up pycache and pyc files 12 | RUN rm -rf __pycache__ && \ 13 | find . -name "*.pyc" -delete && \ 14 | pip install tensorflow && \ 15 | pip install -e .[test] 16 | 17 | 18 | CMD /bin/bash 19 | -------------------------------------------------------------------------------- /control/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /control/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/__init__.py -------------------------------------------------------------------------------- /control/baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * 3 | -------------------------------------------------------------------------------- /control/baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /control/baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x 35 | -------------------------------------------------------------------------------- /control/baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | import shlex 6 | import subprocess 7 | 8 | # ================================================================ 9 | # Misc 10 | # ================================================================ 11 | 12 | def fmt_row(width, row, header=False): 13 | out = " | ".join(fmt_item(x, width) for x in row) 14 | if header: out = out + "\n" + "-"*len(out) 15 | return out 16 | 17 | def fmt_item(x, l): 18 | if isinstance(x, np.ndarray): 19 | assert x.ndim==0 20 | x = x.item() 21 | if isinstance(x, (float, np.float32, np.float64)): 22 | v = abs(x) 23 | if (v < 1e-4 or v > 1e+4) and v > 0: 24 | rep = "%7.2e" % x 25 | else: 26 | rep = "%7.5f" % x 27 | else: rep = str(x) 28 | return " "*(l - len(rep)) + rep 29 | 30 | color2num = dict( 31 | gray=30, 32 | red=31, 33 | green=32, 34 | yellow=33, 35 | blue=34, 36 | magenta=35, 37 | cyan=36, 38 | white=37, 39 | crimson=38 40 | ) 41 | 42 | def colorize(string, color='green', bold=False, highlight=False): 43 | attr = [] 44 | num = color2num[color] 45 | if highlight: num += 10 46 | attr.append(str(num)) 47 | if bold: attr.append('1') 48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 49 | 50 | def print_cmd(cmd, dry=False): 51 | if isinstance(cmd, str): # for shell=True 52 | pass 53 | else: 54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd) 55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd)) 56 | 57 | 58 | def get_git_commit(cwd=None): 59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8') 60 | 61 | def get_git_commit_message(cwd=None): 62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8') 63 | 64 | def ccap(cmd, dry=False, env=None, **kwargs): 65 | print_cmd(cmd, dry) 66 | if not dry: 67 | subprocess.check_call(cmd, env=env, **kwargs) 68 | 69 | 70 | MESSAGE_DEPTH = 0 71 | 72 | @contextmanager 73 | def timed(msg): 74 | global MESSAGE_DEPTH #pylint: disable=W0603 75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 76 | tstart = time.time() 77 | MESSAGE_DEPTH += 1 78 | yield 79 | MESSAGE_DEPTH -= 1 80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 81 | -------------------------------------------------------------------------------- /control/baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /control/baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Discrete, Box, MultiDiscrete 4 | 5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 6 | ''' 7 | Create placeholder to feed observations into of the size appropriate to the observation space 8 | 9 | Parameters: 10 | ---------- 11 | 12 | ob_space: gym.Space observation space 13 | 14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 15 | 16 | name: str name of the placeholder 17 | 18 | Returns: 19 | ------- 20 | 21 | tensorflow placeholder tensor 22 | ''' 23 | 24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \ 25 | 'Can only deal with Discrete and Box observation spaces for now' 26 | 27 | dtype = ob_space.dtype 28 | if dtype == np.int8: 29 | dtype = np.uint8 30 | 31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name) 32 | 33 | 34 | def observation_input(ob_space, batch_size=None, name='Ob'): 35 | ''' 36 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 37 | encoder of the appropriate type. 38 | ''' 39 | 40 | placeholder = observation_placeholder(ob_space, batch_size, name) 41 | return placeholder, encode_observation(ob_space, placeholder) 42 | 43 | def encode_observation(ob_space, placeholder): 44 | ''' 45 | Encode input in the way that is appropriate to the observation space 46 | 47 | Parameters: 48 | ---------- 49 | 50 | ob_space: gym.Space observation space 51 | 52 | placeholder: tf.placeholder observation input placeholder 53 | ''' 54 | if isinstance(ob_space, Discrete): 55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 56 | elif isinstance(ob_space, Box): 57 | return tf.to_float(placeholder) 58 | elif isinstance(ob_space, MultiDiscrete): 59 | placeholder = tf.cast(placeholder, tf.int32) 60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])] 61 | return tf.concat(one_hots, axis=-1) 62 | else: 63 | raise NotImplementedError 64 | 65 | -------------------------------------------------------------------------------- /control/baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) 86 | -------------------------------------------------------------------------------- /control/baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | import baselines.common.tf_util as U 2 | import tensorflow as tf 3 | import numpy as np 4 | try: 5 | from mpi4py import MPI 6 | except ImportError: 7 | MPI = None 8 | 9 | 10 | class MpiAdam(object): 11 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 12 | self.var_list = var_list 13 | self.beta1 = beta1 14 | self.beta2 = beta2 15 | self.epsilon = epsilon 16 | self.scale_grad_by_procs = scale_grad_by_procs 17 | size = sum(U.numel(v) for v in var_list) 18 | self.m = np.zeros(size, 'float32') 19 | self.v = np.zeros(size, 'float32') 20 | self.t = 0 21 | self.setfromflat = U.SetFromFlat(var_list) 22 | self.getflat = U.GetFlat(var_list) 23 | self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm 24 | 25 | def update(self, localg, stepsize): 26 | if self.t % 100 == 0: 27 | self.check_synced() 28 | localg = localg.astype('float32') 29 | if self.comm is not None: 30 | globalg = np.zeros_like(localg) 31 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 32 | if self.scale_grad_by_procs: 33 | globalg /= self.comm.Get_size() 34 | else: 35 | globalg = np.copy(localg) 36 | 37 | self.t += 1 38 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 39 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 40 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 41 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 42 | self.setfromflat(self.getflat() + step) 43 | 44 | def sync(self): 45 | if self.comm is None: 46 | return 47 | theta = self.getflat() 48 | self.comm.Bcast(theta, root=0) 49 | self.setfromflat(theta) 50 | 51 | def check_synced(self): 52 | if self.comm is None: 53 | return 54 | if self.comm.Get_rank() == 0: # this is root 55 | theta = self.getflat() 56 | self.comm.Bcast(theta, root=0) 57 | else: 58 | thetalocal = self.getflat() 59 | thetaroot = np.empty_like(thetalocal) 60 | self.comm.Bcast(thetaroot, root=0) 61 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 62 | 63 | @U.in_session 64 | def test_MpiAdam(): 65 | np.random.seed(0) 66 | tf.set_random_seed(0) 67 | 68 | a = tf.Variable(np.random.randn(3).astype('float32')) 69 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 70 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 71 | 72 | stepsize = 1e-2 73 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 74 | do_update = U.function([], loss, updates=[update_op]) 75 | 76 | tf.get_default_session().run(tf.global_variables_initializer()) 77 | losslist_ref = [] 78 | for i in range(10): 79 | l = do_update() 80 | print(i, l) 81 | losslist_ref.append(l) 82 | 83 | 84 | 85 | tf.set_random_seed(0) 86 | tf.get_default_session().run(tf.global_variables_initializer()) 87 | 88 | var_list = [a,b] 89 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)]) 90 | adam = MpiAdam(var_list) 91 | 92 | losslist_test = [] 93 | for i in range(10): 94 | l,g = lossandgrad() 95 | adam.update(g, stepsize) 96 | print(i,l) 97 | losslist_test.append(l) 98 | 99 | np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4) 100 | 101 | 102 | if __name__ == '__main__': 103 | test_MpiAdam() 104 | -------------------------------------------------------------------------------- /control/baselines/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | 5 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 6 | """Adam optimizer that averages gradients across mpi processes.""" 7 | def __init__(self, comm, **kwargs): 8 | self.comm = comm 9 | tf.train.AdamOptimizer.__init__(self, **kwargs) 10 | def compute_gradients(self, loss, var_list, **kwargs): 11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) 14 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 15 | sizes = [int(np.prod(s)) for s in shapes] 16 | 17 | num_tasks = self.comm.Get_size() 18 | buf = np.zeros(sum(sizes), np.float32) 19 | 20 | def _collect_grads(flat_grad): 21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 22 | np.divide(buf, float(num_tasks), out=buf) 23 | return buf 24 | 25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 26 | avg_flat_grad.set_shape(flat_grad.shape) 27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 29 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 30 | 31 | return avg_grads_and_vars 32 | -------------------------------------------------------------------------------- /control/baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /control/baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /control/baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | try: 2 | from mpi4py import MPI 3 | except ImportError: 4 | MPI = None 5 | 6 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 7 | 8 | class RunningMeanStd(object): 9 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 10 | def __init__(self, epsilon=1e-2, shape=()): 11 | 12 | self._sum = tf.get_variable( 13 | dtype=tf.float64, 14 | shape=shape, 15 | initializer=tf.constant_initializer(0.0), 16 | name="runningsum", trainable=False) 17 | self._sumsq = tf.get_variable( 18 | dtype=tf.float64, 19 | shape=shape, 20 | initializer=tf.constant_initializer(epsilon), 21 | name="runningsumsq", trainable=False) 22 | self._count = tf.get_variable( 23 | dtype=tf.float64, 24 | shape=(), 25 | initializer=tf.constant_initializer(epsilon), 26 | name="count", trainable=False) 27 | self.shape = shape 28 | 29 | self.mean = tf.to_float(self._sum / self._count) 30 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 31 | 32 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 33 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 34 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 35 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 36 | updates=[tf.assign_add(self._sum, newsum), 37 | tf.assign_add(self._sumsq, newsumsq), 38 | tf.assign_add(self._count, newcount)]) 39 | 40 | 41 | def update(self, x): 42 | x = x.astype('float64') 43 | n = int(np.prod(self.shape)) 44 | totalvec = np.zeros(n*2+1, 'float64') 45 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 46 | if MPI is not None: 47 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 48 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 49 | 50 | @U.in_session 51 | def test_runningmeanstd(): 52 | for (x1, x2, x3) in [ 53 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 54 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 55 | ]: 56 | 57 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 58 | U.initialize() 59 | 60 | x = np.concatenate([x1, x2, x3], axis=0) 61 | ms1 = [x.mean(axis=0), x.std(axis=0)] 62 | rms.update(x1) 63 | rms.update(x2) 64 | rms.update(x3) 65 | ms2 = [rms.mean.eval(), rms.std.eval()] 66 | 67 | assert np.allclose(ms1, ms2) 68 | 69 | @U.in_session 70 | def test_dist(): 71 | np.random.seed(0) 72 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 73 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 74 | 75 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 76 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 77 | 78 | comm = MPI.COMM_WORLD 79 | assert comm.Get_size()==2 80 | if comm.Get_rank()==0: 81 | x1,x2,x3 = p1,p2,p3 82 | elif comm.Get_rank()==1: 83 | x1,x2,x3 = q1,q2,q3 84 | else: 85 | assert False 86 | 87 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 88 | U.initialize() 89 | 90 | rms.update(x1) 91 | rms.update(x2) 92 | rms.update(x3) 93 | 94 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 95 | 96 | def checkallclose(x,y): 97 | print(x,y) 98 | return np.allclose(x,y) 99 | 100 | assert checkallclose( 101 | bigvec.mean(axis=0), 102 | rms.mean.eval(), 103 | ) 104 | assert checkallclose( 105 | bigvec.std(axis=0), 106 | rms.std.eval(), 107 | ) 108 | 109 | 110 | if __name__ == "__main__": 111 | # Run with mpirun -np 2 python 112 | test_dist() 113 | -------------------------------------------------------------------------------- /control/baselines/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from mpi4py import MPI 3 | import os, numpy as np 4 | import platform 5 | import shutil 6 | import subprocess 7 | 8 | def sync_from_root(sess, variables, comm=None): 9 | """ 10 | Send the root node's parameters to every worker. 11 | Arguments: 12 | sess: the TensorFlow session. 13 | variables: all parameter variables including optimizer's 14 | """ 15 | if comm is None: comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | for var in variables: 18 | if rank == 0: 19 | comm.Bcast(sess.run(var)) 20 | else: 21 | import tensorflow as tf 22 | returned_var = np.empty(var.shape, dtype='float32') 23 | comm.Bcast(returned_var) 24 | sess.run(tf.assign(var, returned_var)) 25 | 26 | def gpu_count(): 27 | """ 28 | Count the GPUs on this machine. 29 | """ 30 | if shutil.which('nvidia-smi') is None: 31 | return 0 32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 33 | return max(0, len(output.split(b'\n')) - 2) 34 | 35 | def setup_mpi_gpus(): 36 | """ 37 | Set CUDA_VISIBLE_DEVICES using MPI. 38 | """ 39 | num_gpus = gpu_count() 40 | if num_gpus == 0: 41 | return 42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) 43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) 44 | 45 | def get_local_rank_size(comm): 46 | """ 47 | Returns the rank of each process on its machine 48 | The processes on a given machine will be assigned ranks 49 | 0, 1, 2, ..., N-1, 50 | where N is the number of processes on this machine. 51 | 52 | Useful if you want to assign one gpu per machine 53 | """ 54 | this_node = platform.node() 55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 56 | node2rankssofar = defaultdict(int) 57 | local_rank = None 58 | for (rank, node) in ranks_nodes: 59 | if rank == comm.Get_rank(): 60 | local_rank = node2rankssofar[node] 61 | node2rankssofar[node] += 1 62 | assert local_rank is not None 63 | return local_rank, node2rankssofar[this_node] 64 | 65 | def share_file(comm, path): 66 | """ 67 | Copies the file from rank 0 to all other ranks 68 | Puts it in the same place on all machines 69 | """ 70 | localrank, _ = get_local_rank_size(comm) 71 | if comm.Get_rank() == 0: 72 | with open(path, 'rb') as fh: 73 | data = fh.read() 74 | comm.bcast(data) 75 | else: 76 | data = comm.bcast(None) 77 | if localrank == 0: 78 | os.makedirs(os.path.dirname(path), exist_ok=True) 79 | with open(path, 'wb') as fh: 80 | fh.write(data) 81 | comm.Barrier() 82 | 83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 84 | if comm is None: return d 85 | alldicts = comm.allgather(d) 86 | size = comm.size 87 | k2li = defaultdict(list) 88 | for d in alldicts: 89 | for (k,v) in d.items(): 90 | k2li[k].append(v) 91 | result = {} 92 | for (k,li) in k2li.items(): 93 | if assert_all_have_data: 94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 95 | if op=='mean': 96 | result[k] = np.mean(li, axis=0) 97 | elif op=='sum': 98 | result[k] = np.sum(li, axis=0) 99 | else: 100 | assert 0, op 101 | return result 102 | -------------------------------------------------------------------------------- /control/baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /control/baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /control/baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/common/tests/__init__.py -------------------------------------------------------------------------------- /control/baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/common/tests/envs/__init__.py -------------------------------------------------------------------------------- /control/baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | seed=0, 11 | episode_len=100 12 | ): 13 | self.np_random = np.random.RandomState() 14 | self.np_random.seed(seed) 15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] 16 | 17 | self.action_space = Discrete(n_actions) 18 | self.observation_space = Discrete(1) 19 | 20 | self.episode_len = episode_len 21 | self.time = 0 22 | self.reset() 23 | 24 | def reset(self): 25 | self.time = 0 26 | return 0 27 | 28 | def step(self, actions): 29 | rew = self._get_reward(actions) 30 | self._choose_next_state() 31 | done = False 32 | if self.episode_len and self.time >= self.episode_len: 33 | rew = 0 34 | done = True 35 | 36 | return 0, rew, done, {} 37 | 38 | def _choose_next_state(self): 39 | self.time += 1 40 | 41 | def _get_reward(self, actions): 42 | return 1 if actions == self.sequence[self.time] else 0 43 | 44 | 45 | -------------------------------------------------------------------------------- /control/baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import MultiDiscrete, Discrete, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None 11 | ): 12 | 13 | self.episode_len = episode_len 14 | self.time = 0 15 | self.reset() 16 | 17 | def reset(self): 18 | self._choose_next_state() 19 | self.time = 0 20 | self.observation_space = self.action_space 21 | 22 | return self.state 23 | 24 | def step(self, actions): 25 | rew = self._get_reward(actions) 26 | self._choose_next_state() 27 | done = False 28 | if self.episode_len and self.time >= self.episode_len: 29 | rew = 0 30 | done = True 31 | 32 | return self.state, rew, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | self.time += 1 37 | 38 | @abstractmethod 39 | def _get_reward(self, actions): 40 | raise NotImplementedError 41 | 42 | 43 | class DiscreteIdentityEnv(IdentityEnv): 44 | def __init__( 45 | self, 46 | dim, 47 | episode_len=None, 48 | ): 49 | 50 | self.action_space = Discrete(dim) 51 | super().__init__(episode_len=episode_len) 52 | 53 | def _get_reward(self, actions): 54 | return 1 if self.state == actions else 0 55 | 56 | class MultiDiscreteIdentityEnv(IdentityEnv): 57 | def __init__( 58 | self, 59 | dims, 60 | episode_len=None, 61 | ): 62 | 63 | self.action_space = MultiDiscrete(dims) 64 | super().__init__(episode_len=episode_len) 65 | 66 | def _get_reward(self, actions): 67 | return 1 if all(self.state == actions) else 0 68 | 69 | 70 | class BoxIdentityEnv(IdentityEnv): 71 | def __init__( 72 | self, 73 | shape, 74 | episode_len=None, 75 | ): 76 | 77 | self.action_space = Box(low=-1.0, high=1.0, shape=shape) 78 | super().__init__(episode_len=episode_len) 79 | 80 | def _get_reward(self, actions): 81 | diff = actions - self.state 82 | diff = diff[:] 83 | return -0.5 * np.dot(diff, diff) 84 | -------------------------------------------------------------------------------- /control/baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | from gym import Env 5 | from gym.spaces import Discrete, Box 6 | 7 | 8 | 9 | class MnistEnv(Env): 10 | def __init__( 11 | self, 12 | seed=0, 13 | episode_len=None, 14 | no_images=None 15 | ): 16 | import filelock 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | # we could use temporary directory for this with a context manager and 19 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 20 | # this way the data is not cleaned up, but we only download it once per machine 21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 22 | with filelock.FileLock(mnist_path + '.lock'): 23 | self.mnist = input_data.read_data_sets(mnist_path) 24 | 25 | self.np_random = np.random.RandomState() 26 | self.np_random.seed(seed) 27 | 28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 29 | self.action_space = Discrete(10) 30 | self.episode_len = episode_len 31 | self.time = 0 32 | self.no_images = no_images 33 | 34 | self.train_mode() 35 | self.reset() 36 | 37 | def reset(self): 38 | self._choose_next_state() 39 | self.time = 0 40 | 41 | return self.state[0] 42 | 43 | def step(self, actions): 44 | rew = self._get_reward(actions) 45 | self._choose_next_state() 46 | done = False 47 | if self.episode_len and self.time >= self.episode_len: 48 | rew = 0 49 | done = True 50 | 51 | return self.state[0], rew, done, {} 52 | 53 | def train_mode(self): 54 | self.dataset = self.mnist.train 55 | 56 | def test_mode(self): 57 | self.dataset = self.mnist.test 58 | 59 | def _choose_next_state(self): 60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 61 | index = self.np_random.randint(0, max_index) 62 | image = self.dataset.images[index].reshape(28,28,1)*255 63 | label = self.dataset.labels[index] 64 | self.state = (image, label) 65 | self.time += 1 66 | 67 | def _get_reward(self, actions): 68 | return 1 if self.state[1] == actions else 0 69 | 70 | 71 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=1.0, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 16 | 'acer': dict(value_network='copy'), 17 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False), 18 | 'deepq': dict(total_timesteps=20000), 19 | 'ppo2': dict(value_network='copy'), 20 | 'trpo_mpi': {} 21 | } 22 | 23 | @pytest.mark.slow 24 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 25 | def test_cartpole(alg): 26 | ''' 27 | Test if the algorithm (with an mlp policy) 28 | can learn to balance the cartpole 29 | ''' 30 | 31 | kwargs = common_kwargs.copy() 32 | kwargs.update(learn_kwargs[alg]) 33 | 34 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 35 | def env_fn(): 36 | 37 | env = gym.make('CartPole-v0') 38 | env.seed(0) 39 | return env 40 | 41 | reward_per_episode_test(env_fn, learn_fn, 100) 42 | 43 | if __name__ == '__main__': 44 | test_cartpole('acer') 45 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_doc_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | try: 3 | import mujoco_py 4 | _mujoco_present = True 5 | except BaseException: 6 | mujoco_py = None 7 | _mujoco_present = False 8 | 9 | 10 | @pytest.mark.skipif( 11 | not _mujoco_present, 12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' 13 | ) 14 | def test_lstm_example(): 15 | import tensorflow as tf 16 | from baselines.common import policies, models, cmd_util 17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 18 | 19 | # create vectorized environment 20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) 21 | 22 | with tf.Session() as sess: 23 | # build policy based on lstm network with 128 units 24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) 25 | 26 | # initialize tensorflow variables 27 | sess.run(tf.global_variables_initializer()) 28 | 29 | # prepare environment variables 30 | ob = venv.reset() 31 | state = policy.initial_state 32 | done = [False] 33 | step_counter = 0 34 | 35 | # run a single episode until the end (i.e. until done) 36 | while True: 37 | action, _, state, _ = policy.step(ob, S=state, M=done) 38 | ob, reward, done, _ = venv.step(action) 39 | step_counter += 1 40 | if done: 41 | break 42 | 43 | 44 | assert step_counter > 5 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_env_after_learn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import tensorflow as tf 4 | 5 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 6 | from baselines.run import get_learn_function 7 | from baselines.common.tf_util import make_session 8 | 9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 10 | 11 | @pytest.mark.parametrize('algo', algos) 12 | def test_env_after_learn(algo): 13 | def make_env(): 14 | # acktr requires too much RAM, fails on travis 15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4') 16 | return env 17 | 18 | make_session(make_default=True, graph=tf.Graph()) 19 | env = SubprocVecEnv([make_env]) 20 | 21 | learn = get_learn_function(algo) 22 | 23 | # Commenting out the following line resolves the issue, though crash happens at env.reset(). 24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None) 25 | 26 | env.reset() 27 | env.close() 28 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_fetchreach.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | pytest.importorskip('mujoco_py') 8 | 9 | common_kwargs = dict( 10 | network='mlp', 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'her': dict(total_timesteps=2000) 16 | } 17 | 18 | @pytest.mark.slow 19 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 20 | def test_fetchreach(alg): 21 | ''' 22 | Test if the algorithm (with an mlp policy) 23 | can learn the FetchReach task 24 | ''' 25 | 26 | kwargs = common_kwargs.copy() 27 | kwargs.update(learn_kwargs[alg]) 28 | 29 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 30 | def env_fn(): 31 | 32 | env = gym.make('FetchReach-v1') 33 | env.seed(0) 34 | return env 35 | 36 | reward_per_episode_test(env_fn, learn_fn, -15) 37 | 38 | if __name__ == '__main__': 39 | test_fetchreach('her') 40 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | 7 | common_kwargs = dict( 8 | seed=0, 9 | total_timesteps=50000, 10 | ) 11 | 12 | learn_kwargs = { 13 | 'a2c': {}, 14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 16 | # github issue: https://github.com/openai/baselines/issues/188 17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 18 | } 19 | 20 | 21 | alg_list = learn_kwargs.keys() 22 | rnn_list = ['lstm'] 23 | 24 | @pytest.mark.slow 25 | @pytest.mark.parametrize("alg", alg_list) 26 | @pytest.mark.parametrize("rnn", rnn_list) 27 | def test_fixed_sequence(alg, rnn): 28 | ''' 29 | Test if the algorithm (with a given policy) 30 | can learn an identity transformation (i.e. return observation as an action) 31 | ''' 32 | 33 | kwargs = learn_kwargs[alg] 34 | kwargs.update(common_kwargs) 35 | 36 | episode_len = 5 37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) 38 | learn = lambda e: get_learn_function(alg)( 39 | env=e, 40 | network=rnn, 41 | **kwargs 42 | ) 43 | 44 | simple_test(env_fn, learn, 0.7) 45 | 46 | 47 | if __name__ == '__main__': 48 | test_fixed_sequence('ppo2', 'lstm') 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | 6 | common_kwargs = dict( 7 | total_timesteps=30000, 8 | network='mlp', 9 | gamma=0.9, 10 | seed=0, 11 | ) 12 | 13 | learn_kwargs = { 14 | 'a2c' : {}, 15 | 'acktr': {}, 16 | 'deepq': {}, 17 | 'ddpg': dict(layer_norm=True), 18 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 19 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 20 | } 21 | 22 | 23 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 24 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] 25 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] 26 | 27 | @pytest.mark.slow 28 | @pytest.mark.parametrize("alg", algos_disc) 29 | def test_discrete_identity(alg): 30 | ''' 31 | Test if the algorithm (with an mlp policy) 32 | can learn an identity transformation (i.e. return observation as an action) 33 | ''' 34 | 35 | kwargs = learn_kwargs[alg] 36 | kwargs.update(common_kwargs) 37 | 38 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 39 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 40 | simple_test(env_fn, learn_fn, 0.9) 41 | 42 | @pytest.mark.slow 43 | @pytest.mark.parametrize("alg", algos_multidisc) 44 | def test_multidiscrete_identity(alg): 45 | ''' 46 | Test if the algorithm (with an mlp policy) 47 | can learn an identity transformation (i.e. return observation as an action) 48 | ''' 49 | 50 | kwargs = learn_kwargs[alg] 51 | kwargs.update(common_kwargs) 52 | 53 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 54 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) 55 | simple_test(env_fn, learn_fn, 0.9) 56 | 57 | @pytest.mark.slow 58 | @pytest.mark.parametrize("alg", algos_cont) 59 | def test_continuous_identity(alg): 60 | ''' 61 | Test if the algorithm (with an mlp policy) 62 | can learn an identity transformation (i.e. return observation as an action) 63 | to a required precision 64 | ''' 65 | 66 | kwargs = learn_kwargs[alg] 67 | kwargs.update(common_kwargs) 68 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 69 | 70 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 71 | simple_test(env_fn, learn_fn, -0.1) 72 | 73 | if __name__ == '__main__': 74 | test_multidiscrete_identity('acktr') 75 | 76 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | 'acer': dict(total_timesteps=20000), 21 | 'deepq': dict(total_timesteps=5000), 22 | 'acktr': dict(total_timesteps=30000), 23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 25 | } 26 | 27 | 28 | #tests pass, but are too slow on travis. Same algorithms are covered 29 | # by other tests with less compute-hungry nn's and by benchmarks 30 | @pytest.mark.skip 31 | @pytest.mark.slow 32 | @pytest.mark.parametrize("alg", learn_args.keys()) 33 | def test_mnist(alg): 34 | ''' 35 | Test if the algorithm can learn to classify MNIST digits. 36 | Uses CNN policy. 37 | ''' 38 | 39 | learn_kwargs = learn_args[alg] 40 | learn_kwargs.update(common_kwargs) 41 | 42 | learn = get_learn_function(alg) 43 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 44 | env_fn = lambda: MnistEnv(seed=0, episode_len=100) 45 | 46 | simple_test(env_fn, learn_fn, 0.6) 47 | 48 | if __name__ == '__main__': 49 | test_mnist('acer') 50 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /control/baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /control/baselines/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.spaces import np_random 4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | 6 | N_TRIALS = 10000 7 | N_EPISODES = 100 8 | 9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 10 | np.random.seed(0) 11 | np_random.seed(0) 12 | 13 | env = DummyVecEnv([env_fn]) 14 | 15 | 16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 17 | tf.set_random_seed(0) 18 | 19 | model = learn_fn(env) 20 | 21 | sum_rew = 0 22 | done = True 23 | 24 | for i in range(n_trials): 25 | if done: 26 | obs = env.reset() 27 | state = model.initial_state 28 | 29 | if state is not None: 30 | a, v, state, _ = model.step(obs, S=state, M=[False]) 31 | else: 32 | a, v, _, _ = model.step(obs) 33 | 34 | obs, rew, done, _ = env.step(a) 35 | sum_rew += float(rew) 36 | 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | 42 | 43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 44 | env = DummyVecEnv([env_fn]) 45 | 46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 47 | model = learn_fn(env) 48 | 49 | N_TRIALS = 100 50 | 51 | observations, actions, rewards = rollout(env, model, N_TRIALS) 52 | rewards = [sum(r) for r in rewards] 53 | 54 | avg_rew = sum(rewards) / N_TRIALS 55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 56 | assert avg_rew > min_avg_reward, \ 57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 58 | 59 | def rollout(env, model, n_trials): 60 | rewards = [] 61 | actions = [] 62 | observations = [] 63 | 64 | for i in range(n_trials): 65 | obs = env.reset() 66 | state = model.initial_state if hasattr(model, 'initial_state') else None 67 | episode_rew = [] 68 | episode_actions = [] 69 | episode_obs = [] 70 | 71 | while True: 72 | if state is not None: 73 | a, v, state, _ = model.step(obs, S=state, M=[False]) 74 | else: 75 | a,v, _, _ = model.step(obs) 76 | 77 | obs, rew, done, _ = env.step(a) 78 | 79 | episode_rew.append(rew) 80 | episode_actions.append(a) 81 | episode_obs.append(obs) 82 | 83 | if done: 84 | break 85 | 86 | rewards.append(episode_rew) 87 | actions.append(episode_actions) 88 | observations.append(episode_obs) 89 | 90 | return observations, actions, rewards 91 | 92 | -------------------------------------------------------------------------------- /control/baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from . import VecEnv 4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info 5 | 6 | class DummyVecEnv(VecEnv): 7 | """ 8 | VecEnv that does runs multiple environments sequentially, that is, 9 | the step and reset commands are send to one environment at a time. 10 | Useful when debugging and when num_env == 1 (in the latter case, 11 | avoids communication overhead) 12 | """ 13 | def __init__(self, env_fns): 14 | """ 15 | Arguments: 16 | 17 | env_fns: iterable of callables functions that build environments 18 | """ 19 | self.envs = [fn() for fn in env_fns] 20 | env = self.envs[0] 21 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 22 | obs_space = env.observation_space 23 | self.keys, shapes, dtypes = obs_space_info(obs_space) 24 | 25 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 26 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 27 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 28 | self.buf_infos = [{} for _ in range(self.num_envs)] 29 | self.actions = None 30 | self.specs = [e.spec for e in self.envs] 31 | 32 | def step_async(self, actions): 33 | listify = True 34 | try: 35 | if len(actions) == self.num_envs: 36 | listify = False 37 | except TypeError: 38 | pass 39 | 40 | if not listify: 41 | self.actions = actions 42 | else: 43 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 44 | self.actions = [actions] 45 | 46 | def step_wait(self): 47 | for e in range(self.num_envs): 48 | action = self.actions[e] 49 | if isinstance(self.envs[e].action_space, spaces.Discrete): 50 | action = int(action) 51 | 52 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 53 | if self.buf_dones[e]: 54 | obs = self.envs[e].reset() 55 | self._save_obs(e, obs) 56 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 57 | self.buf_infos.copy()) 58 | 59 | def reset(self): 60 | for e in range(self.num_envs): 61 | obs = self.envs[e].reset() 62 | self._save_obs(e, obs) 63 | return self._obs_from_buf() 64 | 65 | def _save_obs(self, e, obs): 66 | for k in self.keys: 67 | if k is None: 68 | self.buf_obs[k][e] = obs 69 | else: 70 | self.buf_obs[k][e] = obs[k] 71 | 72 | def _obs_from_buf(self): 73 | return dict_to_obs(copy_obs_dict(self.buf_obs)) 74 | 75 | def get_images(self): 76 | return [env.render(mode='rgb_array') for env in self.envs] 77 | 78 | def render(self, mode='human'): 79 | if self.num_envs == 1: 80 | return self.envs[0].render(mode=mode) 81 | else: 82 | return super().render(mode=mode) 83 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/test_vec_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | import pytest 8 | from .dummy_vec_env import DummyVecEnv 9 | from .shmem_vec_env import ShmemVecEnv 10 | from .subproc_vec_env import SubprocVecEnv 11 | 12 | 13 | def assert_envs_equal(env1, env2, num_steps): 14 | """ 15 | Compare two environments over num_steps steps and make sure 16 | that the observations produced by each are the same when given 17 | the same actions. 18 | """ 19 | assert env1.num_envs == env2.num_envs 20 | assert env1.action_space.shape == env2.action_space.shape 21 | assert env1.action_space.dtype == env2.action_space.dtype 22 | joint_shape = (env1.num_envs,) + env1.action_space.shape 23 | 24 | try: 25 | obs1, obs2 = env1.reset(), env2.reset() 26 | assert np.array(obs1).shape == np.array(obs2).shape 27 | assert np.array(obs1).shape == joint_shape 28 | assert np.allclose(obs1, obs2) 29 | np.random.seed(1337) 30 | for _ in range(num_steps): 31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape), 32 | dtype=env1.action_space.dtype) 33 | for env in [env1, env2]: 34 | env.step_async(actions) 35 | outs1 = env1.step_wait() 36 | outs2 = env2.step_wait() 37 | for out1, out2 in zip(outs1[:3], outs2[:3]): 38 | assert np.array(out1).shape == np.array(out2).shape 39 | assert np.allclose(out1, out2) 40 | assert list(outs1[3]) == list(outs2[3]) 41 | finally: 42 | env1.close() 43 | env2.close() 44 | 45 | 46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv)) 47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 48 | def test_vec_env(klass, dtype): # pylint: disable=R0914 49 | """ 50 | Test that a vectorized environment is equivalent to 51 | DummyVecEnv, since DummyVecEnv is less likely to be 52 | error prone. 53 | """ 54 | num_envs = 3 55 | num_steps = 100 56 | shape = (3, 8) 57 | 58 | def make_fn(seed): 59 | """ 60 | Get an environment constructor with a seed. 61 | """ 62 | return lambda: SimpleEnv(seed, shape, dtype) 63 | fns = [make_fn(i) for i in range(num_envs)] 64 | env1 = DummyVecEnv(fns) 65 | env2 = klass(fns) 66 | assert_envs_equal(env1, env2, num_steps=num_steps) 67 | 68 | 69 | class SimpleEnv(gym.Env): 70 | """ 71 | An environment with a pre-determined observation space 72 | and RNG seed. 73 | """ 74 | 75 | def __init__(self, seed, shape, dtype): 76 | np.random.seed(seed) 77 | self._dtype = dtype 78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape), 79 | dtype=dtype) 80 | self._max_steps = seed + 1 81 | self._cur_obs = None 82 | self._cur_step = 0 83 | # this is 0xFF instead of 0x100 because the Box space includes 84 | # the high end, while randint does not 85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype) 86 | self.observation_space = self.action_space 87 | 88 | def step(self, action): 89 | self._cur_obs += np.array(action, dtype=self._dtype) 90 | self._cur_step += 1 91 | done = self._cur_step >= self._max_steps 92 | reward = self._cur_step / self._max_steps 93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)} 94 | 95 | def reset(self): 96 | self._cur_obs = self._start_obs 97 | self._cur_step = 0 98 | return self._cur_obs 99 | 100 | def render(self, mode=None): 101 | raise NotImplementedError 102 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/test_video_recorder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 3 | """ 4 | 5 | import gym 6 | import pytest 7 | import os 8 | import glob 9 | import tempfile 10 | 11 | from .dummy_vec_env import DummyVecEnv 12 | from .shmem_vec_env import ShmemVecEnv 13 | from .subproc_vec_env import SubprocVecEnv 14 | from .vec_video_recorder import VecVideoRecorder 15 | 16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv)) 17 | @pytest.mark.parametrize('num_envs', (1, 4)) 18 | @pytest.mark.parametrize('video_length', (10, 100)) 19 | @pytest.mark.parametrize('video_interval', (1, 50)) 20 | def test_video_recorder(klass, num_envs, video_length, video_interval): 21 | """ 22 | Wrap an existing VecEnv with VevVideoRecorder, 23 | Make (video_interval + video_length + 1) steps, 24 | then check that the file is present 25 | """ 26 | 27 | def make_fn(): 28 | env = gym.make('PongNoFrameskip-v4') 29 | return env 30 | fns = [make_fn for _ in range(num_envs)] 31 | env = klass(fns) 32 | 33 | with tempfile.TemporaryDirectory() as video_path: 34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length) 35 | 36 | env.reset() 37 | for _ in range(video_interval + video_length + 1): 38 | env.step([0] * num_envs) 39 | env.close() 40 | 41 | 42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4")) 43 | 44 | # first and second step 45 | assert len(recorded_video) == 2 46 | # Files are not empty 47 | assert all(os.stat(p).st_size != 0 for p in recorded_video) 48 | 49 | 50 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for dealing with vectorized environments. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import gym 8 | import numpy as np 9 | 10 | 11 | def copy_obs_dict(obs): 12 | """ 13 | Deep-copy an observation dict. 14 | """ 15 | return {k: np.copy(v) for k, v in obs.items()} 16 | 17 | 18 | def dict_to_obs(obs_dict): 19 | """ 20 | Convert an observation dict into a raw array if the 21 | original observation space was not a Dict space. 22 | """ 23 | if set(obs_dict.keys()) == {None}: 24 | return obs_dict[None] 25 | return obs_dict 26 | 27 | 28 | def obs_space_info(obs_space): 29 | """ 30 | Get dict-structured information about a gym.Space. 31 | 32 | Returns: 33 | A tuple (keys, shapes, dtypes): 34 | keys: a list of dict keys. 35 | shapes: a dict mapping keys to shapes. 36 | dtypes: a dict mapping keys to dtypes. 37 | """ 38 | if isinstance(obs_space, gym.spaces.Dict): 39 | assert isinstance(obs_space.spaces, OrderedDict) 40 | subspaces = obs_space.spaces 41 | else: 42 | subspaces = {None: obs_space} 43 | keys = [] 44 | shapes = {} 45 | dtypes = {} 46 | for key, box in subspaces.items(): 47 | keys.append(key) 48 | shapes[key] = box.shape 49 | dtypes[key] = box.dtype 50 | return keys, shapes, dtypes 51 | 52 | 53 | def obs_to_dict(obs): 54 | """ 55 | Convert an observation into a dict. 56 | """ 57 | if isinstance(obs, dict): 58 | return obs 59 | return {None: obs} 60 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...] = 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/vec_monitor.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.bench.monitor import ResultsWriter 3 | import numpy as np 4 | import time 5 | 6 | 7 | class VecMonitor(VecEnvWrapper): 8 | def __init__(self, venv, filename=None): 9 | VecEnvWrapper.__init__(self, venv) 10 | self.eprets = None 11 | self.eplens = None 12 | self.tstart = time.time() 13 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart}) 14 | 15 | def reset(self): 16 | obs = self.venv.reset() 17 | self.eprets = np.zeros(self.num_envs, 'f') 18 | self.eplens = np.zeros(self.num_envs, 'i') 19 | return obs 20 | 21 | def step_wait(self): 22 | obs, rews, dones, infos = self.venv.step_wait() 23 | self.eprets += rews 24 | self.eplens += 1 25 | newinfos = [] 26 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)): 27 | info = info.copy() 28 | if done: 29 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)} 30 | info['episode'] = epinfo 31 | self.eprets[i] = 0 32 | self.eplens[i] = 0 33 | self.results_writer.write_row(epinfo) 34 | 35 | newinfos.append(info) 36 | 37 | return obs, rews, dones, newinfos 38 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | 6 | class VecNormalize(VecEnvWrapper): 7 | """ 8 | A vectorized wrapper that normalizes the observations 9 | and returns from an environment. 10 | """ 11 | 12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 13 | VecEnvWrapper.__init__(self, venv) 14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 16 | self.clipob = clipob 17 | self.cliprew = cliprew 18 | self.ret = np.zeros(self.num_envs) 19 | self.gamma = gamma 20 | self.epsilon = epsilon 21 | 22 | def step_wait(self): 23 | obs, rews, news, infos = self.venv.step_wait() 24 | self.ret = self.ret * self.gamma + rews 25 | obs = self._obfilt(obs) 26 | if self.ret_rms: 27 | self.ret_rms.update(self.ret) 28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 29 | self.ret[news] = 0. 30 | return obs, rews, news, infos 31 | 32 | def _obfilt(self, obs): 33 | if self.ob_rms: 34 | self.ob_rms.update(obs) 35 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 36 | return obs 37 | else: 38 | return obs 39 | 40 | def reset(self): 41 | self.ret = np.zeros(self.num_envs) 42 | obs = self.venv.reset() 43 | return self._obfilt(obs) 44 | -------------------------------------------------------------------------------- /control/baselines/common/vec_env/vec_video_recorder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from baselines import logger 3 | from baselines.common.vec_env import VecEnvWrapper 4 | from gym.wrappers.monitoring import video_recorder 5 | 6 | 7 | class VecVideoRecorder(VecEnvWrapper): 8 | """ 9 | Wrap VecEnv to record rendered image as mp4 video. 10 | """ 11 | 12 | def __init__(self, venv, directory, record_video_trigger, video_length=200): 13 | """ 14 | # Arguments 15 | venv: VecEnv to wrap 16 | directory: Where to save videos 17 | record_video_trigger: 18 | Function that defines when to start recording. 19 | The function takes the current number of step, 20 | and returns whether we should start recording or not. 21 | video_length: Length of recorded video 22 | """ 23 | 24 | VecEnvWrapper.__init__(self, venv) 25 | self.record_video_trigger = record_video_trigger 26 | self.video_recorder = None 27 | 28 | self.directory = os.path.abspath(directory) 29 | if not os.path.exists(self.directory): os.mkdir(self.directory) 30 | 31 | self.file_prefix = "vecenv" 32 | self.file_infix = '{}'.format(os.getpid()) 33 | self.step_id = 0 34 | self.video_length = video_length 35 | 36 | self.recording = False 37 | self.recorded_frames = 0 38 | 39 | def reset(self): 40 | obs = self.venv.reset() 41 | 42 | self.start_video_recorder() 43 | 44 | return obs 45 | 46 | def start_video_recorder(self): 47 | self.close_video_recorder() 48 | 49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id)) 50 | self.video_recorder = video_recorder.VideoRecorder( 51 | env=self.venv, 52 | base_path=base_path, 53 | metadata={'step_id': self.step_id} 54 | ) 55 | 56 | self.video_recorder.capture_frame() 57 | self.recorded_frames = 1 58 | self.recording = True 59 | 60 | def _video_enabled(self): 61 | return self.record_video_trigger(self.step_id) 62 | 63 | def step_wait(self): 64 | obs, rews, dones, infos = self.venv.step_wait() 65 | 66 | self.step_id += 1 67 | if self.recording: 68 | self.video_recorder.capture_frame() 69 | self.recorded_frames += 1 70 | if self.recorded_frames > self.video_length: 71 | logger.info("Saving video to ", self.video_recorder.path) 72 | self.close_video_recorder() 73 | elif self._video_enabled(): 74 | self.start_video_recorder() 75 | 76 | return obs, rews, dones, infos 77 | 78 | def close_video_recorder(self): 79 | if self.recording: 80 | self.video_recorder.close() 81 | self.recording = False 82 | self.recorded_frames = 0 83 | 84 | def close(self): 85 | VecEnvWrapper.close(self) 86 | self.close_video_recorder() 87 | 88 | def __del__(self): 89 | self.close() 90 | -------------------------------------------------------------------------------- /control/baselines/ppoc_int/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/ppoc_int/__init__.py -------------------------------------------------------------------------------- /control/baselines/ppoc_int/assets/twod_tmaze.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 48 | -------------------------------------------------------------------------------- /control/baselines/ppoc_int/normalized_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | from gym.envs.registration import load 6 | 7 | 8 | class NormalizedActionWrapper(gym.ActionWrapper): 9 | """Environment wrapper to normalize the action space to [-1, 1]. This 10 | wrapper is adapted from rllab's [1] wrapper `NormalizedEnv` 11 | https://github.com/rll/rllab/blob/b3a28992eca103cab3cb58363dd7a4bb07f250a0/rllab/envs/normalized_env.py 12 | [1] Yan Duan, Xi Chen, Rein Houthooft, John Schulman, Pieter Abbeel, 13 | "Benchmarking Deep Reinforcement Learning for Continuous Control", 2016 14 | (https://arxiv.org/abs/1604.06778) 15 | """ 16 | def __init__(self, env): 17 | super(NormalizedActionWrapper, self).__init__(env) 18 | self.action_space = spaces.Box(low=-1.0, high=1.0, 19 | shape=self.env.action_space.shape) 20 | 21 | def action(self, action): 22 | # Clip the action in [-1, 1] 23 | action = np.clip(action, -1.0, 1.0) 24 | # Map the normalized action to original action space 25 | lb, ub = self.env.action_space.low, self.env.action_space.high 26 | action = lb + 0.5 * (action + 1.0) * (ub - lb) 27 | return action 28 | 29 | def reverse_action(self, action): 30 | # Map the original action to normalized action space 31 | lb, ub = self.env.action_space.low, self.env.action_space.high 32 | action = 2.0 * (action - lb) / (ub - lb) - 1.0 33 | # Clip the action in [-1, 1] 34 | action = np.clip(action, -1.0, 1.0) 35 | return action 36 | 37 | 38 | 39 | def mujoco_wrapper(entry_point, **kwargs): 40 | # Load the environment from its entry point 41 | env_cls = load(entry_point) 42 | env = env_cls(**kwargs) 43 | # Normalization wrapper 44 | env = NormalizedActionWrapper(env) 45 | return env -------------------------------------------------------------------------------- /control/baselines/ppoc_int/plot_res.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns; sns.set(color_codes=True) 3 | import numpy as np 4 | from collections import deque 5 | import pdb 6 | sns.set(style='ticks') 7 | 8 | name='TMaze' 9 | 10 | seeds = [0,1,2,3,4,] 11 | shortest=np.inf 12 | data=[] 13 | axes=[] 14 | direc='res' 15 | for seed in seeds: 16 | dat = np.genfromtxt('{}/{}seed{}_intfc1_2opts.csv'.format(direc,name,seed), delimiter=',')[1:200,1] 17 | print(len(dat)) 18 | if len(dat) < shortest: 19 | shortest=len(dat) 20 | 21 | rewbuffer = deque(maxlen=100) 22 | real_dat=[] 23 | for d in dat: 24 | rewbuffer.append(d) 25 | real_dat.append(np.mean(rewbuffer)) 26 | data.append(real_dat) 27 | for i in range(len(data)): 28 | data[i] = data[i][:shortest] 29 | axes.append(sns.tsplot(data=data,legend=True,condition='IOC',color='red')) 30 | 31 | 32 | 33 | shortest=np.inf 34 | data=[] 35 | for seed in seeds: 36 | dat = np.genfromtxt('{}/{}seed{}_intfc0_2opts.csv'.format(direc,name,seed), delimiter=',')[1:200,1] 37 | print(len(dat)) 38 | if len(dat) < shortest: 39 | shortest=len(dat) 40 | 41 | rewbuffer = deque(maxlen=100) 42 | real_dat=[] 43 | for d in dat: 44 | rewbuffer.append(d) 45 | real_dat.append(np.mean(rewbuffer)) 46 | data.append(real_dat) 47 | for i in range(len(data)): 48 | data[i] = data[i][:shortest] 49 | axes.append(sns.tsplot(data=data,legend=True,condition='OC',color='blue')) 50 | 51 | 52 | plt.gcf().subplots_adjust(bottom=0.15) 53 | plt.xlabel('Iterations',fontsize=18) 54 | plt.ylabel('Average Rewards',fontsize=18) 55 | plt.legend() 56 | plt.title("Results on {}-v0".format(name)) 57 | plt.savefig('plots/{}_notrans.png'.format(name)) 58 | plt.clf() 59 | -------------------------------------------------------------------------------- /control/baselines/ppoc_int/run_mujoco.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | from baselines.common import set_global_seeds, tf_util as U 3 | import gym, logging 4 | from baselines import logger 5 | from half_cheetah import * 6 | 7 | 8 | def train(env_id, num_timesteps, seed, num_options,app, saves ,wsaves, epoch,dc,plots,w_intfc,switch,mainlr,intlr,piolr,fewshot,k): 9 | from baselines.ppoc_int import mlp_policy, pposgd_simple 10 | U.make_session(num_cpu=1).__enter__() 11 | set_global_seeds(seed) 12 | 13 | if env_id=="TMaze": 14 | from twod_tmaze import TMaze 15 | env=TMaze() 16 | env.seed(seed) 17 | else: 18 | env = gym.make(env_id) 19 | env._seed(seed) 20 | 21 | 22 | def policy_fn(name, ob_space, ac_space): 23 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 24 | hid_size=64, num_hid_layers=2, num_options=num_options, dc=dc, w_intfc=w_intfc,k=k) 25 | 26 | gym.logger.setLevel(logging.WARN) 27 | 28 | if num_options ==1: 29 | optimsize=64 30 | elif num_options ==2: 31 | optimsize=32 32 | else: 33 | optimsize=int(64/num_options) 34 | 35 | 36 | num_timesteps = num_timesteps #if env_id!="TMaze" else 5e5 37 | tperbatch = 2048 if not epoch else int(1e4) 38 | pposgd_simple.learn(env, policy_fn, 39 | max_timesteps=num_timesteps, 40 | timesteps_per_batch=tperbatch, 41 | clip_param=0.2, entcoeff=0.0, 42 | optim_epochs=10, optim_stepsize=mainlr, optim_batchsize=optimsize, 43 | gamma=0.99, lam=0.95, schedule='constant', num_options=num_options, 44 | app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed,dc=dc,plots=plots, 45 | w_intfc=w_intfc,switch=switch,intlr=intlr,piolr=piolr,fewshot=fewshot,k=k 46 | ) 47 | env.close() 48 | 49 | def main(): 50 | import argparse 51 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 52 | parser.add_argument('--env', help='environment ID', default='TMaze') 53 | parser.add_argument('--timesteps', help='number of timesteps', type=int, default=1e6) 54 | parser.add_argument('--seed', help='RNG seed', type=int, default=1) 55 | parser.add_argument('--opt', help='number of options', type=int, default=2) 56 | parser.add_argument('--app', help='Append to folder name', type=str, default='') 57 | parser.add_argument('--saves', help='Save the returns at each iteration', dest='saves', action='store_true', default=False) 58 | parser.add_argument('--wsaves', help='Save the weights',dest='wsaves', action='store_true', default=False) 59 | parser.add_argument('--plots', help='Plot some visualization', dest='plots', action='store_true', default=False) 60 | parser.add_argument('--switch', help='Switch task after 150 iterations', dest='switch', action='store_true', default=False) 61 | parser.add_argument('--fewshot', help='Value learning after 150 iterations', dest='fewshot', action='store_true', default=False) 62 | parser.add_argument('--nointfc', help='Disables interet functions', dest='w_intfc', action='store_false', default=True) 63 | parser.add_argument('--epoch', help='Load weights from a certain epoch', type=int, default=0) 64 | parser.add_argument('--dc', help='Deliberation cost (not used)', type=float, default=0.) 65 | parser.add_argument('--mainlr', type=float, default=3e-4) 66 | parser.add_argument('--intlr', type=float, default=1e-4) 67 | parser.add_argument('--piolr', type=float, default=1e-4) 68 | parser.add_argument('--k', type=float, default=0., help='threshold for interest function') 69 | 70 | 71 | 72 | 73 | args = parser.parse_args() 74 | 75 | train(args.env, num_timesteps=args.timesteps, seed=args.seed, num_options=args.opt, app=args.app, 76 | saves=args.saves, wsaves=args.wsaves, epoch=args.epoch,dc=args.dc,plots=args.plots, 77 | w_intfc=args.w_intfc,switch=args.switch,mainlr=args.mainlr,intlr=args.intlr,piolr=args.piolr,fewshot=args.fewshot,k=args.k) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /control/baselines/ppoc_int/seeding.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import numpy as np 3 | import os 4 | import random as _random 5 | import struct 6 | import sys 7 | 8 | from gym import error 9 | 10 | if sys.version_info < (3,): 11 | integer_types = (int, long) 12 | else: 13 | integer_types = (int,) 14 | 15 | # Fortunately not needed right now! 16 | # 17 | # def random(seed=None): 18 | # seed = _seed(seed) 19 | # 20 | # rng = _random.Random() 21 | # rng.seed(hash_seed(seed)) 22 | # return rng, seed 23 | 24 | def np_random(seed=None): 25 | if seed is not None and not (isinstance(seed, integer_types) and 0 <= seed): 26 | raise error.Error('Seed must be a non-negative integer or omitted, not {}'.format(seed)) 27 | 28 | seed = _seed(seed) 29 | 30 | rng = np.random.RandomState() 31 | rng.seed(_int_list_from_bigint(hash_seed(seed))) 32 | return rng, seed 33 | 34 | def hash_seed(seed=None, max_bytes=8): 35 | """Any given evaluation is likely to have many PRNG's active at 36 | once. (Most commonly, because the environment is running in 37 | multiple processes.) There's literature indicating that having 38 | linear correlations between seeds of multiple PRNG's can correlate 39 | the outputs: 40 | 41 | http://blogs.unity3d.com/2015/01/07/a-primer-on-repeatable-random-numbers/ 42 | http://stackoverflow.com/questions/1554958/how-different-do-random-seeds-need-to-be 43 | http://dl.acm.org/citation.cfm?id=1276928 44 | 45 | Thus, for sanity we hash the seeds before using them. (This scheme 46 | is likely not crypto-strength, but it should be good enough to get 47 | rid of simple correlations.) 48 | 49 | Args: 50 | seed (Optional[int]): None seeds from an operating system specific randomness source. 51 | max_bytes: Maximum number of bytes to use in the hashed seed. 52 | """ 53 | if seed is None: 54 | seed = _seed(max_bytes=max_bytes) 55 | hash = hashlib.sha512(str(seed).encode('utf8')).digest() 56 | return _bigint_from_bytes(hash[:max_bytes]) 57 | 58 | def _seed(a=None, max_bytes=8): 59 | """Create a strong random seed. Otherwise, Python 2 would seed using 60 | the system time, which might be non-robust especially in the 61 | presence of concurrency. 62 | 63 | Args: 64 | a (Optional[int, str]): None seeds from an operating system specific randomness source. 65 | max_bytes: Maximum number of bytes to use in the seed. 66 | """ 67 | # Adapted from https://svn.python.org/projects/python/tags/r32/Lib/random.py 68 | if a is None: 69 | a = _bigint_from_bytes(os.urandom(max_bytes)) 70 | elif isinstance(a, str): 71 | a = a.encode('utf8') 72 | a += hashlib.sha512(a).digest() 73 | a = _bigint_from_bytes(a[:max_bytes]) 74 | elif isinstance(a, integer_types): 75 | a = a % 2**(8 * max_bytes) 76 | else: 77 | raise error.Error('Invalid type for seed: {} ({})'.format(type(a), a)) 78 | 79 | return a 80 | 81 | # TODO: don't hardcode sizeof_int here 82 | def _bigint_from_bytes(bytes): 83 | sizeof_int = 4 84 | padding = sizeof_int - len(bytes) % sizeof_int 85 | bytes += b'\0' * padding 86 | int_count = int(len(bytes) / sizeof_int) 87 | unpacked = struct.unpack("{}I".format(int_count), bytes) 88 | accum = 0 89 | for i, val in enumerate(unpacked): 90 | accum += 2 ** (sizeof_int * 8 * i) * val 91 | return accum 92 | 93 | def _int_list_from_bigint(bigint): 94 | # Special case 0 95 | if bigint < 0: 96 | raise error.Error('Seed must be non-negative, not {}'.format(bigint)) 97 | elif bigint == 0: 98 | return [0] 99 | 100 | ints = [] 101 | while bigint > 0: 102 | bigint, mod = divmod(bigint, 2 ** 32) 103 | ints.append(mod) 104 | return ints 105 | -------------------------------------------------------------------------------- /control/baselines/ppoc_int/twod_tmaze.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym.spaces import Box 4 | import seeding 5 | 6 | 7 | class TwoDEnv(mujoco_env.MujocoEnv): 8 | def __init__(self, model_path, frame_skip, xbounds, ybounds): 9 | super(TwoDEnv, self).__init__(model_path=model_path, frame_skip=frame_skip) 10 | assert isinstance(self.observation_space, Box) 11 | assert self.observation_space.shape == (2,) 12 | 13 | def get_viewer(self): 14 | return self._get_viewer() 15 | 16 | import numpy as np 17 | from gym import utils 18 | import os 19 | 20 | 21 | 22 | 23 | def get_asset_xml(xml_name): 24 | return os.path.join(os.path.join(os.path.dirname(__file__), 'assets'), xml_name) 25 | 26 | class TMaze(TwoDEnv, utils.EzPickle): 27 | NAME='TMaze' 28 | def __init__(self, verbose=False,change_goal=None): 29 | self.verbose = verbose 30 | self.steps = 0 31 | self.change_goal = change_goal 32 | utils.EzPickle.__init__(self) 33 | TwoDEnv.__init__(self, get_asset_xml('twod_tmaze.xml'), 2, xbounds=[-0.3,0.3], ybounds=[-0.3,0.3]) 34 | 35 | 36 | def _step(self, a): 37 | self.do_simulation(a, self.frame_skip) 38 | ob = self._get_obs() 39 | pos = ob[0:2] 40 | 41 | if not self.change_goal: 42 | target = self.model.body_pos.copy()[-1][:2] 43 | else: 44 | target = self.change_goal 45 | dist_thresh = 0.1 46 | 47 | 48 | 49 | if pos[0]>target[0]-dist_thresh and pos[0]target[1]-dist_thresh: 51 | reward = 1. 52 | else: 53 | reward = 0. 54 | 55 | self.steps += 1 56 | if self.verbose: 57 | print(pos, reward) 58 | done = self.steps >= 500 or int(reward) 59 | return ob, reward, done, np.concatenate([self.model.data.qvel]).ravel() 60 | 61 | def reset_model(self): 62 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-0.01, high=0.01) 63 | qvel = self.init_qvel + self.np_random.uniform(size=self.model.nv, low=-0.01, high=0.01) 64 | self.set_state(qpos, qvel) 65 | self.steps = 0 66 | return self._get_obs() 67 | 68 | def _get_obs(self): 69 | init_pos = self.model.body_pos.copy()[1][:2] 70 | return np.concatenate([self.model.data.qpos]).ravel() + init_pos 71 | 72 | def viewer_setup(self): 73 | v = self.viewer 74 | 75 | def seed(self, seed=None): 76 | self.np_random, seed = seeding.np_random(seed) 77 | -------------------------------------------------------------------------------- /control/baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.common import plot_util 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | Y_REWARD = 'reward' 14 | Y_TIMESTEPS = 'timesteps' 15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 16 | EPISODES_WINDOW = 100 17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 19 | 'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue'] 20 | 21 | def rolling_window(a, window): 22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 23 | strides = a.strides + (a.strides[-1],) 24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 25 | 26 | def window_func(x, y, window, func): 27 | yw = rolling_window(y, window) 28 | yw_func = func(yw, axis=-1) 29 | return x[window-1:], yw_func 30 | 31 | def ts2xy(ts, xaxis, yaxis): 32 | if xaxis == X_TIMESTEPS: 33 | x = np.cumsum(ts.l.values) 34 | elif xaxis == X_EPISODES: 35 | x = np.arange(len(ts)) 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 38 | else: 39 | raise NotImplementedError 40 | if yaxis == Y_REWARD: 41 | y = ts.r.values 42 | elif yaxis == Y_TIMESTEPS: 43 | y = ts.l.values 44 | else: 45 | raise NotImplementedError 46 | return x, y 47 | 48 | def plot_curves(xy_list, xaxis, yaxis, title): 49 | fig = plt.figure(figsize=(8,2)) 50 | maxx = max(xy[0][-1] for xy in xy_list) 51 | minx = 0 52 | for (i, (x, y)) in enumerate(xy_list): 53 | color = COLORS[i % len(COLORS)] 54 | plt.scatter(x, y, s=2) 55 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 56 | plt.plot(x, y_mean, color=color) 57 | plt.xlim(minx, maxx) 58 | plt.title(title) 59 | plt.xlabel(xaxis) 60 | plt.ylabel(yaxis) 61 | plt.tight_layout() 62 | fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout()) 63 | plt.grid(True) 64 | 65 | 66 | def split_by_task(taskpath): 67 | return taskpath['dirname'].split('/')[-1].split('-')[0] 68 | 69 | def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task): 70 | results = plot_util.load_results(dirs) 71 | plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6)) 72 | 73 | # Example usage in jupyter-notebook 74 | # from baselines.results_plotter import plot_results 75 | # %matplotlib inline 76 | # plot_results("./log") 77 | # Here ./log is a directory containing the monitor.csv files 78 | 79 | def main(): 80 | import argparse 81 | import os 82 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 83 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) 84 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 85 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 86 | parser.add_argument('--yaxis', help = 'Varible on Y-axis', default = Y_REWARD) 87 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout') 88 | args = parser.parse_args() 89 | args.dirs = [os.path.abspath(dir) for dir in args.dirs] 90 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.yaxis, args.task_name) 91 | plt.show() 92 | 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /control/data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/data/cartpole.gif -------------------------------------------------------------------------------- /control/data/fetchPickAndPlaceContrast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/data/fetchPickAndPlaceContrast.png -------------------------------------------------------------------------------- /control/data/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/data/logo.jpg -------------------------------------------------------------------------------- /control/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = F,E999,W291,W293 3 | exclude = 4 | .git, 5 | __pycache__, 6 | baselines/ppo1, 7 | baselines/bench, 8 | -------------------------------------------------------------------------------- /control/setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages 3 | import sys 4 | 5 | if sys.version_info.major != 3: 6 | print('This Python is only compatible with Python 3, but you are running ' 7 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major)) 8 | 9 | 10 | extras = { 11 | 'test': [ 12 | 'filelock', 13 | 'pytest', 14 | 'pytest-forked', 15 | 'atari-py' 16 | ], 17 | 'bullet': [ 18 | 'pybullet', 19 | ], 20 | 'mpi': [ 21 | 'mpi4py' 22 | ] 23 | } 24 | 25 | all_deps = [] 26 | for group_name in extras: 27 | all_deps += extras[group_name] 28 | 29 | extras['all'] = all_deps 30 | 31 | setup(name='baselines', 32 | packages=[package for package in find_packages() 33 | if package.startswith('baselines')], 34 | install_requires=[ 35 | 'gym', 36 | 'scipy', 37 | 'tqdm', 38 | 'joblib', 39 | 'dill', 40 | 'progressbar2', 41 | 'cloudpickle', 42 | 'click', 43 | 'opencv-python' 44 | ], 45 | extras_require=extras, 46 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 47 | author='OpenAI', 48 | url='https://github.com/openai/baselines', 49 | author_email='gym@openai.com', 50 | version='0.1.5') 51 | 52 | 53 | # ensure there is some tensorflow build with version above 1.4 54 | import pkg_resources 55 | tf_pkg = None 56 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']: 57 | try: 58 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name) 59 | except pkg_resources.DistributionNotFound: 60 | pass 61 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4' 62 | from distutils.version import LooseVersion 63 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0') 64 | -------------------------------------------------------------------------------- /launcher_miniworld.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | main_lr=(1e-4) #(1e-4 3e-4 7e-4 5e-4) 4 | int_lr=(9e-5) #(1e-4 3e-3 8e-4 8e-5 5e-4 3e-4 9e-5) #(3e-3 8e-5 8e-4) 5 | seed=(0) 6 | port=({4000..4020}) 7 | envname="MiniWorld-OneRoom-v0" #"MiniWorld-PickupObjs-v0" #MiniWorld-PutNext-v0 8 | numoption=2 9 | 10 | count=0 11 | for _main_lr in ${main_lr[@]} 12 | do 13 | for _int_lr in ${int_lr[@]} 14 | do 15 | for _seed in ${seed[@]} 16 | do 17 | if [ -f temprun.sh ] ; then 18 | rm temprun.sh 19 | fi 20 | 21 | echo "#!/bin/bash" >> temprun.sh 22 | echo "#SBATCH --account=addccaccounthere" >> temprun.sh 23 | echo "#SBATCH --output=\"/scratch/username/slurm-%j.out\"" >> temprun.sh 24 | echo "#SBATCH --gres=gpu:1" >> temprun.sh 25 | echo "#SBATCH --mem=30G" >> temprun.sh 26 | echo "#SBATCH --time=10:00:00" >> temprun.sh 27 | echo "source $HOME/intf/bin/activate" >> temprun.sh 28 | echo "cd $HOME/ioc/miniworld/baselines/ppoc_int/" >> temprun.sh 29 | k="xvfb-run -n "${port[$count]}" -s \"-screen 0 1024x768x24 -ac +extension GLX +render -noreset\" python run_miniw.py --env "$envname" --seed $_seed --opt $numoption --saves --mainlr $_main_lr --intlr $_int_lr --switch --wsaves" 30 | echo $k >> temprun.sh 31 | echo $k 32 | eval "sbatch temprun.sh" 33 | rm temprun.sh 34 | count=$((count + 1)) 35 | done 36 | done 37 | done 38 | -------------------------------------------------------------------------------- /launcher_mujoco.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | seed=(0 1 2 3 4) 4 | mainlr=(1e-4) 5 | #intfclr=(1e-4 3e-4 5e-4 7e-4 9e-4) 6 | intfclr=(5e-4) 7 | #piolr=(7e-4 9e-4 3e-4 5e-4) 8 | piolr=(3e-4) 9 | 10 | port=($(seq 4000 1 4100)) 11 | 12 | envname="HalfCheetahDir-v1" 13 | 14 | count=0 15 | 16 | for _piolr in ${piolr[@]} 17 | do 18 | for _intfclr in ${intfclr[@]} 19 | do 20 | for _mainlr in ${mainlr[@]} 21 | do 22 | for _seed in ${seed[@]} 23 | do 24 | if [ -f temprun.sh ] ; then 25 | rm temprun.sh 26 | fi 27 | echo "#!/bin/bash" >> temprun.sh 28 | echo "#SBATCH --account=addaccounthere" >> temprun.sh 29 | echo "#SBATCH --output=\"/scratch/username/maml/Maml_seed${_seed}_mainlr${_mainlr}_intfclr_${_intfclr}_piolr_${_piolr}-%j.out\"" >> temprun.sh 30 | echo "#SBATCH --job-name=Maml_seed${_seed}_mainlr${_mainlr}_intfclr_${_intfclr}_piolr_${_piolr}" >> temprun.sh 31 | echo "#SBATCH --gres=gpu:0" >> temprun.sh 32 | echo "#SBATCH --mem=5G" >> temprun.sh 33 | echo "#SBATCH --time=1:00:00" >> temprun.sh 34 | echo "source $HOME/miniconda3/etc/profile.d/conda.sh" >> temprun.sh 35 | echo "conda activate intfc" >> temprun.sh 36 | echo "cd $HOME/ioc/control/baselines/ppoc_int/" >> temprun.sh 37 | k="xvfb-run -n "${port[$count]}" -s \"-screen 0 1024x768x24 -ac +extension GLX +render -noreset\" python run_mujoco.py --env "$envname" --saves --opt 2 --seed ${_seed} --mainlr ${_mainlr} --intlr ${_intfclr} --piolr ${_piolr} --switch --wsaves" 38 | echo $k >> temprun.sh 39 | echo $k 40 | eval "sbatch temprun.sh" 41 | rm temprun.sh 42 | count=$((count + 1)) 43 | done 44 | done 45 | done 46 | done 47 | -------------------------------------------------------------------------------- /miniworld/.benchmark_pattern: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /miniworld/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | 9 | # Setuptools distribution and build folders. 10 | /dist/ 11 | /build 12 | keys/ 13 | 14 | # Virtualenv 15 | /env 16 | 17 | 18 | *.sublime-project 19 | *.sublime-workspace 20 | 21 | .idea 22 | 23 | logs/ 24 | 25 | .ipynb_checkpoints 26 | ghostdriver.log 27 | 28 | htmlcov 29 | 30 | junk 31 | src 32 | 33 | *.egg-info 34 | .cache 35 | 36 | MUJOCO_LOG.TXT 37 | -------------------------------------------------------------------------------- /miniworld/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | services: 6 | - docker 7 | 8 | install: 9 | - pip install flake8 10 | - docker build . -t baselines-test 11 | 12 | script: 13 | - flake8 . --show-source --statistics 14 | - docker run baselines-test pytest -v --forked . 15 | -------------------------------------------------------------------------------- /miniworld/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN apt-get -y update && apt-get -y install ffmpeg 4 | # RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv 5 | 6 | ENV CODE_DIR /root/code 7 | 8 | COPY . $CODE_DIR/baselines 9 | WORKDIR $CODE_DIR/baselines 10 | 11 | # Clean up pycache and pyc files 12 | RUN rm -rf __pycache__ && \ 13 | find . -name "*.pyc" -delete && \ 14 | pip install tensorflow && \ 15 | pip install -e .[test] 16 | 17 | 18 | CMD /bin/bash 19 | -------------------------------------------------------------------------------- /miniworld/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /miniworld/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/__init__.py -------------------------------------------------------------------------------- /miniworld/baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * 3 | -------------------------------------------------------------------------------- /miniworld/baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /miniworld/baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x 35 | -------------------------------------------------------------------------------- /miniworld/baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | import shlex 6 | import subprocess 7 | 8 | # ================================================================ 9 | # Misc 10 | # ================================================================ 11 | 12 | def fmt_row(width, row, header=False): 13 | out = " | ".join(fmt_item(x, width) for x in row) 14 | if header: out = out + "\n" + "-"*len(out) 15 | return out 16 | 17 | def fmt_item(x, l): 18 | if isinstance(x, np.ndarray): 19 | assert x.ndim==0 20 | x = x.item() 21 | if isinstance(x, (float, np.float32, np.float64)): 22 | v = abs(x) 23 | if (v < 1e-4 or v > 1e+4) and v > 0: 24 | rep = "%7.2e" % x 25 | else: 26 | rep = "%7.5f" % x 27 | else: rep = str(x) 28 | return " "*(l - len(rep)) + rep 29 | 30 | color2num = dict( 31 | gray=30, 32 | red=31, 33 | green=32, 34 | yellow=33, 35 | blue=34, 36 | magenta=35, 37 | cyan=36, 38 | white=37, 39 | crimson=38 40 | ) 41 | 42 | def colorize(string, color='green', bold=False, highlight=False): 43 | attr = [] 44 | num = color2num[color] 45 | if highlight: num += 10 46 | attr.append(str(num)) 47 | if bold: attr.append('1') 48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 49 | 50 | def print_cmd(cmd, dry=False): 51 | if isinstance(cmd, str): # for shell=True 52 | pass 53 | else: 54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd) 55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd)) 56 | 57 | 58 | def get_git_commit(cwd=None): 59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8') 60 | 61 | def get_git_commit_message(cwd=None): 62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8') 63 | 64 | def ccap(cmd, dry=False, env=None, **kwargs): 65 | print_cmd(cmd, dry) 66 | if not dry: 67 | subprocess.check_call(cmd, env=env, **kwargs) 68 | 69 | 70 | MESSAGE_DEPTH = 0 71 | 72 | @contextmanager 73 | def timed(msg): 74 | global MESSAGE_DEPTH #pylint: disable=W0603 75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 76 | tstart = time.time() 77 | MESSAGE_DEPTH += 1 78 | yield 79 | MESSAGE_DEPTH -= 1 80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 81 | -------------------------------------------------------------------------------- /miniworld/baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /miniworld/baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Discrete, Box, MultiDiscrete 4 | 5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 6 | ''' 7 | Create placeholder to feed observations into of the size appropriate to the observation space 8 | 9 | Parameters: 10 | ---------- 11 | 12 | ob_space: gym.Space observation space 13 | 14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 15 | 16 | name: str name of the placeholder 17 | 18 | Returns: 19 | ------- 20 | 21 | tensorflow placeholder tensor 22 | ''' 23 | 24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \ 25 | 'Can only deal with Discrete and Box observation spaces for now' 26 | 27 | dtype = ob_space.dtype 28 | if dtype == np.int8: 29 | dtype = np.uint8 30 | 31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name) 32 | 33 | 34 | def observation_input(ob_space, batch_size=None, name='Ob'): 35 | ''' 36 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 37 | encoder of the appropriate type. 38 | ''' 39 | 40 | placeholder = observation_placeholder(ob_space, batch_size, name) 41 | return placeholder, encode_observation(ob_space, placeholder) 42 | 43 | def encode_observation(ob_space, placeholder): 44 | ''' 45 | Encode input in the way that is appropriate to the observation space 46 | 47 | Parameters: 48 | ---------- 49 | 50 | ob_space: gym.Space observation space 51 | 52 | placeholder: tf.placeholder observation input placeholder 53 | ''' 54 | if isinstance(ob_space, Discrete): 55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 56 | elif isinstance(ob_space, Box): 57 | return tf.to_float(placeholder) 58 | elif isinstance(ob_space, MultiDiscrete): 59 | placeholder = tf.cast(placeholder, tf.int32) 60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])] 61 | return tf.concat(one_hots, axis=-1) 62 | else: 63 | raise NotImplementedError 64 | 65 | -------------------------------------------------------------------------------- /miniworld/baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) 86 | -------------------------------------------------------------------------------- /miniworld/baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | import baselines.common.tf_util as U 2 | import tensorflow as tf 3 | import numpy as np 4 | try: 5 | from mpi4py import MPI 6 | except ImportError: 7 | MPI = None 8 | 9 | 10 | class MpiAdam(object): 11 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 12 | self.var_list = var_list 13 | self.beta1 = beta1 14 | self.beta2 = beta2 15 | self.epsilon = epsilon 16 | self.scale_grad_by_procs = scale_grad_by_procs 17 | size = sum(U.numel(v) for v in var_list) 18 | self.m = np.zeros(size, 'float32') 19 | self.v = np.zeros(size, 'float32') 20 | self.t = 0 21 | self.setfromflat = U.SetFromFlat(var_list) 22 | self.getflat = U.GetFlat(var_list) 23 | self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm 24 | 25 | def update(self, localg, stepsize): 26 | if self.t % 100 == 0: 27 | self.check_synced() 28 | localg = localg.astype('float32') 29 | if self.comm is not None: 30 | globalg = np.zeros_like(localg) 31 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 32 | if self.scale_grad_by_procs: 33 | globalg /= self.comm.Get_size() 34 | else: 35 | globalg = np.copy(localg) 36 | 37 | self.t += 1 38 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 39 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 40 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 41 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 42 | self.setfromflat(self.getflat() + step) 43 | 44 | def sync(self): 45 | if self.comm is None: 46 | return 47 | theta = self.getflat() 48 | self.comm.Bcast(theta, root=0) 49 | self.setfromflat(theta) 50 | 51 | def check_synced(self): 52 | if self.comm is None: 53 | return 54 | if self.comm.Get_rank() == 0: # this is root 55 | theta = self.getflat() 56 | self.comm.Bcast(theta, root=0) 57 | else: 58 | thetalocal = self.getflat() 59 | thetaroot = np.empty_like(thetalocal) 60 | self.comm.Bcast(thetaroot, root=0) 61 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 62 | 63 | @U.in_session 64 | def test_MpiAdam(): 65 | np.random.seed(0) 66 | tf.set_random_seed(0) 67 | 68 | a = tf.Variable(np.random.randn(3).astype('float32')) 69 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 70 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 71 | 72 | stepsize = 1e-2 73 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 74 | do_update = U.function([], loss, updates=[update_op]) 75 | 76 | tf.get_default_session().run(tf.global_variables_initializer()) 77 | losslist_ref = [] 78 | for i in range(10): 79 | l = do_update() 80 | print(i, l) 81 | losslist_ref.append(l) 82 | 83 | 84 | 85 | tf.set_random_seed(0) 86 | tf.get_default_session().run(tf.global_variables_initializer()) 87 | 88 | var_list = [a,b] 89 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)]) 90 | adam = MpiAdam(var_list) 91 | 92 | losslist_test = [] 93 | for i in range(10): 94 | l,g = lossandgrad() 95 | adam.update(g, stepsize) 96 | print(i,l) 97 | losslist_test.append(l) 98 | 99 | np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4) 100 | 101 | 102 | if __name__ == '__main__': 103 | test_MpiAdam() 104 | -------------------------------------------------------------------------------- /miniworld/baselines/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | 5 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 6 | """Adam optimizer that averages gradients across mpi processes.""" 7 | def __init__(self, comm, **kwargs): 8 | self.comm = comm 9 | tf.train.AdamOptimizer.__init__(self, **kwargs) 10 | def compute_gradients(self, loss, var_list, **kwargs): 11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) 14 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 15 | sizes = [int(np.prod(s)) for s in shapes] 16 | 17 | num_tasks = self.comm.Get_size() 18 | buf = np.zeros(sum(sizes), np.float32) 19 | 20 | def _collect_grads(flat_grad): 21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 22 | np.divide(buf, float(num_tasks), out=buf) 23 | return buf 24 | 25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 26 | avg_flat_grad.set_shape(flat_grad.shape) 27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 29 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 30 | 31 | return avg_grads_and_vars 32 | -------------------------------------------------------------------------------- /miniworld/baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /miniworld/baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /miniworld/baselines/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from mpi4py import MPI 3 | import os, numpy as np 4 | import platform 5 | import shutil 6 | import subprocess 7 | 8 | def sync_from_root(sess, variables, comm=None): 9 | """ 10 | Send the root node's parameters to every worker. 11 | Arguments: 12 | sess: the TensorFlow session. 13 | variables: all parameter variables including optimizer's 14 | """ 15 | if comm is None: comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | for var in variables: 18 | if rank == 0: 19 | comm.Bcast(sess.run(var)) 20 | else: 21 | import tensorflow as tf 22 | returned_var = np.empty(var.shape, dtype='float32') 23 | comm.Bcast(returned_var) 24 | sess.run(tf.assign(var, returned_var)) 25 | 26 | def gpu_count(): 27 | """ 28 | Count the GPUs on this machine. 29 | """ 30 | if shutil.which('nvidia-smi') is None: 31 | return 0 32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 33 | return max(0, len(output.split(b'\n')) - 2) 34 | 35 | def setup_mpi_gpus(): 36 | """ 37 | Set CUDA_VISIBLE_DEVICES using MPI. 38 | """ 39 | num_gpus = gpu_count() 40 | if num_gpus == 0: 41 | return 42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) 43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) 44 | 45 | def get_local_rank_size(comm): 46 | """ 47 | Returns the rank of each process on its machine 48 | The processes on a given machine will be assigned ranks 49 | 0, 1, 2, ..., N-1, 50 | where N is the number of processes on this machine. 51 | 52 | Useful if you want to assign one gpu per machine 53 | """ 54 | this_node = platform.node() 55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 56 | node2rankssofar = defaultdict(int) 57 | local_rank = None 58 | for (rank, node) in ranks_nodes: 59 | if rank == comm.Get_rank(): 60 | local_rank = node2rankssofar[node] 61 | node2rankssofar[node] += 1 62 | assert local_rank is not None 63 | return local_rank, node2rankssofar[this_node] 64 | 65 | def share_file(comm, path): 66 | """ 67 | Copies the file from rank 0 to all other ranks 68 | Puts it in the same place on all machines 69 | """ 70 | localrank, _ = get_local_rank_size(comm) 71 | if comm.Get_rank() == 0: 72 | with open(path, 'rb') as fh: 73 | data = fh.read() 74 | comm.bcast(data) 75 | else: 76 | data = comm.bcast(None) 77 | if localrank == 0: 78 | os.makedirs(os.path.dirname(path), exist_ok=True) 79 | with open(path, 'wb') as fh: 80 | fh.write(data) 81 | comm.Barrier() 82 | 83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 84 | if comm is None: return d 85 | alldicts = comm.allgather(d) 86 | size = comm.size 87 | k2li = defaultdict(list) 88 | for d in alldicts: 89 | for (k,v) in d.items(): 90 | k2li[k].append(v) 91 | result = {} 92 | for (k,li) in k2li.items(): 93 | if assert_all_have_data: 94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 95 | if op=='mean': 96 | result[k] = np.mean(li, axis=0) 97 | elif op=='sum': 98 | result[k] = np.sum(li, axis=0) 99 | else: 100 | assert 0, op 101 | return result 102 | -------------------------------------------------------------------------------- /miniworld/baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /miniworld/baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/common/tests/__init__.py -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/common/tests/envs/__init__.py -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | seed=0, 11 | episode_len=100 12 | ): 13 | self.np_random = np.random.RandomState() 14 | self.np_random.seed(seed) 15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] 16 | 17 | self.action_space = Discrete(n_actions) 18 | self.observation_space = Discrete(1) 19 | 20 | self.episode_len = episode_len 21 | self.time = 0 22 | self.reset() 23 | 24 | def reset(self): 25 | self.time = 0 26 | return 0 27 | 28 | def step(self, actions): 29 | rew = self._get_reward(actions) 30 | self._choose_next_state() 31 | done = False 32 | if self.episode_len and self.time >= self.episode_len: 33 | rew = 0 34 | done = True 35 | 36 | return 0, rew, done, {} 37 | 38 | def _choose_next_state(self): 39 | self.time += 1 40 | 41 | def _get_reward(self, actions): 42 | return 1 if actions == self.sequence[self.time] else 0 43 | 44 | 45 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import MultiDiscrete, Discrete, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None 11 | ): 12 | 13 | self.episode_len = episode_len 14 | self.time = 0 15 | self.reset() 16 | 17 | def reset(self): 18 | self._choose_next_state() 19 | self.time = 0 20 | self.observation_space = self.action_space 21 | 22 | return self.state 23 | 24 | def step(self, actions): 25 | rew = self._get_reward(actions) 26 | self._choose_next_state() 27 | done = False 28 | if self.episode_len and self.time >= self.episode_len: 29 | rew = 0 30 | done = True 31 | 32 | return self.state, rew, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | self.time += 1 37 | 38 | @abstractmethod 39 | def _get_reward(self, actions): 40 | raise NotImplementedError 41 | 42 | 43 | class DiscreteIdentityEnv(IdentityEnv): 44 | def __init__( 45 | self, 46 | dim, 47 | episode_len=None, 48 | ): 49 | 50 | self.action_space = Discrete(dim) 51 | super().__init__(episode_len=episode_len) 52 | 53 | def _get_reward(self, actions): 54 | return 1 if self.state == actions else 0 55 | 56 | class MultiDiscreteIdentityEnv(IdentityEnv): 57 | def __init__( 58 | self, 59 | dims, 60 | episode_len=None, 61 | ): 62 | 63 | self.action_space = MultiDiscrete(dims) 64 | super().__init__(episode_len=episode_len) 65 | 66 | def _get_reward(self, actions): 67 | return 1 if all(self.state == actions) else 0 68 | 69 | 70 | class BoxIdentityEnv(IdentityEnv): 71 | def __init__( 72 | self, 73 | shape, 74 | episode_len=None, 75 | ): 76 | 77 | self.action_space = Box(low=-1.0, high=1.0, shape=shape) 78 | super().__init__(episode_len=episode_len) 79 | 80 | def _get_reward(self, actions): 81 | diff = actions - self.state 82 | diff = diff[:] 83 | return -0.5 * np.dot(diff, diff) 84 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | from gym import Env 5 | from gym.spaces import Discrete, Box 6 | 7 | 8 | 9 | class MnistEnv(Env): 10 | def __init__( 11 | self, 12 | seed=0, 13 | episode_len=None, 14 | no_images=None 15 | ): 16 | import filelock 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | # we could use temporary directory for this with a context manager and 19 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 20 | # this way the data is not cleaned up, but we only download it once per machine 21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 22 | with filelock.FileLock(mnist_path + '.lock'): 23 | self.mnist = input_data.read_data_sets(mnist_path) 24 | 25 | self.np_random = np.random.RandomState() 26 | self.np_random.seed(seed) 27 | 28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 29 | self.action_space = Discrete(10) 30 | self.episode_len = episode_len 31 | self.time = 0 32 | self.no_images = no_images 33 | 34 | self.train_mode() 35 | self.reset() 36 | 37 | def reset(self): 38 | self._choose_next_state() 39 | self.time = 0 40 | 41 | return self.state[0] 42 | 43 | def step(self, actions): 44 | rew = self._get_reward(actions) 45 | self._choose_next_state() 46 | done = False 47 | if self.episode_len and self.time >= self.episode_len: 48 | rew = 0 49 | done = True 50 | 51 | return self.state[0], rew, done, {} 52 | 53 | def train_mode(self): 54 | self.dataset = self.mnist.train 55 | 56 | def test_mode(self): 57 | self.dataset = self.mnist.test 58 | 59 | def _choose_next_state(self): 60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 61 | index = self.np_random.randint(0, max_index) 62 | image = self.dataset.images[index].reshape(28,28,1)*255 63 | label = self.dataset.labels[index] 64 | self.state = (image, label) 65 | self.time += 1 66 | 67 | def _get_reward(self, actions): 68 | return 1 if self.state[1] == actions else 0 69 | 70 | 71 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=1.0, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 16 | 'acer': dict(value_network='copy'), 17 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False), 18 | 'deepq': dict(total_timesteps=20000), 19 | 'ppo2': dict(value_network='copy'), 20 | 'trpo_mpi': {} 21 | } 22 | 23 | @pytest.mark.slow 24 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 25 | def test_cartpole(alg): 26 | ''' 27 | Test if the algorithm (with an mlp policy) 28 | can learn to balance the cartpole 29 | ''' 30 | 31 | kwargs = common_kwargs.copy() 32 | kwargs.update(learn_kwargs[alg]) 33 | 34 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 35 | def env_fn(): 36 | 37 | env = gym.make('CartPole-v0') 38 | env.seed(0) 39 | return env 40 | 41 | reward_per_episode_test(env_fn, learn_fn, 100) 42 | 43 | if __name__ == '__main__': 44 | test_cartpole('acer') 45 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_doc_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | try: 3 | import mujoco_py 4 | _mujoco_present = True 5 | except BaseException: 6 | mujoco_py = None 7 | _mujoco_present = False 8 | 9 | 10 | @pytest.mark.skipif( 11 | not _mujoco_present, 12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' 13 | ) 14 | def test_lstm_example(): 15 | import tensorflow as tf 16 | from baselines.common import policies, models, cmd_util 17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 18 | 19 | # create vectorized environment 20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) 21 | 22 | with tf.Session() as sess: 23 | # build policy based on lstm network with 128 units 24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) 25 | 26 | # initialize tensorflow variables 27 | sess.run(tf.global_variables_initializer()) 28 | 29 | # prepare environment variables 30 | ob = venv.reset() 31 | state = policy.initial_state 32 | done = [False] 33 | step_counter = 0 34 | 35 | # run a single episode until the end (i.e. until done) 36 | while True: 37 | action, _, state, _ = policy.step(ob, S=state, M=done) 38 | ob, reward, done, _ = venv.step(action) 39 | step_counter += 1 40 | if done: 41 | break 42 | 43 | 44 | assert step_counter > 5 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_env_after_learn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import tensorflow as tf 4 | 5 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 6 | from baselines.run import get_learn_function 7 | from baselines.common.tf_util import make_session 8 | 9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 10 | 11 | @pytest.mark.parametrize('algo', algos) 12 | def test_env_after_learn(algo): 13 | def make_env(): 14 | # acktr requires too much RAM, fails on travis 15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4') 16 | return env 17 | 18 | make_session(make_default=True, graph=tf.Graph()) 19 | env = SubprocVecEnv([make_env]) 20 | 21 | learn = get_learn_function(algo) 22 | 23 | # Commenting out the following line resolves the issue, though crash happens at env.reset(). 24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None) 25 | 26 | env.reset() 27 | env.close() 28 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_fetchreach.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | pytest.importorskip('mujoco_py') 8 | 9 | common_kwargs = dict( 10 | network='mlp', 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'her': dict(total_timesteps=2000) 16 | } 17 | 18 | @pytest.mark.slow 19 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 20 | def test_fetchreach(alg): 21 | ''' 22 | Test if the algorithm (with an mlp policy) 23 | can learn the FetchReach task 24 | ''' 25 | 26 | kwargs = common_kwargs.copy() 27 | kwargs.update(learn_kwargs[alg]) 28 | 29 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 30 | def env_fn(): 31 | 32 | env = gym.make('FetchReach-v1') 33 | env.seed(0) 34 | return env 35 | 36 | reward_per_episode_test(env_fn, learn_fn, -15) 37 | 38 | if __name__ == '__main__': 39 | test_fetchreach('her') 40 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | 7 | common_kwargs = dict( 8 | seed=0, 9 | total_timesteps=50000, 10 | ) 11 | 12 | learn_kwargs = { 13 | 'a2c': {}, 14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 16 | # github issue: https://github.com/openai/baselines/issues/188 17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 18 | } 19 | 20 | 21 | alg_list = learn_kwargs.keys() 22 | rnn_list = ['lstm'] 23 | 24 | @pytest.mark.slow 25 | @pytest.mark.parametrize("alg", alg_list) 26 | @pytest.mark.parametrize("rnn", rnn_list) 27 | def test_fixed_sequence(alg, rnn): 28 | ''' 29 | Test if the algorithm (with a given policy) 30 | can learn an identity transformation (i.e. return observation as an action) 31 | ''' 32 | 33 | kwargs = learn_kwargs[alg] 34 | kwargs.update(common_kwargs) 35 | 36 | episode_len = 5 37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) 38 | learn = lambda e: get_learn_function(alg)( 39 | env=e, 40 | network=rnn, 41 | **kwargs 42 | ) 43 | 44 | simple_test(env_fn, learn, 0.7) 45 | 46 | 47 | if __name__ == '__main__': 48 | test_fixed_sequence('ppo2', 'lstm') 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | 6 | common_kwargs = dict( 7 | total_timesteps=30000, 8 | network='mlp', 9 | gamma=0.9, 10 | seed=0, 11 | ) 12 | 13 | learn_kwargs = { 14 | 'a2c' : {}, 15 | 'acktr': {}, 16 | 'deepq': {}, 17 | 'ddpg': dict(layer_norm=True), 18 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 19 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 20 | } 21 | 22 | 23 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 24 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] 25 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] 26 | 27 | @pytest.mark.slow 28 | @pytest.mark.parametrize("alg", algos_disc) 29 | def test_discrete_identity(alg): 30 | ''' 31 | Test if the algorithm (with an mlp policy) 32 | can learn an identity transformation (i.e. return observation as an action) 33 | ''' 34 | 35 | kwargs = learn_kwargs[alg] 36 | kwargs.update(common_kwargs) 37 | 38 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 39 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 40 | simple_test(env_fn, learn_fn, 0.9) 41 | 42 | @pytest.mark.slow 43 | @pytest.mark.parametrize("alg", algos_multidisc) 44 | def test_multidiscrete_identity(alg): 45 | ''' 46 | Test if the algorithm (with an mlp policy) 47 | can learn an identity transformation (i.e. return observation as an action) 48 | ''' 49 | 50 | kwargs = learn_kwargs[alg] 51 | kwargs.update(common_kwargs) 52 | 53 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 54 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) 55 | simple_test(env_fn, learn_fn, 0.9) 56 | 57 | @pytest.mark.slow 58 | @pytest.mark.parametrize("alg", algos_cont) 59 | def test_continuous_identity(alg): 60 | ''' 61 | Test if the algorithm (with an mlp policy) 62 | can learn an identity transformation (i.e. return observation as an action) 63 | to a required precision 64 | ''' 65 | 66 | kwargs = learn_kwargs[alg] 67 | kwargs.update(common_kwargs) 68 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 69 | 70 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 71 | simple_test(env_fn, learn_fn, -0.1) 72 | 73 | if __name__ == '__main__': 74 | test_multidiscrete_identity('acktr') 75 | 76 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | 'acer': dict(total_timesteps=20000), 21 | 'deepq': dict(total_timesteps=5000), 22 | 'acktr': dict(total_timesteps=30000), 23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 25 | } 26 | 27 | 28 | #tests pass, but are too slow on travis. Same algorithms are covered 29 | # by other tests with less compute-hungry nn's and by benchmarks 30 | @pytest.mark.skip 31 | @pytest.mark.slow 32 | @pytest.mark.parametrize("alg", learn_args.keys()) 33 | def test_mnist(alg): 34 | ''' 35 | Test if the algorithm can learn to classify MNIST digits. 36 | Uses CNN policy. 37 | ''' 38 | 39 | learn_kwargs = learn_args[alg] 40 | learn_kwargs.update(common_kwargs) 41 | 42 | learn = get_learn_function(alg) 43 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 44 | env_fn = lambda: MnistEnv(seed=0, episode_len=100) 45 | 46 | simple_test(env_fn, learn_fn, 0.6) 47 | 48 | if __name__ == '__main__': 49 | test_mnist('acer') 50 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.spaces import np_random 4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | 6 | N_TRIALS = 10000 7 | N_EPISODES = 100 8 | 9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 10 | np.random.seed(0) 11 | np_random.seed(0) 12 | 13 | env = DummyVecEnv([env_fn]) 14 | 15 | 16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 17 | tf.set_random_seed(0) 18 | 19 | model = learn_fn(env) 20 | 21 | sum_rew = 0 22 | done = True 23 | 24 | for i in range(n_trials): 25 | if done: 26 | obs = env.reset() 27 | state = model.initial_state 28 | 29 | if state is not None: 30 | a, v, state, _ = model.step(obs, S=state, M=[False]) 31 | else: 32 | a, v, _, _ = model.step(obs) 33 | 34 | obs, rew, done, _ = env.step(a) 35 | sum_rew += float(rew) 36 | 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | 42 | 43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 44 | env = DummyVecEnv([env_fn]) 45 | 46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 47 | model = learn_fn(env) 48 | 49 | N_TRIALS = 100 50 | 51 | observations, actions, rewards = rollout(env, model, N_TRIALS) 52 | rewards = [sum(r) for r in rewards] 53 | 54 | avg_rew = sum(rewards) / N_TRIALS 55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 56 | assert avg_rew > min_avg_reward, \ 57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 58 | 59 | def rollout(env, model, n_trials): 60 | rewards = [] 61 | actions = [] 62 | observations = [] 63 | 64 | for i in range(n_trials): 65 | obs = env.reset() 66 | state = model.initial_state if hasattr(model, 'initial_state') else None 67 | episode_rew = [] 68 | episode_actions = [] 69 | episode_obs = [] 70 | 71 | while True: 72 | if state is not None: 73 | a, v, state, _ = model.step(obs, S=state, M=[False]) 74 | else: 75 | a,v, _, _ = model.step(obs) 76 | 77 | obs, rew, done, _ = env.step(a) 78 | 79 | episode_rew.append(rew) 80 | episode_actions.append(a) 81 | episode_obs.append(obs) 82 | 83 | if done: 84 | break 85 | 86 | rewards.append(episode_rew) 87 | actions.append(episode_actions) 88 | observations.append(episode_obs) 89 | 90 | return observations, actions, rewards 91 | 92 | -------------------------------------------------------------------------------- /miniworld/baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from . import VecEnv 4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info 5 | 6 | class DummyVecEnv(VecEnv): 7 | """ 8 | VecEnv that does runs multiple environments sequentially, that is, 9 | the step and reset commands are send to one environment at a time. 10 | Useful when debugging and when num_env == 1 (in the latter case, 11 | avoids communication overhead) 12 | """ 13 | def __init__(self, env_fns): 14 | """ 15 | Arguments: 16 | 17 | env_fns: iterable of callables functions that build environments 18 | """ 19 | self.envs = [fn() for fn in env_fns] 20 | env = self.envs[0] 21 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 22 | obs_space = env.observation_space 23 | self.keys, shapes, dtypes = obs_space_info(obs_space) 24 | 25 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 26 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 27 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 28 | self.buf_infos = [{} for _ in range(self.num_envs)] 29 | self.actions = None 30 | self.specs = [e.spec for e in self.envs] 31 | 32 | def step_async(self, actions): 33 | listify = True 34 | try: 35 | if len(actions) == self.num_envs: 36 | listify = False 37 | except TypeError: 38 | pass 39 | 40 | if not listify: 41 | self.actions = actions 42 | else: 43 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 44 | self.actions = [actions] 45 | 46 | def step_wait(self): 47 | for e in range(self.num_envs): 48 | action = self.actions[e] 49 | if isinstance(self.envs[e].action_space, spaces.Discrete): 50 | action = int(action) 51 | 52 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 53 | if self.buf_dones[e]: 54 | obs = self.envs[e].reset() 55 | self._save_obs(e, obs) 56 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 57 | self.buf_infos.copy()) 58 | 59 | def reset(self): 60 | for e in range(self.num_envs): 61 | obs = self.envs[e].reset() 62 | self._save_obs(e, obs) 63 | return self._obs_from_buf() 64 | 65 | def _save_obs(self, e, obs): 66 | for k in self.keys: 67 | if k is None: 68 | self.buf_obs[k][e] = obs 69 | else: 70 | self.buf_obs[k][e] = obs[k] 71 | 72 | def _obs_from_buf(self): 73 | return dict_to_obs(copy_obs_dict(self.buf_obs)) 74 | 75 | def get_images(self): 76 | return [env.render(mode='rgb_array') for env in self.envs] 77 | 78 | def render(self, mode='human'): 79 | if self.num_envs == 1: 80 | return self.envs[0].render(mode=mode) 81 | else: 82 | return super().render(mode=mode) 83 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/test_vec_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | import pytest 8 | from .dummy_vec_env import DummyVecEnv 9 | from .shmem_vec_env import ShmemVecEnv 10 | from .subproc_vec_env import SubprocVecEnv 11 | 12 | 13 | def assert_envs_equal(env1, env2, num_steps): 14 | """ 15 | Compare two environments over num_steps steps and make sure 16 | that the observations produced by each are the same when given 17 | the same actions. 18 | """ 19 | assert env1.num_envs == env2.num_envs 20 | assert env1.action_space.shape == env2.action_space.shape 21 | assert env1.action_space.dtype == env2.action_space.dtype 22 | joint_shape = (env1.num_envs,) + env1.action_space.shape 23 | 24 | try: 25 | obs1, obs2 = env1.reset(), env2.reset() 26 | assert np.array(obs1).shape == np.array(obs2).shape 27 | assert np.array(obs1).shape == joint_shape 28 | assert np.allclose(obs1, obs2) 29 | np.random.seed(1337) 30 | for _ in range(num_steps): 31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape), 32 | dtype=env1.action_space.dtype) 33 | for env in [env1, env2]: 34 | env.step_async(actions) 35 | outs1 = env1.step_wait() 36 | outs2 = env2.step_wait() 37 | for out1, out2 in zip(outs1[:3], outs2[:3]): 38 | assert np.array(out1).shape == np.array(out2).shape 39 | assert np.allclose(out1, out2) 40 | assert list(outs1[3]) == list(outs2[3]) 41 | finally: 42 | env1.close() 43 | env2.close() 44 | 45 | 46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv)) 47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 48 | def test_vec_env(klass, dtype): # pylint: disable=R0914 49 | """ 50 | Test that a vectorized environment is equivalent to 51 | DummyVecEnv, since DummyVecEnv is less likely to be 52 | error prone. 53 | """ 54 | num_envs = 3 55 | num_steps = 100 56 | shape = (3, 8) 57 | 58 | def make_fn(seed): 59 | """ 60 | Get an environment constructor with a seed. 61 | """ 62 | return lambda: SimpleEnv(seed, shape, dtype) 63 | fns = [make_fn(i) for i in range(num_envs)] 64 | env1 = DummyVecEnv(fns) 65 | env2 = klass(fns) 66 | assert_envs_equal(env1, env2, num_steps=num_steps) 67 | 68 | 69 | class SimpleEnv(gym.Env): 70 | """ 71 | An environment with a pre-determined observation space 72 | and RNG seed. 73 | """ 74 | 75 | def __init__(self, seed, shape, dtype): 76 | np.random.seed(seed) 77 | self._dtype = dtype 78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape), 79 | dtype=dtype) 80 | self._max_steps = seed + 1 81 | self._cur_obs = None 82 | self._cur_step = 0 83 | # this is 0xFF instead of 0x100 because the Box space includes 84 | # the high end, while randint does not 85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype) 86 | self.observation_space = self.action_space 87 | 88 | def step(self, action): 89 | self._cur_obs += np.array(action, dtype=self._dtype) 90 | self._cur_step += 1 91 | done = self._cur_step >= self._max_steps 92 | reward = self._cur_step / self._max_steps 93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)} 94 | 95 | def reset(self): 96 | self._cur_obs = self._start_obs 97 | self._cur_step = 0 98 | return self._cur_obs 99 | 100 | def render(self, mode=None): 101 | raise NotImplementedError 102 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/test_video_recorder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 3 | """ 4 | 5 | import gym 6 | import pytest 7 | import os 8 | import glob 9 | import tempfile 10 | 11 | from .dummy_vec_env import DummyVecEnv 12 | from .shmem_vec_env import ShmemVecEnv 13 | from .subproc_vec_env import SubprocVecEnv 14 | from .vec_video_recorder import VecVideoRecorder 15 | 16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv)) 17 | @pytest.mark.parametrize('num_envs', (1, 4)) 18 | @pytest.mark.parametrize('video_length', (10, 100)) 19 | @pytest.mark.parametrize('video_interval', (1, 50)) 20 | def test_video_recorder(klass, num_envs, video_length, video_interval): 21 | """ 22 | Wrap an existing VecEnv with VevVideoRecorder, 23 | Make (video_interval + video_length + 1) steps, 24 | then check that the file is present 25 | """ 26 | 27 | def make_fn(): 28 | env = gym.make('PongNoFrameskip-v4') 29 | return env 30 | fns = [make_fn for _ in range(num_envs)] 31 | env = klass(fns) 32 | 33 | with tempfile.TemporaryDirectory() as video_path: 34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length) 35 | 36 | env.reset() 37 | for _ in range(video_interval + video_length + 1): 38 | env.step([0] * num_envs) 39 | env.close() 40 | 41 | 42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4")) 43 | 44 | # first and second step 45 | assert len(recorded_video) == 2 46 | # Files are not empty 47 | assert all(os.stat(p).st_size != 0 for p in recorded_video) 48 | 49 | 50 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for dealing with vectorized environments. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import gym 8 | import numpy as np 9 | 10 | 11 | def copy_obs_dict(obs): 12 | """ 13 | Deep-copy an observation dict. 14 | """ 15 | return {k: np.copy(v) for k, v in obs.items()} 16 | 17 | 18 | def dict_to_obs(obs_dict): 19 | """ 20 | Convert an observation dict into a raw array if the 21 | original observation space was not a Dict space. 22 | """ 23 | if set(obs_dict.keys()) == {None}: 24 | return obs_dict[None] 25 | return obs_dict 26 | 27 | 28 | def obs_space_info(obs_space): 29 | """ 30 | Get dict-structured information about a gym.Space. 31 | 32 | Returns: 33 | A tuple (keys, shapes, dtypes): 34 | keys: a list of dict keys. 35 | shapes: a dict mapping keys to shapes. 36 | dtypes: a dict mapping keys to dtypes. 37 | """ 38 | if isinstance(obs_space, gym.spaces.Dict): 39 | assert isinstance(obs_space.spaces, OrderedDict) 40 | subspaces = obs_space.spaces 41 | else: 42 | subspaces = {None: obs_space} 43 | keys = [] 44 | shapes = {} 45 | dtypes = {} 46 | for key, box in subspaces.items(): 47 | keys.append(key) 48 | shapes[key] = box.shape 49 | dtypes[key] = box.dtype 50 | return keys, shapes, dtypes 51 | 52 | 53 | def obs_to_dict(obs): 54 | """ 55 | Convert an observation into a dict. 56 | """ 57 | if isinstance(obs, dict): 58 | return obs 59 | return {None: obs} 60 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...] = 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/vec_monitor.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.bench.monitor import ResultsWriter 3 | import numpy as np 4 | import time 5 | 6 | 7 | class VecMonitor(VecEnvWrapper): 8 | def __init__(self, venv, filename=None): 9 | VecEnvWrapper.__init__(self, venv) 10 | self.eprets = None 11 | self.eplens = None 12 | self.tstart = time.time() 13 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart}) 14 | 15 | def reset(self): 16 | obs = self.venv.reset() 17 | self.eprets = np.zeros(self.num_envs, 'f') 18 | self.eplens = np.zeros(self.num_envs, 'i') 19 | return obs 20 | 21 | def step_wait(self): 22 | obs, rews, dones, infos = self.venv.step_wait() 23 | self.eprets += rews 24 | self.eplens += 1 25 | newinfos = [] 26 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)): 27 | info = info.copy() 28 | if done: 29 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)} 30 | info['episode'] = epinfo 31 | self.eprets[i] = 0 32 | self.eplens[i] = 0 33 | self.results_writer.write_row(epinfo) 34 | 35 | newinfos.append(info) 36 | 37 | return obs, rews, dones, newinfos 38 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | 6 | class VecNormalize(VecEnvWrapper): 7 | """ 8 | A vectorized wrapper that normalizes the observations 9 | and returns from an environment. 10 | """ 11 | 12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 13 | VecEnvWrapper.__init__(self, venv) 14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 16 | self.clipob = clipob 17 | self.cliprew = cliprew 18 | self.ret = np.zeros(self.num_envs) 19 | self.gamma = gamma 20 | self.epsilon = epsilon 21 | 22 | def step_wait(self): 23 | obs, rews, news, infos = self.venv.step_wait() 24 | self.ret = self.ret * self.gamma + rews 25 | obs = self._obfilt(obs) 26 | if self.ret_rms: 27 | self.ret_rms.update(self.ret) 28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 29 | self.ret[news] = 0. 30 | return obs, rews, news, infos 31 | 32 | def _obfilt(self, obs): 33 | if self.ob_rms: 34 | self.ob_rms.update(obs) 35 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 36 | return obs 37 | else: 38 | return obs 39 | 40 | def reset(self): 41 | self.ret = np.zeros(self.num_envs) 42 | obs = self.venv.reset() 43 | return self._obfilt(obs) 44 | -------------------------------------------------------------------------------- /miniworld/baselines/common/vec_env/vec_video_recorder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from baselines import logger 3 | from baselines.common.vec_env import VecEnvWrapper 4 | from gym.wrappers.monitoring import video_recorder 5 | 6 | 7 | class VecVideoRecorder(VecEnvWrapper): 8 | """ 9 | Wrap VecEnv to record rendered image as mp4 video. 10 | """ 11 | 12 | def __init__(self, venv, directory, record_video_trigger, video_length=200): 13 | """ 14 | # Arguments 15 | venv: VecEnv to wrap 16 | directory: Where to save videos 17 | record_video_trigger: 18 | Function that defines when to start recording. 19 | The function takes the current number of step, 20 | and returns whether we should start recording or not. 21 | video_length: Length of recorded video 22 | """ 23 | 24 | VecEnvWrapper.__init__(self, venv) 25 | self.record_video_trigger = record_video_trigger 26 | self.video_recorder = None 27 | 28 | self.directory = os.path.abspath(directory) 29 | if not os.path.exists(self.directory): os.mkdir(self.directory) 30 | 31 | self.file_prefix = "vecenv" 32 | self.file_infix = '{}'.format(os.getpid()) 33 | self.step_id = 0 34 | self.video_length = video_length 35 | 36 | self.recording = False 37 | self.recorded_frames = 0 38 | 39 | def reset(self): 40 | obs = self.venv.reset() 41 | 42 | self.start_video_recorder() 43 | 44 | return obs 45 | 46 | def start_video_recorder(self): 47 | self.close_video_recorder() 48 | 49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id)) 50 | self.video_recorder = video_recorder.VideoRecorder( 51 | env=self.venv, 52 | base_path=base_path, 53 | metadata={'step_id': self.step_id} 54 | ) 55 | 56 | self.video_recorder.capture_frame() 57 | self.recorded_frames = 1 58 | self.recording = True 59 | 60 | def _video_enabled(self): 61 | return self.record_video_trigger(self.step_id) 62 | 63 | def step_wait(self): 64 | obs, rews, dones, infos = self.venv.step_wait() 65 | 66 | self.step_id += 1 67 | if self.recording: 68 | self.video_recorder.capture_frame() 69 | self.recorded_frames += 1 70 | if self.recorded_frames > self.video_length: 71 | logger.info("Saving video to ", self.video_recorder.path) 72 | self.close_video_recorder() 73 | elif self._video_enabled(): 74 | self.start_video_recorder() 75 | 76 | return obs, rews, dones, infos 77 | 78 | def close_video_recorder(self): 79 | if self.recording: 80 | self.video_recorder.close() 81 | self.recording = False 82 | self.recorded_frames = 0 83 | 84 | def close(self): 85 | VecEnvWrapper.close(self) 86 | self.close_video_recorder() 87 | 88 | def __del__(self): 89 | self.close() 90 | -------------------------------------------------------------------------------- /miniworld/baselines/ppoc_int/README.md: -------------------------------------------------------------------------------- 1 | # PPOSGD 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 7 | 8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model` 9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model` 10 | -------------------------------------------------------------------------------- /miniworld/baselines/ppoc_int/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/ppoc_int/__init__.py -------------------------------------------------------------------------------- /miniworld/baselines/ppoc_int/muj.py: -------------------------------------------------------------------------------- 1 | 2 | # from rllab.envs.box2d.cartpole_swingup_env import CartpoleSwingupEnv 3 | # from rllab.envs.mujoco.maze.point_maze_env import PointMazeEnv 4 | # from rllab.envs.mujoco.maze.ant_maze_env import AntMazeEnv 5 | 6 | # from rllab.envs.mujoco.hill.half_cheetah_hill_env import HalfCheetahHillEnv 7 | # from rllab.envs.mujoco.hill.swimmer3d_hill_env import Swimmer3DHillEnv 8 | import pdb 9 | import time 10 | import gym 11 | import numpy as np 12 | # import my_gym; 13 | #from rllab.envs.mujoco.gather.swimmer_gather_env import SwimmerGatherEnv 14 | # from rllab.envs.mujoco.gather.ant_gather_env import AntGatherEnv 15 | # from rllab.envs.mujoco.gather.point_gather_env import PointGatherEnv 16 | # from rllab.envs.box2d.mountain_car_env import MountainCarEnv 17 | #from twod_tmaze2 import TMaze2 18 | #from antwalls import AntWallsEnv 19 | 20 | import time 21 | import gym_miniworld 22 | from gym_miniworld.entity import Box as miniBox 23 | from gym_miniworld.envs.oneroom import OneRoom 24 | 25 | 26 | #from antmaze import AntMazeEnv 27 | 28 | # from wheeled import WheeledEnv 29 | # from wheeled_maze import WheeledMazeEnv 30 | # from blockplaypen import BlockPlayPen 31 | # from twod_multi import TwoDMultiEnv 32 | # env = BlockPlayPen() 33 | # env = TwoDMaze() 34 | # env = TwoDMultiEnv() 35 | 36 | 37 | #env = SwimmerGatherEnv() 38 | #env = AntMazeEnv() 39 | 40 | #env = gym.make('MiniWorld-Hallway-v0') 41 | #env = gym.make('MiniWorld-OneRoom-v0') 42 | #env = gym.make('MiniWorld-PutNext-v0') 43 | env = gym.make('MiniWorld-PickupObjs-v0') 44 | 45 | 46 | #env=AntWallsEnv() 47 | #env= TMaze2() 48 | # env= gym.make("Reacher-v1") 49 | # env.seed(0) 50 | # pdb.set_trace() 51 | # env= PointMazeEnv() 52 | # env = gym.make("Acrobot-v1") 53 | #env.reset() 54 | # env.render() 55 | # state,reward, done, _ = env.step(np.array([0.,10.])) 56 | # env.render() 57 | # state,reward, done, _ = env.step(np.array([0.,10.])) 58 | # env.render() 59 | # state,reward, done, _ = env.step(np.array([0.,10.])) 60 | 61 | episodes = 0 62 | 63 | for step in range(500): 64 | env.render() 65 | time.sleep(1) 66 | # pdb.set_trace() 67 | # print(t) 68 | # if True: 69 | # continue 70 | # print("aaa") 71 | # state,reward, done, _ = env.step(np.array([0.,0.])) 72 | # pdb.set_trace() 73 | state,reward, done, _ = env.step(env.action_space.sample()) 74 | #print(env.box.pos) 75 | 76 | done = True 77 | if done: 78 | #pdb.set_trace() 79 | env.reset() 80 | episodes += 1 81 | 82 | # if episodes == 10: 83 | # env = OneRoom(change_goal=True) 84 | # 85 | # time.sleep(0.1) -------------------------------------------------------------------------------- /miniworld/baselines/ppoc_int/oneroom.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | from ..miniworld import MiniWorldEnv, Room 4 | from ..entity import Box 5 | 6 | class OneRoom(MiniWorldEnv): 7 | """ 8 | Environment in which the goal is to go to a red box 9 | placed randomly in one big room. 10 | """ 11 | 12 | def __init__(self, size=10,change_goal=None, **kwargs): 13 | assert size >= 2 14 | self.size = size 15 | self.change_goal = change_goal 16 | 17 | super().__init__( 18 | max_episode_steps=180, 19 | **kwargs 20 | ) 21 | 22 | def _gen_world(self): 23 | room = self.add_rect_room( 24 | min_x=0, 25 | max_x=self.size, 26 | min_z=0, 27 | max_z=self.size 28 | ) 29 | 30 | if not self.change_goal: 31 | self.box = self.place_entity(Box(color='red')) 32 | else: 33 | self.box = self.place_entity(Box(color='blue')) 34 | self.place_agent() 35 | 36 | def step(self, action): 37 | obs, reward, done, info = super().step(action) 38 | 39 | if self.near(self.box): 40 | reward += self._reward() 41 | done = True 42 | 43 | return obs, reward, done, info 44 | -------------------------------------------------------------------------------- /miniworld/baselines/ppoc_int/run_miniw.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | from baselines.common import set_global_seeds, tf_util as U 3 | from mpi4py import MPI 4 | from baselines import bench 5 | import os.path as osp 6 | import gym, logging 7 | import gym_miniworld 8 | from baselines import logger 9 | 10 | 11 | 12 | def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc, plots, w_intfc, switch, mainlr, intlr, piolr, fewshot): 13 | from baselines.ppoc_int import cnn_policy, pposgd_simple 14 | rank = MPI.COMM_WORLD.Get_rank() 15 | sess = U.single_threaded_session() 16 | sess.__enter__() 17 | if rank == 0: 18 | logger.configure() 19 | else: 20 | logger.configure(format_strs=[]) 21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None 22 | set_global_seeds(workerseed) 23 | 24 | env = gym.make(env_id) 25 | env.seed(workerseed) 26 | 27 | 28 | def policy_fn(name, ob_space, ac_space): 29 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, num_options=num_options, dc=dc, w_intfc=w_intfc) 30 | 31 | env = bench.Monitor(env, logger.get_dir() and 32 | osp.join(logger.get_dir(), str(rank))) 33 | 34 | optimsize = int(64 / num_options) 35 | 36 | 37 | num_timesteps = num_timesteps 38 | tperbatch = 2048 if not epoch else int(1e4) 39 | pposgd_simple.learn(env, policy_fn, 40 | max_timesteps=num_timesteps, 41 | timesteps_per_batch=tperbatch, 42 | clip_param=0.2, entcoeff=0.01, 43 | optim_epochs=4, optim_stepsize=mainlr, optim_batchsize=optimsize, 44 | gamma=0.99, lam=0.95, schedule='linear', num_options=num_options, 45 | app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed, dc=dc, plots=plots, 46 | w_intfc=w_intfc, switch=switch, intlr=intlr, piolr=piolr, fewshot=fewshot 47 | ) 48 | env.close() 49 | 50 | 51 | def main(): 52 | import argparse 53 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 54 | parser.add_argument('--env', help='environment ID', default='MiniWorld-OneRoom-v0') 55 | parser.add_argument('--timesteps', help='number of timesteps', type=int, default=1000000) 56 | parser.add_argument('--seed', help='RNG seed', type=int, default=1) 57 | parser.add_argument('--opt', help='number of options', type=int, default=2) 58 | parser.add_argument('--app', help='Append to folder name', type=str, default='') 59 | parser.add_argument('--saves', dest='saves', action='store_true', default=False) 60 | parser.add_argument('--wsaves', dest='wsaves', action='store_true', default=False) 61 | parser.add_argument('--plots', dest='plots', action='store_true', default=False) 62 | parser.add_argument('--switch', dest='switch', help='switch task after 150 iterations', action='store_true', default=False) 63 | parser.add_argument('--fewshot', dest='fewshot', help='value learning after 150 iterations', action='store_true', default=False) 64 | parser.add_argument('--nointfc', dest='w_intfc', help='disables interet functions', action='store_false', default=True) 65 | parser.add_argument('--epoch', help='Epoch', type=int, default=0) 66 | parser.add_argument('--dc', type=float, default=0.) 67 | parser.add_argument('--mainlr', type=float, default=3e-4) 68 | parser.add_argument('--intlr', type=float, default=1e-4) 69 | parser.add_argument('--piolr', type=float, default=1e-4) 70 | 71 | 72 | args = parser.parse_args() 73 | 74 | train(args.env, num_timesteps=args.timesteps, seed=args.seed, num_options=args.opt, app=args.app, saves=args.saves, 75 | wsaves=args.wsaves, epoch=args.epoch, dc=args.dc, plots=args.plots, w_intfc=args.w_intfc, switch=args.switch, 76 | mainlr=args.mainlr, intlr=args.intlr, piolr=args.piolr, fewshot=args.fewshot) 77 | 78 | 79 | if __name__ == '__main__': 80 | main() -------------------------------------------------------------------------------- /miniworld/baselines/ppoc_int/run_mujoco.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | from baselines.common import set_global_seeds, tf_util as U 3 | from baselines import bench 4 | import os.path as osp 5 | import gym, logging 6 | # import gym_miniworld 7 | import pdb 8 | from baselines import logger 9 | import sys 10 | # from gym_miniworld.wrappers import GreyscaleWrapper 11 | 12 | def train(env_id, num_timesteps, seed, num_options,app, saves ,wsaves, epoch,dc,plots,w_intfc,switch,mainlr,intlr,fewshot): 13 | from baselines.ppoc_int import mlp_policy, pposgd_simple 14 | U.make_session(num_cpu=1).__enter__() 15 | set_global_seeds(seed) 16 | 17 | if env_id=="TMaze": 18 | from twod_tmaze import TMaze 19 | env=TMaze() 20 | env.seed(seed) 21 | elif env_id=="TMaze2": 22 | from twod_tmaze2 import TMaze2 23 | env=TMaze2() 24 | env.seed(seed) 25 | elif env_id=="AntWalls": 26 | from antwalls import AntWallsEnv 27 | env=AntWallsEnv() 28 | env.seed(seed) 29 | elif env_id=="AntMaze": 30 | from ant_maze_env import AntMazeEnv 31 | mazeid = 'Maze' 32 | env = AntMazeEnv(mazeid) 33 | env.seed(seed) 34 | else: 35 | env = gym.make(env_id) 36 | env._seed(seed) 37 | 38 | 39 | def policy_fn(name, ob_space, ac_space): 40 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 41 | hid_size=64, num_hid_layers=2, num_options=num_options, dc=dc, w_intfc=w_intfc) 42 | 43 | gym.logger.setLevel(logging.WARN) 44 | 45 | optimsize=int(64/num_options) 46 | 47 | # pdb.set_trace() 48 | num_timesteps = num_timesteps if env_id!="TMaze" else 5e5 49 | tperbatch = 2048 if not epoch else int(1e4) 50 | pposgd_simple.learn(env, policy_fn, 51 | max_timesteps=num_timesteps, 52 | timesteps_per_batch=tperbatch, 53 | clip_param=0.2, entcoeff=0.0, 54 | optim_epochs=10, optim_stepsize=mainlr, optim_batchsize=optimsize, 55 | gamma=0.99, lam=0.95, schedule='constant', num_options=num_options, 56 | app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed,dc=dc,plots=plots, 57 | w_intfc=w_intfc,switch=switch,intlr=intlr,fewshot=fewshot 58 | ) 59 | env.close() 60 | 61 | def main(): 62 | import argparse 63 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 64 | parser.add_argument('--env', help='environment ID', default='TMaze') 65 | parser.add_argument('--timesteps', help='number of timesteps', type=int, default=1000000) 66 | parser.add_argument('--seed', help='RNG seed', type=int, default=1) 67 | parser.add_argument('--opt', help='number of options', type=int, default=2) 68 | parser.add_argument('--app', help='Append to folder name', type=str, default='') 69 | parser.add_argument('--saves', dest='saves', action='store_true', default=False) 70 | parser.add_argument('--wsaves', dest='wsaves', action='store_true', default=False) 71 | parser.add_argument('--plots', dest='plots', action='store_true', default=False) 72 | parser.add_argument('--switch', dest='switch', action='store_true', default=False) 73 | parser.add_argument('--fewshot', dest='fewshot', action='store_true', default=False) 74 | parser.add_argument('--nointfc', dest='w_intfc', action='store_false', default=True) 75 | parser.add_argument('--epoch', help='Epoch', type=int, default=0) 76 | parser.add_argument('--dc', type=float, default=0.) 77 | parser.add_argument('--mainlr', type=float, default=3e-4) 78 | parser.add_argument('--intlr', type=float, default=1e-4) 79 | 80 | # pdb.set_trace() 81 | args = parser.parse_args() 82 | 83 | train(args.env, num_timesteps=args.timesteps, seed=args.seed, num_options=args.opt, app=args.app, 84 | saves=args.saves, wsaves=args.wsaves, epoch=args.epoch,dc=args.dc,plots=args.plots, 85 | w_intfc=args.w_intfc,switch=args.switch,mainlr=args.mainlr,intlr=args.intlr,fewshot=args.fewshot) 86 | 87 | 88 | if __name__ == '__main__': 89 | main() -------------------------------------------------------------------------------- /miniworld/baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.common import plot_util 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | Y_REWARD = 'reward' 14 | Y_TIMESTEPS = 'timesteps' 15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 16 | EPISODES_WINDOW = 100 17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 19 | 'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue'] 20 | 21 | def rolling_window(a, window): 22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 23 | strides = a.strides + (a.strides[-1],) 24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 25 | 26 | def window_func(x, y, window, func): 27 | yw = rolling_window(y, window) 28 | yw_func = func(yw, axis=-1) 29 | return x[window-1:], yw_func 30 | 31 | def ts2xy(ts, xaxis, yaxis): 32 | if xaxis == X_TIMESTEPS: 33 | x = np.cumsum(ts.l.values) 34 | elif xaxis == X_EPISODES: 35 | x = np.arange(len(ts)) 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 38 | else: 39 | raise NotImplementedError 40 | if yaxis == Y_REWARD: 41 | y = ts.r.values 42 | elif yaxis == Y_TIMESTEPS: 43 | y = ts.l.values 44 | else: 45 | raise NotImplementedError 46 | return x, y 47 | 48 | def plot_curves(xy_list, xaxis, yaxis, title): 49 | fig = plt.figure(figsize=(8,2)) 50 | maxx = max(xy[0][-1] for xy in xy_list) 51 | minx = 0 52 | for (i, (x, y)) in enumerate(xy_list): 53 | color = COLORS[i % len(COLORS)] 54 | plt.scatter(x, y, s=2) 55 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 56 | plt.plot(x, y_mean, color=color) 57 | plt.xlim(minx, maxx) 58 | plt.title(title) 59 | plt.xlabel(xaxis) 60 | plt.ylabel(yaxis) 61 | plt.tight_layout() 62 | fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout()) 63 | plt.grid(True) 64 | 65 | 66 | def split_by_task(taskpath): 67 | return taskpath['dirname'].split('/')[-1].split('-')[0] 68 | 69 | def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task): 70 | results = plot_util.load_results(dirs) 71 | plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6)) 72 | 73 | # Example usage in jupyter-notebook 74 | # from baselines.results_plotter import plot_results 75 | # %matplotlib inline 76 | # plot_results("./log") 77 | # Here ./log is a directory containing the monitor.csv files 78 | 79 | def main(): 80 | import argparse 81 | import os 82 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 83 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) 84 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 85 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 86 | parser.add_argument('--yaxis', help = 'Varible on Y-axis', default = Y_REWARD) 87 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout') 88 | args = parser.parse_args() 89 | args.dirs = [os.path.abspath(dir) for dir in args.dirs] 90 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.yaxis, args.task_name) 91 | plt.show() 92 | 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /miniworld/data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/data/cartpole.gif -------------------------------------------------------------------------------- /miniworld/data/fetchPickAndPlaceContrast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/data/fetchPickAndPlaceContrast.png -------------------------------------------------------------------------------- /miniworld/data/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/data/logo.jpg -------------------------------------------------------------------------------- /miniworld/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = F,E999,W291,W293 3 | exclude = 4 | .git, 5 | __pycache__, 6 | baselines/ppo1, 7 | baselines/bench, 8 | -------------------------------------------------------------------------------- /miniworld/setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages 3 | import sys 4 | 5 | if sys.version_info.major != 3: 6 | print('This Python is only compatible with Python 3, but you are running ' 7 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major)) 8 | 9 | 10 | extras = { 11 | 'test': [ 12 | 'filelock', 13 | 'pytest', 14 | 'pytest-forked', 15 | 'atari-py' 16 | ], 17 | 'bullet': [ 18 | 'pybullet', 19 | ], 20 | 'mpi': [ 21 | 'mpi4py' 22 | ] 23 | } 24 | 25 | all_deps = [] 26 | for group_name in extras: 27 | all_deps += extras[group_name] 28 | 29 | extras['all'] = all_deps 30 | 31 | setup(name='baselines', 32 | packages=[package for package in find_packages() 33 | if package.startswith('baselines')], 34 | install_requires=[ 35 | 'gym', 36 | 'scipy', 37 | 'tqdm', 38 | 'joblib', 39 | 'dill', 40 | 'progressbar2', 41 | 'cloudpickle', 42 | 'click', 43 | 'opencv-python' 44 | ], 45 | extras_require=extras, 46 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 47 | author='OpenAI', 48 | url='https://github.com/openai/baselines', 49 | author_email='gym@openai.com', 50 | version='0.1.5') 51 | 52 | 53 | # ensure there is some tensorflow build with version above 1.4 54 | import pkg_resources 55 | tf_pkg = None 56 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']: 57 | try: 58 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name) 59 | except pkg_resources.DistributionNotFound: 60 | pass 61 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4' 62 | from distutils.version import LooseVersion 63 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0') 64 | -------------------------------------------------------------------------------- /tabular/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/.DS_Store -------------------------------------------------------------------------------- /tabular/FR_Return_10Runs_Smooth_MisspecifiedPiO.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/FR_Return_10Runs_Smooth_MisspecifiedPiO.pdf -------------------------------------------------------------------------------- /tabular/FR_Steps_10Runs_Smooth_MisspecifiedPiO.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/FR_Steps_10Runs_Smooth_MisspecifiedPiO.pdf -------------------------------------------------------------------------------- /tabular/GoalG62.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/GoalG62.png -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/History.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/History.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_0.png -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_1.png -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_2.png -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_3.png -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Params.txt: -------------------------------------------------------------------------------- 1 | baseline:True 2 | discount:0.99 3 | epsilon:0.01 4 | lr_critic:0.5 5 | lr_interestfn:0.15 6 | lr_intra:0.25 7 | lr_reg:0.0 8 | lr_term:0.25 9 | nepisodes:500 10 | noptions:4 11 | nruns:10 12 | nsteps:2000 13 | primitive:False 14 | regularize:False 15 | seed:7200 16 | seed_startstate:10 17 | temperature:0.01 18 | -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/StateFreq.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/StateFreq.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_ActionValueFunction.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_ActionValueFunction.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_InterestFunction.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_InterestFunction.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_IntraOption.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_IntraOption.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_OptionValueFunction.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_OptionValueFunction.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Policy.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Policy.npy -------------------------------------------------------------------------------- /tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Termination.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Termination.npy -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/History.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/History.npy -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Params.txt: -------------------------------------------------------------------------------- 1 | baseline:True 2 | discount:0.99 3 | epsilon:0.01 4 | lr_critic:0.5 5 | lr_intra:0.25 6 | lr_term:0.25 7 | nepisodes:500 8 | noptions:4 9 | nruns:10 10 | nsteps:2000 11 | primitive:False 12 | seed:7200 13 | seed_startstate:10 14 | temperature:0.01 15 | -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/StateFreq.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/StateFreq.npy -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_ActionValueFunction.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_ActionValueFunction.npy -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_IntraOption.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_IntraOption.npy -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_OptionValueFunction.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_OptionValueFunction.npy -------------------------------------------------------------------------------- /tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_Termination.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_Termination.npy -------------------------------------------------------------------------------- /tabular/TransferVisual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/TransferVisual.png -------------------------------------------------------------------------------- /tabular/__pycache__/fourrooms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/__pycache__/fourrooms.cpython-36.pyc -------------------------------------------------------------------------------- /tabular/fourrooms.py: -------------------------------------------------------------------------------- 1 | #Environment File for Classic Fourrooms Grid World 2 | import numpy as np 3 | import gym 4 | from gym import core, spaces 5 | from gym.envs.registration import register 6 | from random import uniform 7 | 8 | #class Fourrooms(gym.Env): 9 | class Fourrooms(): 10 | def __init__(self,initstate_seed): 11 | layout = """\ 12 | wwwwwwwwwwwww 13 | w w w 14 | w w w 15 | w w 16 | w w w 17 | w w w 18 | ww wwww w 19 | w www www 20 | w w w 21 | w w w 22 | w w 23 | w w w 24 | wwwwwwwwwwwww 25 | """ 26 | 27 | 28 | self.occupancy = np.array([list(map(lambda c: 1 if c=='w' else 0, line)) for line in layout.splitlines()]) 29 | 30 | # Action Space: from any state the agent can perform one of the four actions; Up, Down, Left and Right 31 | self.action_space = spaces.Discrete(4) 32 | 33 | # Observation Space 34 | self.observation_space = spaces.Discrete(np.sum(self.occupancy == 0)) 35 | 36 | self.directions = [np.array((-1,0)), np.array((1,0)), np.array((0,-1)), np.array((0,1))] 37 | 38 | self.rng = np.random.RandomState(1234) 39 | 40 | self.initstate_seed = initstate_seed 41 | self.rng_init_state = np.random.RandomState(self.initstate_seed) 42 | 43 | self.tostate = {} 44 | 45 | self.occ_dict = dict(zip(range(self.observation_space.n), 46 | np.argwhere(self.occupancy.flatten() == 0).squeeze())) 47 | 48 | 49 | statenum = 0 50 | for i in range(13): 51 | for j in range(13): 52 | if self.occupancy[i, j] == 0: 53 | self.tostate[(i, j)] = statenum 54 | statenum += 1 55 | 56 | self.tocell = {v:k for k,v in self.tostate.items()} 57 | 58 | self.goal = 62 59 | self.init_states = list(range(self.observation_space.n)) 60 | self.init_states.remove(self.goal) 61 | 62 | 63 | def empty_around(self, cell): 64 | avail = [] 65 | for action in range(self.action_space.n): 66 | nextcell = tuple(cell + self.directions[action]) 67 | if not self.occupancy[nextcell]: 68 | avail.append(nextcell) 69 | return avail 70 | 71 | # def reset(self): 72 | # state = self.rng.choice(self.init_states) 73 | # self.currentcell = self.tocell[state] 74 | # return state 75 | 76 | 77 | def reset(self): 78 | state = self.rng_init_state.choice(self.init_states) 79 | self.currentcell = self.tocell[state] 80 | return state 81 | 82 | def step(self, action): 83 | """ 84 | The agent can perform one of four actions, 85 | up, down, left or right, which have a stochastic effect. 86 | We consider a case in which rewards are zero on all state transitions 87 | except the goal state which has a reward of +50. 88 | """ 89 | 90 | reward = 0 91 | if self.rng.uniform() < 1/3: 92 | empty_cells = self.empty_around(self.currentcell) 93 | nextcell = empty_cells[self.rng.randint(len(empty_cells))] 94 | else: 95 | nextcell = tuple(self.currentcell + self.directions[action]) 96 | 97 | if not self.occupancy[nextcell]: 98 | self.currentcell = nextcell 99 | 100 | state = self.tostate[self.currentcell] 101 | 102 | if state == self.goal: 103 | reward = 50 104 | 105 | done = state == self.goal 106 | return state, reward, float(done), None 107 | 108 | register( 109 | id='Fourrooms-v0', 110 | entry_point='fourrooms:Fourrooms', 111 | timestep_limit=20000, 112 | reward_threshold=1, 113 | ) 114 | --------------------------------------------------------------------------------