├── .DS_Store
├── .gitignore
├── .idea
├── ioc.iml
├── misc.xml
├── modules.xml
└── vcs.xml
├── README.md
├── control
├── .benchmark_pattern
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── baselines
│ ├── __init__.py
│ ├── bench
│ │ ├── __init__.py
│ │ ├── benchmarks.py
│ │ └── monitor.py
│ ├── common
│ │ ├── __init__.py
│ │ ├── atari_wrappers.py
│ │ ├── cg.py
│ │ ├── cmd_util.py
│ │ ├── console_util.py
│ │ ├── dataset.py
│ │ ├── distributions.py
│ │ ├── input.py
│ │ ├── math_util.py
│ │ ├── misc_util.py
│ │ ├── models.py
│ │ ├── mpi_adam.py
│ │ ├── mpi_adam_optimizer.py
│ │ ├── mpi_fork.py
│ │ ├── mpi_moments.py
│ │ ├── mpi_running_mean_std.py
│ │ ├── mpi_util.py
│ │ ├── plot_util.py
│ │ ├── policies.py
│ │ ├── retro_wrappers.py
│ │ ├── runners.py
│ │ ├── running_mean_std.py
│ │ ├── schedules.py
│ │ ├── segment_tree.py
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── envs
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fixed_sequence_env.py
│ │ │ │ ├── identity_env.py
│ │ │ │ └── mnist_env.py
│ │ │ ├── test_cartpole.py
│ │ │ ├── test_doc_examples.py
│ │ │ ├── test_env_after_learn.py
│ │ │ ├── test_fetchreach.py
│ │ │ ├── test_fixed_sequence.py
│ │ │ ├── test_identity.py
│ │ │ ├── test_mnist.py
│ │ │ ├── test_schedules.py
│ │ │ ├── test_segment_tree.py
│ │ │ ├── test_serialization.py
│ │ │ ├── test_tf_util.py
│ │ │ └── util.py
│ │ ├── tf_util.py
│ │ ├── tile_images.py
│ │ └── vec_env
│ │ │ ├── __init__.py
│ │ │ ├── dummy_vec_env.py
│ │ │ ├── shmem_vec_env.py
│ │ │ ├── subproc_vec_env.py
│ │ │ ├── test_vec_env.py
│ │ │ ├── test_video_recorder.py
│ │ │ ├── util.py
│ │ │ ├── vec_frame_stack.py
│ │ │ ├── vec_monitor.py
│ │ │ ├── vec_normalize.py
│ │ │ └── vec_video_recorder.py
│ ├── logger.py
│ ├── ppoc_int
│ │ ├── __init__.py
│ │ ├── assets
│ │ │ ├── half_cheetah.xml
│ │ │ └── twod_tmaze.xml
│ │ ├── half_cheetah.py
│ │ ├── mlp_policy.py
│ │ ├── normalized_env.py
│ │ ├── plot_res.py
│ │ ├── pposgd_simple.py
│ │ ├── run_mujoco.py
│ │ ├── seeding.py
│ │ └── twod_tmaze.py
│ ├── results_plotter.py
│ └── run.py
├── benchmarks_atari10M.htm
├── benchmarks_mujoco1M.htm
├── data
│ ├── cartpole.gif
│ ├── fetchPickAndPlaceContrast.png
│ └── logo.jpg
├── docs
│ └── viz
│ │ └── viz.ipynb
├── setup.cfg
└── setup.py
├── launcher_miniworld.sh
├── launcher_mujoco.sh
├── miniworld
├── .benchmark_pattern
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── baselines
│ ├── __init__.py
│ ├── bench
│ │ ├── __init__.py
│ │ ├── benchmarks.py
│ │ └── monitor.py
│ ├── common
│ │ ├── __init__.py
│ │ ├── atari_wrappers.py
│ │ ├── cg.py
│ │ ├── cmd_util.py
│ │ ├── console_util.py
│ │ ├── dataset.py
│ │ ├── distributions.py
│ │ ├── input.py
│ │ ├── math_util.py
│ │ ├── misc_util.py
│ │ ├── models.py
│ │ ├── mpi_adam.py
│ │ ├── mpi_adam_optimizer.py
│ │ ├── mpi_fork.py
│ │ ├── mpi_moments.py
│ │ ├── mpi_running_mean_std.py
│ │ ├── mpi_util.py
│ │ ├── plot_util.py
│ │ ├── policies.py
│ │ ├── retro_wrappers.py
│ │ ├── runners.py
│ │ ├── running_mean_std.py
│ │ ├── schedules.py
│ │ ├── segment_tree.py
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── envs
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fixed_sequence_env.py
│ │ │ │ ├── identity_env.py
│ │ │ │ └── mnist_env.py
│ │ │ ├── test_cartpole.py
│ │ │ ├── test_doc_examples.py
│ │ │ ├── test_env_after_learn.py
│ │ │ ├── test_fetchreach.py
│ │ │ ├── test_fixed_sequence.py
│ │ │ ├── test_identity.py
│ │ │ ├── test_mnist.py
│ │ │ ├── test_schedules.py
│ │ │ ├── test_segment_tree.py
│ │ │ ├── test_serialization.py
│ │ │ ├── test_tf_util.py
│ │ │ └── util.py
│ │ ├── tf_util.py
│ │ ├── tile_images.py
│ │ └── vec_env
│ │ │ ├── __init__.py
│ │ │ ├── dummy_vec_env.py
│ │ │ ├── shmem_vec_env.py
│ │ │ ├── subproc_vec_env.py
│ │ │ ├── test_vec_env.py
│ │ │ ├── test_video_recorder.py
│ │ │ ├── util.py
│ │ │ ├── vec_frame_stack.py
│ │ │ ├── vec_monitor.py
│ │ │ ├── vec_normalize.py
│ │ │ └── vec_video_recorder.py
│ ├── logger.py
│ ├── ppoc_int
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── cnn_policy.py
│ │ ├── mlp_policy.py
│ │ ├── muj.py
│ │ ├── oneroom.py
│ │ ├── plot_res.py
│ │ ├── pposgd_simple.py
│ │ ├── run_miniw.py
│ │ └── run_mujoco.py
│ ├── results_plotter.py
│ └── run.py
├── data
│ ├── cartpole.gif
│ ├── fetchPickAndPlaceContrast.png
│ └── logo.jpg
├── docs
│ └── viz
│ │ └── viz.ipynb
├── setup.cfg
└── setup.py
└── tabular
├── .DS_Store
├── .ipynb_checkpoints
├── fr_analysis_heatmaps-checkpoint.ipynb
├── fr_analysis_performance-checkpoint.ipynb
└── fr_env_plots-checkpoint.ipynb
├── FR_Return_10Runs_Smooth_MisspecifiedPiO.pdf
├── FR_Steps_10Runs_Smooth_MisspecifiedPiO.pdf
├── GoalG62.png
├── InterestOptionCritic
└── Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200
│ ├── History.npy
│ ├── IOC_Task1_IntraOptionPolicy_Opt_0.png
│ ├── IOC_Task1_IntraOptionPolicy_Opt_1.png
│ ├── IOC_Task1_IntraOptionPolicy_Opt_2.png
│ ├── IOC_Task1_IntraOptionPolicy_Opt_3.png
│ ├── Params.txt
│ ├── StateFreq.npy
│ ├── Weights_ActionValueFunction.npy
│ ├── Weights_InterestFunction.npy
│ ├── Weights_IntraOption.npy
│ ├── Weights_OptionValueFunction.npy
│ ├── Weights_Policy.npy
│ └── Weights_Termination.npy
├── OptionCritic
└── Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200
│ ├── History.npy
│ ├── Params.txt
│ ├── StateFreq.npy
│ ├── Weights_ActionValueFunction.npy
│ ├── Weights_IntraOption.npy
│ ├── Weights_OptionValueFunction.npy
│ └── Weights_Termination.npy
├── TransferVisual.png
├── __pycache__
└── fourrooms.cpython-36.pyc
├── fourrooms.py
├── fr_analysis_heatmaps.ipynb
├── fr_analysis_performance.ipynb
├── fr_env_plots.ipynb
├── interestoptioncritic_tabular_fr.py
└── optioncritic_tabular_fr.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/workspace.xml
2 | .idea/tasks.xml
3 |
--------------------------------------------------------------------------------
/.idea/ioc.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/control/.benchmark_pattern:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/control/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.pyc
3 | *.pkl
4 | *.py~
5 | .pytest_cache
6 | .DS_Store
7 | .idea
8 |
9 | # Setuptools distribution and build folders.
10 | /dist/
11 | /build
12 | keys/
13 |
14 | # Virtualenv
15 | /env
16 |
17 |
18 | *.sublime-project
19 | *.sublime-workspace
20 |
21 | .idea
22 |
23 | logs/
24 |
25 | .ipynb_checkpoints
26 | ghostdriver.log
27 |
28 | htmlcov
29 |
30 | junk
31 | src
32 |
33 | *.egg-info
34 | .cache
35 |
36 | MUJOCO_LOG.TXT
37 |
--------------------------------------------------------------------------------
/control/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 |
5 | services:
6 | - docker
7 |
8 | install:
9 | - pip install flake8
10 | - docker build . -t baselines-test
11 |
12 | script:
13 | - flake8 . --show-source --statistics
14 | - docker run baselines-test pytest -v --forked .
15 |
--------------------------------------------------------------------------------
/control/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6
2 |
3 | RUN apt-get -y update && apt-get -y install ffmpeg
4 | # RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
5 |
6 | ENV CODE_DIR /root/code
7 |
8 | COPY . $CODE_DIR/baselines
9 | WORKDIR $CODE_DIR/baselines
10 |
11 | # Clean up pycache and pyc files
12 | RUN rm -rf __pycache__ && \
13 | find . -name "*.pyc" -delete && \
14 | pip install tensorflow && \
15 | pip install -e .[test]
16 |
17 |
18 | CMD /bin/bash
19 |
--------------------------------------------------------------------------------
/control/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017 OpenAI (http://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/control/baselines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/__init__.py
--------------------------------------------------------------------------------
/control/baselines/bench/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.bench.benchmarks import *
2 | from baselines.bench.monitor import *
3 |
--------------------------------------------------------------------------------
/control/baselines/common/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa F403
2 | from baselines.common.console_util import *
3 | from baselines.common.dataset import Dataset
4 | from baselines.common.math_util import *
5 | from baselines.common.misc_util import *
6 |
--------------------------------------------------------------------------------
/control/baselines/common/cg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
3 | """
4 | Demmel p 312
5 | """
6 | p = b.copy()
7 | r = b.copy()
8 | x = np.zeros_like(b)
9 | rdotr = r.dot(r)
10 |
11 | fmtstr = "%10i %10.3g %10.3g"
12 | titlestr = "%10s %10s %10s"
13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm"))
14 |
15 | for i in range(cg_iters):
16 | if callback is not None:
17 | callback(x)
18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x)))
19 | z = f_Ax(p)
20 | v = rdotr / p.dot(z)
21 | x += v*p
22 | r -= v*z
23 | newrdotr = r.dot(r)
24 | mu = newrdotr/rdotr
25 | p = r + mu*p
26 |
27 | rdotr = newrdotr
28 | if rdotr < residual_tol:
29 | break
30 |
31 | if callback is not None:
32 | callback(x)
33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
34 | return x
35 |
--------------------------------------------------------------------------------
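
A minimal usage sketch (not part of the repo) of the cg() helper above: it only needs a callable computing the matrix-vector product A·p, so a small symmetric positive-definite system serves as a quick sanity check. The matrix and tolerances here are illustrative.

    import numpy as np
    from baselines.common.cg import cg

    # Small symmetric positive-definite system A x = b.
    A = np.array([[4.0, 1.0],
                  [1.0, 3.0]])
    b = np.array([1.0, 2.0])

    # cg() never sees A directly, only the product p -> A p.
    x = cg(lambda p: A.dot(p), b, cg_iters=25, verbose=True)

    assert np.allclose(x, np.linalg.solve(A, b), atol=1e-6)
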
/control/baselines/common/console_util.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from contextlib import contextmanager
3 | import numpy as np
4 | import time
5 | import shlex
6 | import subprocess
7 |
8 | # ================================================================
9 | # Misc
10 | # ================================================================
11 |
12 | def fmt_row(width, row, header=False):
13 | out = " | ".join(fmt_item(x, width) for x in row)
14 | if header: out = out + "\n" + "-"*len(out)
15 | return out
16 |
17 | def fmt_item(x, l):
18 | if isinstance(x, np.ndarray):
19 | assert x.ndim==0
20 | x = x.item()
21 | if isinstance(x, (float, np.float32, np.float64)):
22 | v = abs(x)
23 | if (v < 1e-4 or v > 1e+4) and v > 0:
24 | rep = "%7.2e" % x
25 | else:
26 | rep = "%7.5f" % x
27 | else: rep = str(x)
28 | return " "*(l - len(rep)) + rep
29 |
30 | color2num = dict(
31 | gray=30,
32 | red=31,
33 | green=32,
34 | yellow=33,
35 | blue=34,
36 | magenta=35,
37 | cyan=36,
38 | white=37,
39 | crimson=38
40 | )
41 |
42 | def colorize(string, color='green', bold=False, highlight=False):
43 | attr = []
44 | num = color2num[color]
45 | if highlight: num += 10
46 | attr.append(str(num))
47 | if bold: attr.append('1')
48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
49 |
50 | def print_cmd(cmd, dry=False):
51 | if isinstance(cmd, str): # for shell=True
52 | pass
53 | else:
54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd)
55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd))
56 |
57 |
58 | def get_git_commit(cwd=None):
59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8')
60 |
61 | def get_git_commit_message(cwd=None):
62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8')
63 |
64 | def ccap(cmd, dry=False, env=None, **kwargs):
65 | print_cmd(cmd, dry)
66 | if not dry:
67 | subprocess.check_call(cmd, env=env, **kwargs)
68 |
69 |
70 | MESSAGE_DEPTH = 0
71 |
72 | @contextmanager
73 | def timed(msg):
74 | global MESSAGE_DEPTH #pylint: disable=W0603
75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
76 | tstart = time.time()
77 | MESSAGE_DEPTH += 1
78 | yield
79 | MESSAGE_DEPTH -= 1
80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta'))
81 |
--------------------------------------------------------------------------------
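
A tiny sketch (illustrative, not from the repo) of the console helpers above: fmt_row for aligned log rows, colorize for ANSI color, and the timed context manager that reports elapsed time.

    import time
    from baselines.common.console_util import fmt_row, colorize, timed

    # A header row followed by a row of values, each column padded to width 10.
    print(fmt_row(10, ["iter", "loss"], header=True))
    print(fmt_row(10, [1, 0.12345]))

    print(colorize("all good", color='green', bold=True))

    with timed("sleeping briefly"):   # prints the elapsed time in magenta on exit
        time.sleep(0.1)
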
/control/baselines/common/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Dataset(object):
4 | def __init__(self, data_map, deterministic=False, shuffle=True):
5 | self.data_map = data_map
6 | self.deterministic = deterministic
7 | self.enable_shuffle = shuffle
8 | self.n = next(iter(data_map.values())).shape[0]
9 | self._next_id = 0
10 | self.shuffle()
11 |
12 | def shuffle(self):
13 | if self.deterministic:
14 | return
15 | perm = np.arange(self.n)
16 | np.random.shuffle(perm)
17 |
18 | for key in self.data_map:
19 | self.data_map[key] = self.data_map[key][perm]
20 |
21 | self._next_id = 0
22 |
23 | def next_batch(self, batch_size):
24 | if self._next_id >= self.n and self.enable_shuffle:
25 | self.shuffle()
26 |
27 | cur_id = self._next_id
28 | cur_batch_size = min(batch_size, self.n - self._next_id)
29 | self._next_id += cur_batch_size
30 |
31 | data_map = dict()
32 | for key in self.data_map:
33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size]
34 | return data_map
35 |
36 | def iterate_once(self, batch_size):
37 | if self.enable_shuffle: self.shuffle()
38 |
39 | while self._next_id <= self.n - batch_size:
40 | yield self.next_batch(batch_size)
41 | self._next_id = 0
42 |
43 | def subset(self, num_elements, deterministic=True):
44 | data_map = dict()
45 | for key in self.data_map:
46 | data_map[key] = self.data_map[key][:num_elements]
47 | return Dataset(data_map, deterministic)
48 |
49 |
50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True):
51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
52 | arrays = tuple(map(np.asarray, arrays))
53 | n = arrays[0].shape[0]
54 | assert all(a.shape[0] == n for a in arrays[1:])
55 | inds = np.arange(n)
56 | if shuffle: np.random.shuffle(inds)
57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
58 | for batch_inds in np.array_split(inds, sections):
59 | if include_final_partial_batch or len(batch_inds) == batch_size:
60 | yield tuple(a[batch_inds] for a in arrays)
61 |
--------------------------------------------------------------------------------
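
A short usage sketch (assumed, not from the repo) of iterbatches() above: exactly one of batch_size or num_batches must be given, and the final partial batch is yielded by default.

    import numpy as np
    from baselines.common.dataset import iterbatches

    obs = np.arange(20).reshape(10, 2)
    returns = np.arange(10, dtype=np.float32)

    for ob_batch, ret_batch in iterbatches((obs, returns), batch_size=4, shuffle=True):
        # Batches of 4, 4 and 2 rows; rows stay aligned across both arrays.
        print(ob_batch.shape, ret_batch.shape)
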
/control/baselines/common/input.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from gym.spaces import Discrete, Box, MultiDiscrete
4 |
5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'):
6 | '''
7 |     Create a placeholder for feeding observations, sized appropriately for the given observation space
8 |
9 | Parameters:
10 | ----------
11 |
12 | ob_space: gym.Space observation space
13 |
14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases.
15 |
16 | name: str name of the placeholder
17 |
18 | Returns:
19 | -------
20 |
21 | tensorflow placeholder tensor
22 | '''
23 |
24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \
25 |         'Can only deal with Discrete, Box, and MultiDiscrete observation spaces for now'
26 |
27 | dtype = ob_space.dtype
28 | if dtype == np.int8:
29 | dtype = np.uint8
30 |
31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name)
32 |
33 |
34 | def observation_input(ob_space, batch_size=None, name='Ob'):
35 | '''
36 |     Create a placeholder for feeding observations, sized appropriately for the given observation space, and add an input
37 | encoder of the appropriate type.
38 | '''
39 |
40 | placeholder = observation_placeholder(ob_space, batch_size, name)
41 | return placeholder, encode_observation(ob_space, placeholder)
42 |
43 | def encode_observation(ob_space, placeholder):
44 | '''
45 | Encode input in the way that is appropriate to the observation space
46 |
47 | Parameters:
48 | ----------
49 |
50 | ob_space: gym.Space observation space
51 |
52 | placeholder: tf.placeholder observation input placeholder
53 | '''
54 | if isinstance(ob_space, Discrete):
55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n))
56 | elif isinstance(ob_space, Box):
57 | return tf.to_float(placeholder)
58 | elif isinstance(ob_space, MultiDiscrete):
59 | placeholder = tf.cast(placeholder, tf.int32)
60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])]
61 | return tf.concat(one_hots, axis=-1)
62 | else:
63 | raise NotImplementedError
64 |
65 |
--------------------------------------------------------------------------------
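
A hedged sketch (assuming the TensorFlow 1.x and gym versions this repo targets) of observation_input() above for a Box observation space; in the Box case the placeholder is simply cast to float.

    import numpy as np
    import tensorflow as tf
    from gym.spaces import Box
    from baselines.common.input import observation_input

    ob_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    ph, encoded = observation_input(ob_space, batch_size=None, name='Ob')

    with tf.Session() as sess:
        out = sess.run(encoded, feed_dict={ph: np.zeros((2, 4), dtype=np.float32)})
        print(out.shape)   # (2, 4): Box observations are passed through as floats
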
/control/baselines/common/math_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 |
4 |
5 | def discount(x, gamma):
6 | """
7 | computes discounted sums along 0th dimension of x.
8 |
9 | inputs
10 | ------
11 | x: ndarray
12 | gamma: float
13 |
14 | outputs
15 | -------
16 | y: ndarray with same shape as x, satisfying
17 |
18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
19 | where k = len(x) - t - 1
20 |
21 | """
22 | assert x.ndim >= 1
23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
24 |
25 | def explained_variance(ypred,y):
26 | """
27 | Computes fraction of variance that ypred explains about y.
28 | Returns 1 - Var[y-ypred] / Var[y]
29 |
30 | interpretation:
31 | ev=0 => might as well have predicted zero
32 | ev=1 => perfect prediction
33 | ev<0 => worse than just predicting zero
34 |
35 | """
36 | assert y.ndim == 1 and ypred.ndim == 1
37 | vary = np.var(y)
38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary
39 |
40 | def explained_variance_2d(ypred, y):
41 | assert y.ndim == 2 and ypred.ndim == 2
42 | vary = np.var(y, axis=0)
43 | out = 1 - np.var(y-ypred)/vary
44 | out[vary < 1e-10] = 0
45 | return out
46 |
47 | def ncc(ypred, y):
48 | return np.corrcoef(ypred, y)[1,0]
49 |
50 | def flatten_arrays(arrs):
51 | return np.concatenate([arr.flat for arr in arrs])
52 |
53 | def unflatten_vector(vec, shapes):
54 | i=0
55 | arrs = []
56 | for shape in shapes:
57 | size = np.prod(shape)
58 | arr = vec[i:i+size].reshape(shape)
59 | arrs.append(arr)
60 | i += size
61 | return arrs
62 |
63 | def discount_with_boundaries(X, New, gamma):
64 | """
65 | X: 2d array of floats, time x features
66 | New: 2d array of bools, indicating when a new episode has started
67 | """
68 | Y = np.zeros_like(X)
69 | T = X.shape[0]
70 | Y[T-1] = X[T-1]
71 | for t in range(T-2, -1, -1):
72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1])
73 | return Y
74 |
75 | def test_discount_with_boundaries():
76 | gamma=0.9
77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
78 | starts = [1.0, 0.0, 0.0, 1.0]
79 | y = discount_with_boundaries(x, starts, gamma)
80 | assert np.allclose(y, [
81 | 1 + gamma * 2 + gamma**2 * 3,
82 | 2 + gamma * 3,
83 | 3,
84 | 4
85 | ])
86 |
--------------------------------------------------------------------------------
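
A small worked example (not in the repo) of discount() and explained_variance() above: with gamma = 0.5 the discounted sums are computed back to front, e.g. y[0] = 1 + 0.5*2 + 0.25*3 = 2.75.

    import numpy as np
    from baselines.common.math_util import discount, explained_variance

    x = np.array([1.0, 2.0, 3.0])
    print(discount(x, 0.5))                 # [2.75, 3.5, 3.0]

    returns = np.array([1.0, 2.0, 3.0, 4.0])
    predictions = returns * 0.9
    print(explained_variance(predictions, returns))   # ~0.99: near-perfect value predictions
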
/control/baselines/common/mpi_adam.py:
--------------------------------------------------------------------------------
1 | import baselines.common.tf_util as U
2 | import tensorflow as tf
3 | import numpy as np
4 | try:
5 | from mpi4py import MPI
6 | except ImportError:
7 | MPI = None
8 |
9 |
10 | class MpiAdam(object):
11 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
12 | self.var_list = var_list
13 | self.beta1 = beta1
14 | self.beta2 = beta2
15 | self.epsilon = epsilon
16 | self.scale_grad_by_procs = scale_grad_by_procs
17 | size = sum(U.numel(v) for v in var_list)
18 | self.m = np.zeros(size, 'float32')
19 | self.v = np.zeros(size, 'float32')
20 | self.t = 0
21 | self.setfromflat = U.SetFromFlat(var_list)
22 | self.getflat = U.GetFlat(var_list)
23 | self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm
24 |
25 | def update(self, localg, stepsize):
26 | if self.t % 100 == 0:
27 | self.check_synced()
28 | localg = localg.astype('float32')
29 | if self.comm is not None:
30 | globalg = np.zeros_like(localg)
31 | self.comm.Allreduce(localg, globalg, op=MPI.SUM)
32 | if self.scale_grad_by_procs:
33 | globalg /= self.comm.Get_size()
34 | else:
35 | globalg = np.copy(localg)
36 |
37 | self.t += 1
38 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t)
39 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
40 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
41 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
42 | self.setfromflat(self.getflat() + step)
43 |
44 | def sync(self):
45 | if self.comm is None:
46 | return
47 | theta = self.getflat()
48 | self.comm.Bcast(theta, root=0)
49 | self.setfromflat(theta)
50 |
51 | def check_synced(self):
52 | if self.comm is None:
53 | return
54 | if self.comm.Get_rank() == 0: # this is root
55 | theta = self.getflat()
56 | self.comm.Bcast(theta, root=0)
57 | else:
58 | thetalocal = self.getflat()
59 | thetaroot = np.empty_like(thetalocal)
60 | self.comm.Bcast(thetaroot, root=0)
61 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)
62 |
63 | @U.in_session
64 | def test_MpiAdam():
65 | np.random.seed(0)
66 | tf.set_random_seed(0)
67 |
68 | a = tf.Variable(np.random.randn(3).astype('float32'))
69 | b = tf.Variable(np.random.randn(2,5).astype('float32'))
70 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
71 |
72 | stepsize = 1e-2
73 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
74 | do_update = U.function([], loss, updates=[update_op])
75 |
76 | tf.get_default_session().run(tf.global_variables_initializer())
77 | losslist_ref = []
78 | for i in range(10):
79 | l = do_update()
80 | print(i, l)
81 | losslist_ref.append(l)
82 |
83 |
84 |
85 | tf.set_random_seed(0)
86 | tf.get_default_session().run(tf.global_variables_initializer())
87 |
88 | var_list = [a,b]
89 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
90 | adam = MpiAdam(var_list)
91 |
92 | losslist_test = []
93 | for i in range(10):
94 | l,g = lossandgrad()
95 | adam.update(g, stepsize)
96 | print(i,l)
97 | losslist_test.append(l)
98 |
99 | np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
100 |
101 |
102 | if __name__ == '__main__':
103 | test_MpiAdam()
104 |
--------------------------------------------------------------------------------
/control/baselines/common/mpi_adam_optimizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from mpi4py import MPI
4 |
5 | class MpiAdamOptimizer(tf.train.AdamOptimizer):
6 | """Adam optimizer that averages gradients across mpi processes."""
7 | def __init__(self, comm, **kwargs):
8 | self.comm = comm
9 | tf.train.AdamOptimizer.__init__(self, **kwargs)
10 | def compute_gradients(self, loss, var_list, **kwargs):
11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs)
12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0)
14 | shapes = [v.shape.as_list() for g, v in grads_and_vars]
15 | sizes = [int(np.prod(s)) for s in shapes]
16 |
17 | num_tasks = self.comm.Get_size()
18 | buf = np.zeros(sum(sizes), np.float32)
19 |
20 | def _collect_grads(flat_grad):
21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
22 | np.divide(buf, float(num_tasks), out=buf)
23 | return buf
24 |
25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
26 | avg_flat_grad.set_shape(flat_grad.shape)
27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
29 | for g, (_, v) in zip(avg_grads, grads_and_vars)]
30 |
31 | return avg_grads_and_vars
32 |
--------------------------------------------------------------------------------
/control/baselines/common/mpi_fork.py:
--------------------------------------------------------------------------------
1 | import os, subprocess, sys
2 |
3 | def mpi_fork(n, bind_to_core=False):
4 | """Re-launches the current script with workers
5 | Returns "parent" for original parent, "child" for MPI children
6 | """
7 | if n<=1:
8 | return "child"
9 | if os.getenv("IN_MPI") is None:
10 | env = os.environ.copy()
11 | env.update(
12 | MKL_NUM_THREADS="1",
13 | OMP_NUM_THREADS="1",
14 | IN_MPI="1"
15 | )
16 | args = ["mpirun", "-np", str(n)]
17 | if bind_to_core:
18 | args += ["-bind-to", "core"]
19 | args += [sys.executable] + sys.argv
20 | subprocess.check_call(args, env=env)
21 | return "parent"
22 | else:
23 | return "child"
24 |
--------------------------------------------------------------------------------
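
A hedged sketch (an assumed usage pattern, not taken from the repo) of how mpi_fork() above is typically called: the original process re-launches itself under mpirun and returns "parent", while each MPI worker continues past the call.

    from baselines.common.mpi_fork import mpi_fork

    def main():
        # Re-exec this script as `mpirun -np 4 <python> <script> ...` if not already under MPI.
        if mpi_fork(4) == "parent":
            return                      # the launching process exits once mpirun finishes
        from mpi4py import MPI
        print("worker rank", MPI.COMM_WORLD.Get_rank(), "is running")

    if __name__ == "__main__":
        main()
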
/control/baselines/common/mpi_moments.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import numpy as np
3 | from baselines.common import zipsame
4 |
5 |
6 | def mpi_mean(x, axis=0, comm=None, keepdims=False):
7 | x = np.asarray(x)
8 | assert x.ndim > 0
9 | if comm is None: comm = MPI.COMM_WORLD
10 | xsum = x.sum(axis=axis, keepdims=keepdims)
11 | n = xsum.size
12 | localsum = np.zeros(n+1, x.dtype)
13 | localsum[:n] = xsum.ravel()
14 | localsum[n] = x.shape[axis]
15 | globalsum = np.zeros_like(localsum)
16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM)
17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
18 |
19 | def mpi_moments(x, axis=0, comm=None, keepdims=False):
20 | x = np.asarray(x)
21 | assert x.ndim > 0
22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
23 | sqdiffs = np.square(x - mean)
24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
25 | assert count1 == count
26 | std = np.sqrt(meansqdiff)
27 | if not keepdims:
28 | newshape = mean.shape[:axis] + mean.shape[axis+1:]
29 | mean = mean.reshape(newshape)
30 | std = std.reshape(newshape)
31 | return mean, std, count
32 |
33 |
34 | def test_runningmeanstd():
35 | import subprocess
36 | subprocess.check_call(['mpirun', '-np', '3',
37 | 'python','-c',
38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
39 |
40 | def _helper_runningmeanstd():
41 | comm = MPI.COMM_WORLD
42 | np.random.seed(0)
43 | for (triple,axis) in [
44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0),
45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0),
46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1),
47 | ]:
48 |
49 |
50 | x = np.concatenate(triple, axis=axis)
51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
52 |
53 |
54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis)
55 |
56 | for (a1,a2) in zipsame(ms1, ms2):
57 | print(a1, a2)
58 | assert np.allclose(a1, a2)
59 | print("ok!")
60 |
61 |
--------------------------------------------------------------------------------
/control/baselines/common/mpi_running_mean_std.py:
--------------------------------------------------------------------------------
1 | try:
2 | from mpi4py import MPI
3 | except ImportError:
4 | MPI = None
5 |
6 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np
7 |
8 | class RunningMeanStd(object):
9 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
10 | def __init__(self, epsilon=1e-2, shape=()):
11 |
12 | self._sum = tf.get_variable(
13 | dtype=tf.float64,
14 | shape=shape,
15 | initializer=tf.constant_initializer(0.0),
16 | name="runningsum", trainable=False)
17 | self._sumsq = tf.get_variable(
18 | dtype=tf.float64,
19 | shape=shape,
20 | initializer=tf.constant_initializer(epsilon),
21 | name="runningsumsq", trainable=False)
22 | self._count = tf.get_variable(
23 | dtype=tf.float64,
24 | shape=(),
25 | initializer=tf.constant_initializer(epsilon),
26 | name="count", trainable=False)
27 | self.shape = shape
28 |
29 | self.mean = tf.to_float(self._sum / self._count)
30 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 ))
31 |
32 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
33 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
34 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
35 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
36 | updates=[tf.assign_add(self._sum, newsum),
37 | tf.assign_add(self._sumsq, newsumsq),
38 | tf.assign_add(self._count, newcount)])
39 |
40 |
41 | def update(self, x):
42 | x = x.astype('float64')
43 | n = int(np.prod(self.shape))
44 | totalvec = np.zeros(n*2+1, 'float64')
45 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')])
46 | if MPI is not None:
47 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
48 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n])
49 |
50 | @U.in_session
51 | def test_runningmeanstd():
52 | for (x1, x2, x3) in [
53 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
54 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
55 | ]:
56 |
57 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
58 | U.initialize()
59 |
60 | x = np.concatenate([x1, x2, x3], axis=0)
61 | ms1 = [x.mean(axis=0), x.std(axis=0)]
62 | rms.update(x1)
63 | rms.update(x2)
64 | rms.update(x3)
65 | ms2 = [rms.mean.eval(), rms.std.eval()]
66 |
67 | assert np.allclose(ms1, ms2)
68 |
69 | @U.in_session
70 | def test_dist():
71 | np.random.seed(0)
72 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1))
73 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1))
74 |
75 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5))
76 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8))
77 |
78 | comm = MPI.COMM_WORLD
79 | assert comm.Get_size()==2
80 | if comm.Get_rank()==0:
81 | x1,x2,x3 = p1,p2,p3
82 | elif comm.Get_rank()==1:
83 | x1,x2,x3 = q1,q2,q3
84 | else:
85 | assert False
86 |
87 | rms = RunningMeanStd(epsilon=0.0, shape=(1,))
88 | U.initialize()
89 |
90 | rms.update(x1)
91 | rms.update(x2)
92 | rms.update(x3)
93 |
94 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3])
95 |
96 | def checkallclose(x,y):
97 | print(x,y)
98 | return np.allclose(x,y)
99 |
100 | assert checkallclose(
101 | bigvec.mean(axis=0),
102 | rms.mean.eval(),
103 | )
104 | assert checkallclose(
105 | bigvec.std(axis=0),
106 | rms.std.eval(),
107 | )
108 |
109 |
110 | if __name__ == "__main__":
111 | # Run with mpirun -np 2 python
112 | test_dist()
113 |
--------------------------------------------------------------------------------
/control/baselines/common/mpi_util.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from mpi4py import MPI
3 | import os, numpy as np
4 | import platform
5 | import shutil
6 | import subprocess
7 |
8 | def sync_from_root(sess, variables, comm=None):
9 | """
10 | Send the root node's parameters to every worker.
11 | Arguments:
12 | sess: the TensorFlow session.
13 | variables: all parameter variables including optimizer's
14 | """
15 | if comm is None: comm = MPI.COMM_WORLD
16 | rank = comm.Get_rank()
17 | for var in variables:
18 | if rank == 0:
19 | comm.Bcast(sess.run(var))
20 | else:
21 | import tensorflow as tf
22 | returned_var = np.empty(var.shape, dtype='float32')
23 | comm.Bcast(returned_var)
24 | sess.run(tf.assign(var, returned_var))
25 |
26 | def gpu_count():
27 | """
28 | Count the GPUs on this machine.
29 | """
30 | if shutil.which('nvidia-smi') is None:
31 | return 0
32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv'])
33 | return max(0, len(output.split(b'\n')) - 2)
34 |
35 | def setup_mpi_gpus():
36 | """
37 | Set CUDA_VISIBLE_DEVICES using MPI.
38 | """
39 | num_gpus = gpu_count()
40 | if num_gpus == 0:
41 | return
42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD)
43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus)
44 |
45 | def get_local_rank_size(comm):
46 | """
47 | Returns the rank of each process on its machine
48 | The processes on a given machine will be assigned ranks
49 | 0, 1, 2, ..., N-1,
50 | where N is the number of processes on this machine.
51 |
52 | Useful if you want to assign one gpu per machine
53 | """
54 | this_node = platform.node()
55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node))
56 | node2rankssofar = defaultdict(int)
57 | local_rank = None
58 | for (rank, node) in ranks_nodes:
59 | if rank == comm.Get_rank():
60 | local_rank = node2rankssofar[node]
61 | node2rankssofar[node] += 1
62 | assert local_rank is not None
63 | return local_rank, node2rankssofar[this_node]
64 |
65 | def share_file(comm, path):
66 | """
67 | Copies the file from rank 0 to all other ranks
68 | Puts it in the same place on all machines
69 | """
70 | localrank, _ = get_local_rank_size(comm)
71 | if comm.Get_rank() == 0:
72 | with open(path, 'rb') as fh:
73 | data = fh.read()
74 | comm.bcast(data)
75 | else:
76 | data = comm.bcast(None)
77 | if localrank == 0:
78 | os.makedirs(os.path.dirname(path), exist_ok=True)
79 | with open(path, 'wb') as fh:
80 | fh.write(data)
81 | comm.Barrier()
82 |
83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True):
84 | if comm is None: return d
85 | alldicts = comm.allgather(d)
86 | size = comm.size
87 | k2li = defaultdict(list)
88 | for d in alldicts:
89 | for (k,v) in d.items():
90 | k2li[k].append(v)
91 | result = {}
92 | for (k,li) in k2li.items():
93 | if assert_all_have_data:
94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k)
95 | if op=='mean':
96 | result[k] = np.mean(li, axis=0)
97 | elif op=='sum':
98 | result[k] = np.sum(li, axis=0)
99 | else:
100 | assert 0, op
101 | return result
102 |
--------------------------------------------------------------------------------
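
A short sketch (assumed usage, not from the repo) of dict_gather() above, which averages per-worker scalar statistics; run it under e.g. `mpirun -np 2 python script.py`.

    from mpi4py import MPI
    from baselines.common.mpi_util import dict_gather

    comm = MPI.COMM_WORLD
    # Each worker contributes its own statistics dictionary.
    local_stats = {'ep_reward': float(comm.Get_rank())}
    mean_stats = dict_gather(comm, local_stats, op='mean')

    if comm.Get_rank() == 0:
        print(mean_stats)   # {'ep_reward': 0.5} with two workers (ranks 0 and 1)
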
/control/baselines/common/runners.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import ABC, abstractmethod
3 |
4 | class AbstractEnvRunner(ABC):
5 | def __init__(self, *, env, model, nsteps):
6 | self.env = env
7 | self.model = model
8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
11 | self.obs[:] = env.reset()
12 | self.nsteps = nsteps
13 | self.states = model.initial_state
14 | self.dones = [False for _ in range(nenv)]
15 |
16 | @abstractmethod
17 | def run(self):
18 | raise NotImplementedError
19 |
20 |
--------------------------------------------------------------------------------
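
A minimal sketch (illustrative only, not part of the repo) of subclassing AbstractEnvRunner above: a concrete runner just implements run(), stepping a vectorized env for nsteps and returning whatever rollout arrays the algorithm needs. The RandomRunner name and its random-action policy are assumptions for the example; a real runner would query the model instead.

    import numpy as np
    from baselines.common.runners import AbstractEnvRunner

    class RandomRunner(AbstractEnvRunner):
        """Collects nsteps of experience using uniformly random actions."""
        def run(self):
            mb_obs, mb_rewards = [], []
            for _ in range(self.nsteps):
                actions = np.array([self.env.action_space.sample() for _ in range(self.nenv)])
                mb_obs.append(self.obs.copy())
                self.obs[:], rewards, self.dones, _ = self.env.step(actions)
                mb_rewards.append(rewards)
            return np.asarray(mb_obs), np.asarray(mb_rewards)
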
/control/baselines/common/schedules.py:
--------------------------------------------------------------------------------
1 | """This file is used for specifying various schedules that evolve over
2 | time throughout the execution of the algorithm, such as:
3 | - learning rate for the optimizer
4 | - exploration epsilon for the epsilon greedy exploration strategy
5 |  - beta parameter for prioritized replay
6 |
7 | Each schedule has a function `value(t)` which returns the current value
8 | of the parameter given the timestep t of the optimization procedure.
9 | """
10 |
11 |
12 | class Schedule(object):
13 | def value(self, t):
14 | """Value of the schedule at time t"""
15 | raise NotImplementedError()
16 |
17 |
18 | class ConstantSchedule(object):
19 | def __init__(self, value):
20 | """Value remains constant over time.
21 |
22 | Parameters
23 | ----------
24 | value: float
25 | Constant value of the schedule
26 | """
27 | self._v = value
28 |
29 | def value(self, t):
30 | """See Schedule.value"""
31 | return self._v
32 |
33 |
34 | def linear_interpolation(l, r, alpha):
35 | return l + alpha * (r - l)
36 |
37 |
38 | class PiecewiseSchedule(object):
39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
40 | """Piecewise schedule.
41 |
42 | endpoints: [(int, int)]
43 |             list of pairs `(time, value)` meaning that schedule should output
44 | `value` when `t==time`. All the values for time must be sorted in
45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)`
46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of
48 | time passed between `time_a` and `time_b` for time `t`.
49 | interpolation: lambda float, float, float: float
50 | a function that takes value to the left and to the right of t according
51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to
52 | right endpoint that t has covered. See linear_interpolation for example.
53 | outside_value: float
54 |             if the value is requested outside of all the intervals specified in
55 | `endpoints` this value is returned. If None then AssertionError is
56 | raised when outside value is requested.
57 | """
58 | idxes = [e[0] for e in endpoints]
59 | assert idxes == sorted(idxes)
60 | self._interpolation = interpolation
61 | self._outside_value = outside_value
62 | self._endpoints = endpoints
63 |
64 | def value(self, t):
65 | """See Schedule.value"""
66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
67 | if l_t <= t and t < r_t:
68 | alpha = float(t - l_t) / (r_t - l_t)
69 | return self._interpolation(l, r, alpha)
70 |
71 | # t does not belong to any of the pieces, so doom.
72 | assert self._outside_value is not None
73 | return self._outside_value
74 |
75 |
76 | class LinearSchedule(object):
77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
78 | """Linear interpolation between initial_p and final_p over
79 | schedule_timesteps. After this many timesteps pass final_p is
80 | returned.
81 |
82 | Parameters
83 | ----------
84 | schedule_timesteps: int
85 | Number of timesteps for which to linearly anneal initial_p
86 | to final_p
87 | initial_p: float
88 | initial output value
89 | final_p: float
90 | final output value
91 | """
92 | self.schedule_timesteps = schedule_timesteps
93 | self.final_p = final_p
94 | self.initial_p = initial_p
95 |
96 | def value(self, t):
97 | """See Schedule.value"""
98 | fraction = min(float(t) / self.schedule_timesteps, 1.0)
99 | return self.initial_p + fraction * (self.final_p - self.initial_p)
100 |
--------------------------------------------------------------------------------
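
A small usage sketch (not from the repo) of the schedule classes above: a linearly annealed learning rate and a piecewise exploration epsilon, both queried by timestep.

    from baselines.common.schedules import LinearSchedule, PiecewiseSchedule

    # Anneal from 1e-3 to 1e-4 over 100 steps, then stay at 1e-4.
    lr = LinearSchedule(schedule_timesteps=100, initial_p=1e-3, final_p=1e-4)
    print(lr.value(0), lr.value(50), lr.value(10000))   # 0.001  0.00055  0.0001

    # Epsilon drops linearly from 1.0 to 0.1 over the first 100 steps, then is clamped.
    eps = PiecewiseSchedule([(0, 1.0), (100, 0.1)], outside_value=0.1)
    print(eps.value(50), eps.value(500))                # 0.55  0.1
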
/control/baselines/common/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/common/tests/__init__.py
--------------------------------------------------------------------------------
/control/baselines/common/tests/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/common/tests/envs/__init__.py
--------------------------------------------------------------------------------
/control/baselines/common/tests/envs/fixed_sequence_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import Env
3 | from gym.spaces import Discrete
4 |
5 |
6 | class FixedSequenceEnv(Env):
7 | def __init__(
8 | self,
9 | n_actions=10,
10 | seed=0,
11 | episode_len=100
12 | ):
13 | self.np_random = np.random.RandomState()
14 | self.np_random.seed(seed)
15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)]
16 |
17 | self.action_space = Discrete(n_actions)
18 | self.observation_space = Discrete(1)
19 |
20 | self.episode_len = episode_len
21 | self.time = 0
22 | self.reset()
23 |
24 | def reset(self):
25 | self.time = 0
26 | return 0
27 |
28 | def step(self, actions):
29 | rew = self._get_reward(actions)
30 | self._choose_next_state()
31 | done = False
32 | if self.episode_len and self.time >= self.episode_len:
33 | rew = 0
34 | done = True
35 |
36 | return 0, rew, done, {}
37 |
38 | def _choose_next_state(self):
39 | self.time += 1
40 |
41 | def _get_reward(self, actions):
42 | return 1 if actions == self.sequence[self.time] else 0
43 |
44 |
45 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/envs/identity_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import abstractmethod
3 | from gym import Env
4 | from gym.spaces import MultiDiscrete, Discrete, Box
5 |
6 |
7 | class IdentityEnv(Env):
8 | def __init__(
9 | self,
10 | episode_len=None
11 | ):
12 |
13 | self.episode_len = episode_len
14 | self.time = 0
15 | self.reset()
16 |
17 | def reset(self):
18 | self._choose_next_state()
19 | self.time = 0
20 | self.observation_space = self.action_space
21 |
22 | return self.state
23 |
24 | def step(self, actions):
25 | rew = self._get_reward(actions)
26 | self._choose_next_state()
27 | done = False
28 | if self.episode_len and self.time >= self.episode_len:
29 | rew = 0
30 | done = True
31 |
32 | return self.state, rew, done, {}
33 |
34 | def _choose_next_state(self):
35 | self.state = self.action_space.sample()
36 | self.time += 1
37 |
38 | @abstractmethod
39 | def _get_reward(self, actions):
40 | raise NotImplementedError
41 |
42 |
43 | class DiscreteIdentityEnv(IdentityEnv):
44 | def __init__(
45 | self,
46 | dim,
47 | episode_len=None,
48 | ):
49 |
50 | self.action_space = Discrete(dim)
51 | super().__init__(episode_len=episode_len)
52 |
53 | def _get_reward(self, actions):
54 | return 1 if self.state == actions else 0
55 |
56 | class MultiDiscreteIdentityEnv(IdentityEnv):
57 | def __init__(
58 | self,
59 | dims,
60 | episode_len=None,
61 | ):
62 |
63 | self.action_space = MultiDiscrete(dims)
64 | super().__init__(episode_len=episode_len)
65 |
66 | def _get_reward(self, actions):
67 | return 1 if all(self.state == actions) else 0
68 |
69 |
70 | class BoxIdentityEnv(IdentityEnv):
71 | def __init__(
72 | self,
73 | shape,
74 | episode_len=None,
75 | ):
76 |
77 | self.action_space = Box(low=-1.0, high=1.0, shape=shape)
78 | super().__init__(episode_len=episode_len)
79 |
80 | def _get_reward(self, actions):
81 | diff = actions - self.state
82 | diff = diff[:]
83 | return -0.5 * np.dot(diff, diff)
84 |
--------------------------------------------------------------------------------
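
A minimal interaction sketch (not in the repo) with DiscreteIdentityEnv above: the optimal policy simply echoes the observation back as the action. Note that the env zeroes the reward on the terminal step, so a perfect 10-step episode scores 9.

    from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv

    env = DiscreteIdentityEnv(dim=5, episode_len=10)
    ob = env.reset()
    total, done = 0, False
    while not done:
        ob, rew, done, _ = env.step(ob)   # action = current observation => reward 1 per step
        total += rew
    print(total)   # 9: every step matches, but the final step's reward is zeroed out
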
/control/baselines/common/tests/envs/mnist_env.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import numpy as np
3 | import tempfile
4 | from gym import Env
5 | from gym.spaces import Discrete, Box
6 |
7 |
8 |
9 | class MnistEnv(Env):
10 | def __init__(
11 | self,
12 | seed=0,
13 | episode_len=None,
14 | no_images=None
15 | ):
16 | import filelock
17 | from tensorflow.examples.tutorials.mnist import input_data
18 | # we could use temporary directory for this with a context manager and
19 |         # TemporaryDirectory, but then each test that uses mnist would re-download the data
20 | # this way the data is not cleaned up, but we only download it once per machine
21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
22 | with filelock.FileLock(mnist_path + '.lock'):
23 | self.mnist = input_data.read_data_sets(mnist_path)
24 |
25 | self.np_random = np.random.RandomState()
26 | self.np_random.seed(seed)
27 |
28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1))
29 | self.action_space = Discrete(10)
30 | self.episode_len = episode_len
31 | self.time = 0
32 | self.no_images = no_images
33 |
34 | self.train_mode()
35 | self.reset()
36 |
37 | def reset(self):
38 | self._choose_next_state()
39 | self.time = 0
40 |
41 | return self.state[0]
42 |
43 | def step(self, actions):
44 | rew = self._get_reward(actions)
45 | self._choose_next_state()
46 | done = False
47 | if self.episode_len and self.time >= self.episode_len:
48 | rew = 0
49 | done = True
50 |
51 | return self.state[0], rew, done, {}
52 |
53 | def train_mode(self):
54 | self.dataset = self.mnist.train
55 |
56 | def test_mode(self):
57 | self.dataset = self.mnist.test
58 |
59 | def _choose_next_state(self):
60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1
61 | index = self.np_random.randint(0, max_index)
62 | image = self.dataset.images[index].reshape(28,28,1)*255
63 | label = self.dataset.labels[index]
64 | self.state = (image, label)
65 | self.time += 1
66 |
67 | def _get_reward(self, actions):
68 | return 1 if self.state[1] == actions else 0
69 |
70 |
71 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_cartpole.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 |
7 | common_kwargs = dict(
8 | total_timesteps=30000,
9 | network='mlp',
10 | gamma=1.0,
11 | seed=0,
12 | )
13 |
14 | learn_kwargs = {
15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
16 | 'acer': dict(value_network='copy'),
17 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False),
18 | 'deepq': dict(total_timesteps=20000),
19 | 'ppo2': dict(value_network='copy'),
20 | 'trpo_mpi': {}
21 | }
22 |
23 | @pytest.mark.slow
24 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
25 | def test_cartpole(alg):
26 | '''
27 | Test if the algorithm (with an mlp policy)
28 | can learn to balance the cartpole
29 | '''
30 |
31 | kwargs = common_kwargs.copy()
32 | kwargs.update(learn_kwargs[alg])
33 |
34 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
35 | def env_fn():
36 |
37 | env = gym.make('CartPole-v0')
38 | env.seed(0)
39 | return env
40 |
41 | reward_per_episode_test(env_fn, learn_fn, 100)
42 |
43 | if __name__ == '__main__':
44 | test_cartpole('acer')
45 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_doc_examples.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | try:
3 | import mujoco_py
4 | _mujoco_present = True
5 | except BaseException:
6 | mujoco_py = None
7 | _mujoco_present = False
8 |
9 |
10 | @pytest.mark.skipif(
11 | not _mujoco_present,
12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
13 | )
14 | def test_lstm_example():
15 | import tensorflow as tf
16 | from baselines.common import policies, models, cmd_util
17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
18 |
19 | # create vectorized environment
20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])
21 |
22 | with tf.Session() as sess:
23 | # build policy based on lstm network with 128 units
24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)
25 |
26 | # initialize tensorflow variables
27 | sess.run(tf.global_variables_initializer())
28 |
29 | # prepare environment variables
30 | ob = venv.reset()
31 | state = policy.initial_state
32 | done = [False]
33 | step_counter = 0
34 |
35 | # run a single episode until the end (i.e. until done)
36 | while True:
37 | action, _, state, _ = policy.step(ob, S=state, M=done)
38 | ob, reward, done, _ = venv.step(action)
39 | step_counter += 1
40 | if done:
41 | break
42 |
43 |
44 | assert step_counter > 5
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_env_after_learn.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 | import tensorflow as tf
4 |
5 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
6 | from baselines.run import get_learn_function
7 | from baselines.common.tf_util import make_session
8 |
9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
10 |
11 | @pytest.mark.parametrize('algo', algos)
12 | def test_env_after_learn(algo):
13 | def make_env():
14 | # acktr requires too much RAM, fails on travis
15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
16 | return env
17 |
18 | make_session(make_default=True, graph=tf.Graph())
19 | env = SubprocVecEnv([make_env])
20 |
21 | learn = get_learn_function(algo)
22 |
23 | # Commenting out the following line resolves the issue, though crash happens at env.reset().
24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)
25 |
26 | env.reset()
27 | env.close()
28 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_fetchreach.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 |
7 | pytest.importorskip('mujoco_py')
8 |
9 | common_kwargs = dict(
10 | network='mlp',
11 | seed=0,
12 | )
13 |
14 | learn_kwargs = {
15 | 'her': dict(total_timesteps=2000)
16 | }
17 |
18 | @pytest.mark.slow
19 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
20 | def test_fetchreach(alg):
21 | '''
22 | Test if the algorithm (with an mlp policy)
23 | can learn the FetchReach task
24 | '''
25 |
26 | kwargs = common_kwargs.copy()
27 | kwargs.update(learn_kwargs[alg])
28 |
29 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
30 | def env_fn():
31 |
32 | env = gym.make('FetchReach-v1')
33 | env.seed(0)
34 | return env
35 |
36 | reward_per_episode_test(env_fn, learn_fn, -15)
37 |
38 | if __name__ == '__main__':
39 | test_fetchreach('her')
40 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_fixed_sequence.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv
3 |
4 | from baselines.common.tests.util import simple_test
5 | from baselines.run import get_learn_function
6 |
7 | common_kwargs = dict(
8 | seed=0,
9 | total_timesteps=50000,
10 | )
11 |
12 | learn_kwargs = {
13 | 'a2c': {},
14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),
15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)
16 | # github issue: https://github.com/openai/baselines/issues/188
17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)
18 | }
19 |
20 |
21 | alg_list = learn_kwargs.keys()
22 | rnn_list = ['lstm']
23 |
24 | @pytest.mark.slow
25 | @pytest.mark.parametrize("alg", alg_list)
26 | @pytest.mark.parametrize("rnn", rnn_list)
27 | def test_fixed_sequence(alg, rnn):
28 | '''
29 |     Test if the algorithm (with a recurrent policy)
30 |     can learn to reproduce a fixed sequence of actions
31 | '''
32 |
33 | kwargs = learn_kwargs[alg]
34 | kwargs.update(common_kwargs)
35 |
36 | episode_len = 5
37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
38 | learn = lambda e: get_learn_function(alg)(
39 | env=e,
40 | network=rnn,
41 | **kwargs
42 | )
43 |
44 | simple_test(env_fn, learn, 0.7)
45 |
46 |
47 | if __name__ == '__main__':
48 | test_fixed_sequence('ppo2', 'lstm')
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_identity.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv
3 | from baselines.run import get_learn_function
4 | from baselines.common.tests.util import simple_test
5 |
6 | common_kwargs = dict(
7 | total_timesteps=30000,
8 | network='mlp',
9 | gamma=0.9,
10 | seed=0,
11 | )
12 |
13 | learn_kwargs = {
14 | 'a2c' : {},
15 | 'acktr': {},
16 | 'deepq': {},
17 | 'ddpg': dict(layer_norm=True),
18 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0),
19 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01)
20 | }
21 |
22 |
23 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
24 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi']
25 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi']
26 |
27 | @pytest.mark.slow
28 | @pytest.mark.parametrize("alg", algos_disc)
29 | def test_discrete_identity(alg):
30 | '''
31 | Test if the algorithm (with an mlp policy)
32 | can learn an identity transformation (i.e. return observation as an action)
33 | '''
34 |
35 | kwargs = learn_kwargs[alg]
36 | kwargs.update(common_kwargs)
37 |
38 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
39 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100)
40 | simple_test(env_fn, learn_fn, 0.9)
41 |
42 | @pytest.mark.slow
43 | @pytest.mark.parametrize("alg", algos_multidisc)
44 | def test_multidiscrete_identity(alg):
45 | '''
46 | Test if the algorithm (with an mlp policy)
47 | can learn an identity transformation (i.e. return observation as an action)
48 | '''
49 |
50 | kwargs = learn_kwargs[alg]
51 | kwargs.update(common_kwargs)
52 |
53 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
54 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100)
55 | simple_test(env_fn, learn_fn, 0.9)
56 |
57 | @pytest.mark.slow
58 | @pytest.mark.parametrize("alg", algos_cont)
59 | def test_continuous_identity(alg):
60 | '''
61 | Test if the algorithm (with an mlp policy)
62 | can learn an identity transformation (i.e. return observation as an action)
63 | to a required precision
64 | '''
65 |
66 | kwargs = learn_kwargs[alg]
67 | kwargs.update(common_kwargs)
68 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
69 |
70 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100)
71 | simple_test(env_fn, learn_fn, -0.1)
72 |
73 | if __name__ == '__main__':
74 | test_multidiscrete_identity('acktr')
75 |
76 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_mnist.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | # from baselines.acer import acer_simple as acer
4 | from baselines.common.tests.envs.mnist_env import MnistEnv
5 | from baselines.common.tests.util import simple_test
6 | from baselines.run import get_learn_function
7 |
8 |
9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
10 | # GitHub issue https://github.com/openai/baselines/issues/189
11 | common_kwargs = {
12 | 'seed': 0,
13 | 'network':'cnn',
14 | 'gamma':0.9,
15 | 'pad':'SAME'
16 | }
17 |
18 | learn_args = {
19 | 'a2c': dict(total_timesteps=50000),
20 | 'acer': dict(total_timesteps=20000),
21 | 'deepq': dict(total_timesteps=5000),
22 | 'acktr': dict(total_timesteps=30000),
23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0),
24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
25 | }
26 |
27 |
28 | # Tests pass, but are too slow on Travis. The same algorithms are covered
29 | # by other tests with less compute-hungry networks and by benchmarks
30 | @pytest.mark.skip
31 | @pytest.mark.slow
32 | @pytest.mark.parametrize("alg", learn_args.keys())
33 | def test_mnist(alg):
34 | '''
35 | Test if the algorithm can learn to classify MNIST digits.
36 | Uses CNN policy.
37 | '''
38 |
39 | learn_kwargs = learn_args[alg]
40 | learn_kwargs.update(common_kwargs)
41 |
42 | learn = get_learn_function(alg)
43 | learn_fn = lambda e: learn(env=e, **learn_kwargs)
44 | env_fn = lambda: MnistEnv(seed=0, episode_len=100)
45 |
46 | simple_test(env_fn, learn_fn, 0.6)
47 |
48 | if __name__ == '__main__':
49 | test_mnist('acer')
50 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_schedules.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule
4 |
5 |
6 | def test_piecewise_schedule():
7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500)
8 |
9 | assert np.isclose(ps.value(-10), 500)
10 | assert np.isclose(ps.value(0), 150)
11 | assert np.isclose(ps.value(5), 200)
12 | assert np.isclose(ps.value(9), 80)
13 | assert np.isclose(ps.value(50), 50)
14 | assert np.isclose(ps.value(80), 50)
15 | assert np.isclose(ps.value(150), 0)
16 | assert np.isclose(ps.value(175), -25)
17 | assert np.isclose(ps.value(201), 500)
18 | assert np.isclose(ps.value(500), 500)
19 |
20 | assert np.isclose(ps.value(200 - 1e-10), -50)
21 |
22 |
23 | def test_constant_schedule():
24 | cs = ConstantSchedule(5)
25 | for i in range(-100, 100):
26 | assert np.isclose(cs.value(i), 5)
27 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_segment_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
4 |
5 |
6 | def test_tree_set():
7 | tree = SumSegmentTree(4)
8 |
9 | tree[2] = 1.0
10 | tree[3] = 3.0
11 |
12 | assert np.isclose(tree.sum(), 4.0)
13 | assert np.isclose(tree.sum(0, 2), 0.0)
14 | assert np.isclose(tree.sum(0, 3), 1.0)
15 | assert np.isclose(tree.sum(2, 3), 1.0)
16 | assert np.isclose(tree.sum(2, -1), 1.0)
17 | assert np.isclose(tree.sum(2, 4), 4.0)
18 |
19 |
20 | def test_tree_set_overlap():
21 | tree = SumSegmentTree(4)
22 |
23 | tree[2] = 1.0
24 | tree[2] = 3.0
25 |
26 | assert np.isclose(tree.sum(), 3.0)
27 | assert np.isclose(tree.sum(2, 3), 3.0)
28 | assert np.isclose(tree.sum(2, -1), 3.0)
29 | assert np.isclose(tree.sum(2, 4), 3.0)
30 | assert np.isclose(tree.sum(1, 2), 0.0)
31 |
32 |
33 | def test_prefixsum_idx():
34 | tree = SumSegmentTree(4)
35 |
36 | tree[2] = 1.0
37 | tree[3] = 3.0
38 |
39 | assert tree.find_prefixsum_idx(0.0) == 2
40 | assert tree.find_prefixsum_idx(0.5) == 2
41 | assert tree.find_prefixsum_idx(0.99) == 2
42 | assert tree.find_prefixsum_idx(1.01) == 3
43 | assert tree.find_prefixsum_idx(3.00) == 3
44 | assert tree.find_prefixsum_idx(4.00) == 3
45 |
46 |
47 | def test_prefixsum_idx2():
48 | tree = SumSegmentTree(4)
49 |
50 | tree[0] = 0.5
51 | tree[1] = 1.0
52 | tree[2] = 1.0
53 | tree[3] = 3.0
54 |
55 | assert tree.find_prefixsum_idx(0.00) == 0
56 | assert tree.find_prefixsum_idx(0.55) == 1
57 | assert tree.find_prefixsum_idx(0.99) == 1
58 | assert tree.find_prefixsum_idx(1.51) == 2
59 | assert tree.find_prefixsum_idx(3.00) == 3
60 | assert tree.find_prefixsum_idx(5.50) == 3
61 |
62 |
63 | def test_max_interval_tree():
64 | tree = MinSegmentTree(4)
65 |
66 | tree[0] = 1.0
67 | tree[2] = 0.5
68 | tree[3] = 3.0
69 |
70 | assert np.isclose(tree.min(), 0.5)
71 | assert np.isclose(tree.min(0, 2), 1.0)
72 | assert np.isclose(tree.min(0, 3), 0.5)
73 | assert np.isclose(tree.min(0, -1), 0.5)
74 | assert np.isclose(tree.min(2, 4), 0.5)
75 | assert np.isclose(tree.min(3, 4), 3.0)
76 |
77 | tree[2] = 0.7
78 |
79 | assert np.isclose(tree.min(), 0.7)
80 | assert np.isclose(tree.min(0, 2), 1.0)
81 | assert np.isclose(tree.min(0, 3), 0.7)
82 | assert np.isclose(tree.min(0, -1), 0.7)
83 | assert np.isclose(tree.min(2, 4), 0.7)
84 | assert np.isclose(tree.min(3, 4), 3.0)
85 |
86 | tree[2] = 4.0
87 |
88 | assert np.isclose(tree.min(), 1.0)
89 | assert np.isclose(tree.min(0, 2), 1.0)
90 | assert np.isclose(tree.min(0, 3), 1.0)
91 | assert np.isclose(tree.min(0, -1), 1.0)
92 | assert np.isclose(tree.min(2, 4), 3.0)
93 | assert np.isclose(tree.min(2, 3), 4.0)
94 | assert np.isclose(tree.min(2, -1), 4.0)
95 | assert np.isclose(tree.min(3, 4), 3.0)
96 |
97 |
98 | if __name__ == '__main__':
99 | test_tree_set()
100 | test_tree_set_overlap()
101 | test_prefixsum_idx()
102 | test_prefixsum_idx2()
103 | test_max_interval_tree()
104 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/test_tf_util.py:
--------------------------------------------------------------------------------
1 | # tests for tf_util
2 | import tensorflow as tf
3 | from baselines.common.tf_util import (
4 | function,
5 | initialize,
6 | single_threaded_session
7 | )
8 |
9 |
10 | def test_function():
11 | with tf.Graph().as_default():
12 | x = tf.placeholder(tf.int32, (), name="x")
13 | y = tf.placeholder(tf.int32, (), name="y")
14 | z = 3 * x + 2 * y
15 | lin = function([x, y], z, givens={y: 0})
16 |
17 | with single_threaded_session():
18 | initialize()
19 |
20 | assert lin(2) == 6
21 | assert lin(2, 2) == 10
22 |
23 |
24 | def test_multikwargs():
25 | with tf.Graph().as_default():
26 | x = tf.placeholder(tf.int32, (), name="x")
27 | with tf.variable_scope("other"):
28 | x2 = tf.placeholder(tf.int32, (), name="x")
29 | z = 3 * x + 2 * x2
30 |
31 | lin = function([x, x2], z, givens={x2: 0})
32 | with single_threaded_session():
33 | initialize()
34 | assert lin(2) == 6
35 | assert lin(2, 2) == 10
36 |
37 |
38 | if __name__ == '__main__':
39 | test_function()
40 | test_multikwargs()
41 |
--------------------------------------------------------------------------------
/control/baselines/common/tests/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from gym.spaces import np_random
4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
5 |
6 | N_TRIALS = 10000
7 | N_EPISODES = 100
8 |
9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
10 | np.random.seed(0)
11 | np_random.seed(0)
12 |
13 | env = DummyVecEnv([env_fn])
14 |
15 |
16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
17 | tf.set_random_seed(0)
18 |
19 | model = learn_fn(env)
20 |
21 | sum_rew = 0
22 | done = True
23 |
24 | for i in range(n_trials):
25 | if done:
26 | obs = env.reset()
27 | state = model.initial_state
28 |
29 | if state is not None:
30 | a, v, state, _ = model.step(obs, S=state, M=[False])
31 | else:
32 | a, v, _, _ = model.step(obs)
33 |
34 | obs, rew, done, _ = env.step(a)
35 | sum_rew += float(rew)
36 |
37 | print("Reward in {} trials is {}".format(n_trials, sum_rew))
38 | assert sum_rew > min_reward_fraction * n_trials, \
39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)
40 |
41 |
42 |
43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
44 | env = DummyVecEnv([env_fn])
45 |
46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
47 | model = learn_fn(env)
48 |
49 |         # use the n_trials argument instead of shadowing the module-level N_TRIALS
50 |
51 |         observations, actions, rewards = rollout(env, model, n_trials)
52 |         rewards = [sum(r) for r in rewards]
53 |
54 |         avg_rew = sum(rewards) / n_trials
55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
56 | assert avg_rew > min_avg_reward, \
57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)
58 |
59 | def rollout(env, model, n_trials):
60 | rewards = []
61 | actions = []
62 | observations = []
63 |
64 | for i in range(n_trials):
65 | obs = env.reset()
66 | state = model.initial_state if hasattr(model, 'initial_state') else None
67 | episode_rew = []
68 | episode_actions = []
69 | episode_obs = []
70 |
71 | while True:
72 | if state is not None:
73 | a, v, state, _ = model.step(obs, S=state, M=[False])
74 | else:
75 |                 a, v, _, _ = model.step(obs)
76 |
77 | obs, rew, done, _ = env.step(a)
78 |
79 | episode_rew.append(rew)
80 | episode_actions.append(a)
81 | episode_obs.append(obs)
82 |
83 | if done:
84 | break
85 |
86 | rewards.append(episode_rew)
87 | actions.append(episode_actions)
88 | observations.append(episode_obs)
89 |
90 | return observations, actions, rewards
91 |
92 |
--------------------------------------------------------------------------------
/control/baselines/common/tile_images.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def tile_images(img_nhwc):
4 | """
5 | Tile N images into one big PxQ image
6 | (P,Q) are chosen to be as close as possible, and if N
7 | is square, then P=Q.
8 |
9 | input: img_nhwc, list or array of images, ndim=4 once turned into array
10 | n = batch index, h = height, w = width, c = channel
11 | returns:
12 | bigim_HWc, ndarray with ndim=3
13 | """
14 | img_nhwc = np.asarray(img_nhwc)
15 | N, h, w, c = img_nhwc.shape
16 | H = int(np.ceil(np.sqrt(N)))
17 | W = int(np.ceil(float(N)/H))
18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
22 | return img_Hh_Ww_c
23 |
24 |
--------------------------------------------------------------------------------
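A minimal usage sketch of tile_images (not part of the repo); with N=5 inputs, H = ceil(sqrt(5)) = 3 and W = ceil(5/3) = 2, and the unused grid cells are zero-padded:

import numpy as np
from baselines.common.tile_images import tile_images

frames = np.zeros((5, 64, 64, 3), dtype=np.uint8)   # N=5 RGB frames of 64x64
grid = tile_images(frames)                          # tiled into an H x W = 3 x 2 grid
assert grid.shape == (3 * 64, 2 * 64, 3)

--------------------------------------------------------------------------------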
/control/baselines/common/vec_env/dummy_vec_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import spaces
3 | from . import VecEnv
4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info
5 |
6 | class DummyVecEnv(VecEnv):
7 | """
 8 |     VecEnv that runs multiple environments sequentially, that is,
 9 |     the step and reset commands are sent to one environment at a time.
10 | Useful when debugging and when num_env == 1 (in the latter case,
11 | avoids communication overhead)
12 | """
13 | def __init__(self, env_fns):
14 | """
15 | Arguments:
16 |
17 |         env_fns: iterable of callables that build environments
18 | """
19 | self.envs = [fn() for fn in env_fns]
20 | env = self.envs[0]
21 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
22 | obs_space = env.observation_space
23 | self.keys, shapes, dtypes = obs_space_info(obs_space)
24 |
25 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
26 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
27 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
28 | self.buf_infos = [{} for _ in range(self.num_envs)]
29 | self.actions = None
30 | self.specs = [e.spec for e in self.envs]
31 |
32 | def step_async(self, actions):
33 | listify = True
34 | try:
35 | if len(actions) == self.num_envs:
36 | listify = False
37 | except TypeError:
38 | pass
39 |
40 | if not listify:
41 | self.actions = actions
42 | else:
43 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
44 | self.actions = [actions]
45 |
46 | def step_wait(self):
47 | for e in range(self.num_envs):
48 | action = self.actions[e]
49 | if isinstance(self.envs[e].action_space, spaces.Discrete):
50 | action = int(action)
51 |
52 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
53 | if self.buf_dones[e]:
54 | obs = self.envs[e].reset()
55 | self._save_obs(e, obs)
56 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
57 | self.buf_infos.copy())
58 |
59 | def reset(self):
60 | for e in range(self.num_envs):
61 | obs = self.envs[e].reset()
62 | self._save_obs(e, obs)
63 | return self._obs_from_buf()
64 |
65 | def _save_obs(self, e, obs):
66 | for k in self.keys:
67 | if k is None:
68 | self.buf_obs[k][e] = obs
69 | else:
70 | self.buf_obs[k][e] = obs[k]
71 |
72 | def _obs_from_buf(self):
73 | return dict_to_obs(copy_obs_dict(self.buf_obs))
74 |
75 | def get_images(self):
76 | return [env.render(mode='rgb_array') for env in self.envs]
77 |
78 | def render(self, mode='human'):
79 | if self.num_envs == 1:
80 | return self.envs[0].render(mode=mode)
81 | else:
82 | return super().render(mode=mode)
83 |
--------------------------------------------------------------------------------
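A minimal usage sketch of DummyVecEnv (not part of the repo; CartPole-v0 is an illustrative environment choice):

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])   # num_envs == 1, no subprocess overhead
obs = venv.reset()                                      # batched observation, shape (1, 4) for CartPole
obs, rews, dones, infos = venv.step([venv.action_space.sample()])
venv.close()

--------------------------------------------------------------------------------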
/control/baselines/common/vec_env/test_vec_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for asynchronous vectorized environments.
3 | """
4 |
5 | import gym
6 | import numpy as np
7 | import pytest
8 | from .dummy_vec_env import DummyVecEnv
9 | from .shmem_vec_env import ShmemVecEnv
10 | from .subproc_vec_env import SubprocVecEnv
11 |
12 |
13 | def assert_envs_equal(env1, env2, num_steps):
14 | """
15 | Compare two environments over num_steps steps and make sure
16 | that the observations produced by each are the same when given
17 | the same actions.
18 | """
19 | assert env1.num_envs == env2.num_envs
20 | assert env1.action_space.shape == env2.action_space.shape
21 | assert env1.action_space.dtype == env2.action_space.dtype
22 | joint_shape = (env1.num_envs,) + env1.action_space.shape
23 |
24 | try:
25 | obs1, obs2 = env1.reset(), env2.reset()
26 | assert np.array(obs1).shape == np.array(obs2).shape
27 | assert np.array(obs1).shape == joint_shape
28 | assert np.allclose(obs1, obs2)
29 | np.random.seed(1337)
30 | for _ in range(num_steps):
31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape),
32 | dtype=env1.action_space.dtype)
33 | for env in [env1, env2]:
34 | env.step_async(actions)
35 | outs1 = env1.step_wait()
36 | outs2 = env2.step_wait()
37 | for out1, out2 in zip(outs1[:3], outs2[:3]):
38 | assert np.array(out1).shape == np.array(out2).shape
39 | assert np.allclose(out1, out2)
40 | assert list(outs1[3]) == list(outs2[3])
41 | finally:
42 | env1.close()
43 | env2.close()
44 |
45 |
46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv))
47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32'))
48 | def test_vec_env(klass, dtype): # pylint: disable=R0914
49 | """
50 | Test that a vectorized environment is equivalent to
51 | DummyVecEnv, since DummyVecEnv is less likely to be
52 | error prone.
53 | """
54 | num_envs = 3
55 | num_steps = 100
56 | shape = (3, 8)
57 |
58 | def make_fn(seed):
59 | """
60 | Get an environment constructor with a seed.
61 | """
62 | return lambda: SimpleEnv(seed, shape, dtype)
63 | fns = [make_fn(i) for i in range(num_envs)]
64 | env1 = DummyVecEnv(fns)
65 | env2 = klass(fns)
66 | assert_envs_equal(env1, env2, num_steps=num_steps)
67 |
68 |
69 | class SimpleEnv(gym.Env):
70 | """
71 | An environment with a pre-determined observation space
72 | and RNG seed.
73 | """
74 |
75 | def __init__(self, seed, shape, dtype):
76 | np.random.seed(seed)
77 | self._dtype = dtype
78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape),
79 | dtype=dtype)
80 | self._max_steps = seed + 1
81 | self._cur_obs = None
82 | self._cur_step = 0
83 | # this is 0xFF instead of 0x100 because the Box space includes
84 | # the high end, while randint does not
85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype)
86 | self.observation_space = self.action_space
87 |
88 | def step(self, action):
89 | self._cur_obs += np.array(action, dtype=self._dtype)
90 | self._cur_step += 1
91 | done = self._cur_step >= self._max_steps
92 | reward = self._cur_step / self._max_steps
93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)}
94 |
95 | def reset(self):
96 | self._cur_obs = self._start_obs
97 | self._cur_step = 0
98 | return self._cur_obs
99 |
100 | def render(self, mode=None):
101 | raise NotImplementedError
102 |
--------------------------------------------------------------------------------
/control/baselines/common/vec_env/test_video_recorder.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for asynchronous vectorized environments.
3 | """
4 |
5 | import gym
6 | import pytest
7 | import os
8 | import glob
9 | import tempfile
10 |
11 | from .dummy_vec_env import DummyVecEnv
12 | from .shmem_vec_env import ShmemVecEnv
13 | from .subproc_vec_env import SubprocVecEnv
14 | from .vec_video_recorder import VecVideoRecorder
15 |
16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv))
17 | @pytest.mark.parametrize('num_envs', (1, 4))
18 | @pytest.mark.parametrize('video_length', (10, 100))
19 | @pytest.mark.parametrize('video_interval', (1, 50))
20 | def test_video_recorder(klass, num_envs, video_length, video_interval):
21 | """
22 |     Wrap an existing VecEnv with VecVideoRecorder,
23 |     take (video_interval + video_length + 1) steps,
24 |     then check that the recorded video files are present
25 | """
26 |
27 | def make_fn():
28 | env = gym.make('PongNoFrameskip-v4')
29 | return env
30 | fns = [make_fn for _ in range(num_envs)]
31 | env = klass(fns)
32 |
33 | with tempfile.TemporaryDirectory() as video_path:
34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length)
35 |
36 | env.reset()
37 | for _ in range(video_interval + video_length + 1):
38 | env.step([0] * num_envs)
39 | env.close()
40 |
41 |
42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4"))
43 |
44 | # first and second step
45 | assert len(recorded_video) == 2
46 | # Files are not empty
47 | assert all(os.stat(p).st_size != 0 for p in recorded_video)
48 |
49 |
50 |
--------------------------------------------------------------------------------
/control/baselines/common/vec_env/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for dealing with vectorized environments.
3 | """
4 |
5 | from collections import OrderedDict
6 |
7 | import gym
8 | import numpy as np
9 |
10 |
11 | def copy_obs_dict(obs):
12 | """
13 | Deep-copy an observation dict.
14 | """
15 | return {k: np.copy(v) for k, v in obs.items()}
16 |
17 |
18 | def dict_to_obs(obs_dict):
19 | """
20 | Convert an observation dict into a raw array if the
21 | original observation space was not a Dict space.
22 | """
23 | if set(obs_dict.keys()) == {None}:
24 | return obs_dict[None]
25 | return obs_dict
26 |
27 |
28 | def obs_space_info(obs_space):
29 | """
30 | Get dict-structured information about a gym.Space.
31 |
32 | Returns:
33 | A tuple (keys, shapes, dtypes):
34 | keys: a list of dict keys.
35 | shapes: a dict mapping keys to shapes.
36 | dtypes: a dict mapping keys to dtypes.
37 | """
38 | if isinstance(obs_space, gym.spaces.Dict):
39 | assert isinstance(obs_space.spaces, OrderedDict)
40 | subspaces = obs_space.spaces
41 | else:
42 | subspaces = {None: obs_space}
43 | keys = []
44 | shapes = {}
45 | dtypes = {}
46 | for key, box in subspaces.items():
47 | keys.append(key)
48 | shapes[key] = box.shape
49 | dtypes[key] = box.dtype
50 | return keys, shapes, dtypes
51 |
52 |
53 | def obs_to_dict(obs):
54 | """
55 | Convert an observation into a dict.
56 | """
57 | if isinstance(obs, dict):
58 | return obs
59 | return {None: obs}
60 |
--------------------------------------------------------------------------------
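An illustrative sketch (not part of the repo) of what obs_space_info returns for a Dict space; for a non-Dict space the single key is None:

import gym
import numpy as np
from baselines.common.vec_env.util import obs_space_info

space = gym.spaces.Dict({'achieved': gym.spaces.Box(-1.0, 1.0, shape=(3,), dtype=np.float32),
                         'observation': gym.spaces.Box(-1.0, 1.0, shape=(5,), dtype=np.float32)})
keys, shapes, dtypes = obs_space_info(space)
# keys   == ['achieved', 'observation']
# shapes == {'achieved': (3,), 'observation': (5,)}
# dtypes == {'achieved': float32, 'observation': float32}

--------------------------------------------------------------------------------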
/control/baselines/common/vec_env/vec_frame_stack.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | import numpy as np
3 | from gym import spaces
4 |
5 |
6 | class VecFrameStack(VecEnvWrapper):
7 | def __init__(self, venv, nstack):
8 | self.venv = venv
9 | self.nstack = nstack
10 | wos = venv.observation_space # wrapped ob space
11 | low = np.repeat(wos.low, self.nstack, axis=-1)
12 | high = np.repeat(wos.high, self.nstack, axis=-1)
13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
16 |
17 | def step_wait(self):
18 | obs, rews, news, infos = self.venv.step_wait()
19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
20 | for (i, new) in enumerate(news):
21 | if new:
22 | self.stackedobs[i] = 0
23 | self.stackedobs[..., -obs.shape[-1]:] = obs
24 | return self.stackedobs, rews, news, infos
25 |
26 | def reset(self):
27 | obs = self.venv.reset()
28 | self.stackedobs[...] = 0
29 | self.stackedobs[..., -obs.shape[-1]:] = obs
30 | return self.stackedobs
31 |
--------------------------------------------------------------------------------
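A small numpy sketch (not part of the repo; array names are illustrative) of the stacking update performed in step_wait: the oldest frame is rolled out of the last axis and the newest written into its final obs.shape[-1] channels:

import numpy as np

stacked = np.zeros((1, 2, 2, 4))                  # (num_envs, h, w, nstack) with 1 channel per frame
new_obs = np.ones((1, 2, 2, 1))                   # latest observation from the wrapped venv
stacked = np.roll(stacked, shift=-1, axis=-1)     # drop the oldest frame
stacked[..., -new_obs.shape[-1]:] = new_obs       # write the newest frame into the last slot

--------------------------------------------------------------------------------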
/control/baselines/common/vec_env/vec_monitor.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | from baselines.bench.monitor import ResultsWriter
3 | import numpy as np
4 | import time
5 |
6 |
7 | class VecMonitor(VecEnvWrapper):
8 | def __init__(self, venv, filename=None):
9 | VecEnvWrapper.__init__(self, venv)
10 | self.eprets = None
11 | self.eplens = None
12 | self.tstart = time.time()
13 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart})
14 |
15 | def reset(self):
16 | obs = self.venv.reset()
17 | self.eprets = np.zeros(self.num_envs, 'f')
18 | self.eplens = np.zeros(self.num_envs, 'i')
19 | return obs
20 |
21 | def step_wait(self):
22 | obs, rews, dones, infos = self.venv.step_wait()
23 | self.eprets += rews
24 | self.eplens += 1
25 | newinfos = []
26 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)):
27 | info = info.copy()
28 | if done:
29 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)}
30 | info['episode'] = epinfo
31 | self.eprets[i] = 0
32 | self.eplens[i] = 0
33 | self.results_writer.write_row(epinfo)
34 |
35 | newinfos.append(info)
36 |
37 | return obs, rews, dones, newinfos
38 |
--------------------------------------------------------------------------------
/control/baselines/common/vec_env/vec_normalize.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | from baselines.common.running_mean_std import RunningMeanStd
3 | import numpy as np
4 |
5 |
6 | class VecNormalize(VecEnvWrapper):
7 | """
8 | A vectorized wrapper that normalizes the observations
9 | and returns from an environment.
10 | """
11 |
12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
13 | VecEnvWrapper.__init__(self, venv)
14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None
16 | self.clipob = clipob
17 | self.cliprew = cliprew
18 | self.ret = np.zeros(self.num_envs)
19 | self.gamma = gamma
20 | self.epsilon = epsilon
21 |
22 | def step_wait(self):
23 | obs, rews, news, infos = self.venv.step_wait()
24 | self.ret = self.ret * self.gamma + rews
25 | obs = self._obfilt(obs)
26 | if self.ret_rms:
27 | self.ret_rms.update(self.ret)
28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
29 | self.ret[news] = 0.
30 | return obs, rews, news, infos
31 |
32 | def _obfilt(self, obs):
33 | if self.ob_rms:
34 | self.ob_rms.update(obs)
35 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
36 | return obs
37 | else:
38 | return obs
39 |
40 | def reset(self):
41 | self.ret = np.zeros(self.num_envs)
42 | obs = self.venv.reset()
43 | return self._obfilt(obs)
44 |
--------------------------------------------------------------------------------
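A small numpy sketch (not part of the repo; the statistics are made-up stand-ins for the RunningMeanStd values) of the transformations applied in _obfilt and step_wait:

import numpy as np

obs, ob_mean, ob_var = np.array([1.5, -0.2]), np.array([1.0, 0.0]), np.array([4.0, 1.0])
rew, ret_var = 2.0, 9.0
clipob, cliprew, epsilon = 10.0, 10.0, 1e-8

obs_norm = np.clip((obs - ob_mean) / np.sqrt(ob_var + epsilon), -clipob, clipob)  # ~[0.25, -0.2]
rew_norm = np.clip(rew / np.sqrt(ret_var + epsilon), -cliprew, cliprew)           # ~0.667

--------------------------------------------------------------------------------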
/control/baselines/common/vec_env/vec_video_recorder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from baselines import logger
3 | from baselines.common.vec_env import VecEnvWrapper
4 | from gym.wrappers.monitoring import video_recorder
5 |
6 |
7 | class VecVideoRecorder(VecEnvWrapper):
8 | """
9 | Wrap VecEnv to record rendered image as mp4 video.
10 | """
11 |
12 | def __init__(self, venv, directory, record_video_trigger, video_length=200):
13 | """
14 | # Arguments
15 | venv: VecEnv to wrap
16 | directory: Where to save videos
17 | record_video_trigger:
18 | Function that defines when to start recording.
19 |             The function takes the current step number,
20 | and returns whether we should start recording or not.
21 | video_length: Length of recorded video
22 | """
23 |
24 | VecEnvWrapper.__init__(self, venv)
25 | self.record_video_trigger = record_video_trigger
26 | self.video_recorder = None
27 |
28 | self.directory = os.path.abspath(directory)
29 | if not os.path.exists(self.directory): os.mkdir(self.directory)
30 |
31 | self.file_prefix = "vecenv"
32 | self.file_infix = '{}'.format(os.getpid())
33 | self.step_id = 0
34 | self.video_length = video_length
35 |
36 | self.recording = False
37 | self.recorded_frames = 0
38 |
39 | def reset(self):
40 | obs = self.venv.reset()
41 |
42 | self.start_video_recorder()
43 |
44 | return obs
45 |
46 | def start_video_recorder(self):
47 | self.close_video_recorder()
48 |
49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id))
50 | self.video_recorder = video_recorder.VideoRecorder(
51 | env=self.venv,
52 | base_path=base_path,
53 | metadata={'step_id': self.step_id}
54 | )
55 |
56 | self.video_recorder.capture_frame()
57 | self.recorded_frames = 1
58 | self.recording = True
59 |
60 | def _video_enabled(self):
61 | return self.record_video_trigger(self.step_id)
62 |
63 | def step_wait(self):
64 | obs, rews, dones, infos = self.venv.step_wait()
65 |
66 | self.step_id += 1
67 | if self.recording:
68 | self.video_recorder.capture_frame()
69 | self.recorded_frames += 1
70 | if self.recorded_frames > self.video_length:
71 | logger.info("Saving video to ", self.video_recorder.path)
72 | self.close_video_recorder()
73 | elif self._video_enabled():
74 | self.start_video_recorder()
75 |
76 | return obs, rews, dones, infos
77 |
78 | def close_video_recorder(self):
79 | if self.recording:
80 | self.video_recorder.close()
81 | self.recording = False
82 | self.recorded_frames = 0
83 |
84 | def close(self):
85 | VecEnvWrapper.close(self)
86 | self.close_video_recorder()
87 |
88 | def __del__(self):
89 | self.close()
90 |
--------------------------------------------------------------------------------
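A minimal usage sketch of VecVideoRecorder (not part of the repo; the environment choice is illustrative and must support rgb_array rendering):

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder

venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
venv = VecVideoRecorder(venv, '/tmp/videos',                               # directory is created if missing
                        record_video_trigger=lambda step: step % 1000 == 0,
                        video_length=200)                                  # frames per recorded clip
venv.reset()
for _ in range(1200):
    venv.step([venv.action_space.sample()])
venv.close()

--------------------------------------------------------------------------------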
/control/baselines/ppoc_int/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/ppoc_int/__init__.py
--------------------------------------------------------------------------------
/control/baselines/ppoc_int/assets/twod_tmaze.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/baselines/ppoc_int/assets/twod_tmaze.xml
--------------------------------------------------------------------------------
/control/baselines/ppoc_int/normalized_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym import spaces
4 |
5 | from gym.envs.registration import load
6 |
7 |
8 | class NormalizedActionWrapper(gym.ActionWrapper):
9 | """Environment wrapper to normalize the action space to [-1, 1]. This
10 | wrapper is adapted from rllab's [1] wrapper `NormalizedEnv`
11 | https://github.com/rll/rllab/blob/b3a28992eca103cab3cb58363dd7a4bb07f250a0/rllab/envs/normalized_env.py
12 | [1] Yan Duan, Xi Chen, Rein Houthooft, John Schulman, Pieter Abbeel,
13 | "Benchmarking Deep Reinforcement Learning for Continuous Control", 2016
14 | (https://arxiv.org/abs/1604.06778)
15 | """
16 | def __init__(self, env):
17 | super(NormalizedActionWrapper, self).__init__(env)
18 | self.action_space = spaces.Box(low=-1.0, high=1.0,
19 | shape=self.env.action_space.shape)
20 |
21 | def action(self, action):
22 | # Clip the action in [-1, 1]
23 | action = np.clip(action, -1.0, 1.0)
24 | # Map the normalized action to original action space
25 | lb, ub = self.env.action_space.low, self.env.action_space.high
26 | action = lb + 0.5 * (action + 1.0) * (ub - lb)
27 | return action
28 |
29 | def reverse_action(self, action):
30 | # Map the original action to normalized action space
31 | lb, ub = self.env.action_space.low, self.env.action_space.high
32 | action = 2.0 * (action - lb) / (ub - lb) - 1.0
33 | # Clip the action in [-1, 1]
34 | action = np.clip(action, -1.0, 1.0)
35 | return action
36 |
37 |
38 |
39 | def mujoco_wrapper(entry_point, **kwargs):
40 | # Load the environment from its entry point
41 | env_cls = load(entry_point)
42 | env = env_cls(**kwargs)
43 | # Normalization wrapper
44 | env = NormalizedActionWrapper(env)
45 | return env
--------------------------------------------------------------------------------
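A numeric sketch (not part of the repo) of the affine map used in action(): a normalized action in [-1, 1] is rescaled to the wrapped env's [lb, ub] bounds:

lb, ub = 0.0, 4.0
for a in (-1.0, 0.0, 1.0):
    print(a, '->', lb + 0.5 * (a + 1.0) * (ub - lb))   # -1 -> 0.0, 0 -> 2.0, 1 -> 4.0

--------------------------------------------------------------------------------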
/control/baselines/ppoc_int/plot_res.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import seaborn as sns; sns.set(color_codes=True)
3 | import numpy as np
4 | from collections import deque
5 | import pdb
6 | sns.set(style='ticks')
7 |
8 | name='TMaze'
9 |
10 | seeds = [0,1,2,3,4,]
11 | shortest=np.inf
12 | data=[]
13 | axes=[]
14 | direc='res'
15 | for seed in seeds:
16 | dat = np.genfromtxt('{}/{}seed{}_intfc1_2opts.csv'.format(direc,name,seed), delimiter=',')[1:200,1]
17 | print(len(dat))
18 | if len(dat) < shortest:
19 | shortest=len(dat)
20 |
21 | rewbuffer = deque(maxlen=100)
22 | real_dat=[]
23 | for d in dat:
24 | rewbuffer.append(d)
25 | real_dat.append(np.mean(rewbuffer))
26 | data.append(real_dat)
27 | for i in range(len(data)):
28 | data[i] = data[i][:shortest]
29 | axes.append(sns.tsplot(data=data,legend=True,condition='IOC',color='red'))
30 |
31 |
32 |
33 | shortest=np.inf
34 | data=[]
35 | for seed in seeds:
36 | dat = np.genfromtxt('{}/{}seed{}_intfc0_2opts.csv'.format(direc,name,seed), delimiter=',')[1:200,1]
37 | print(len(dat))
38 | if len(dat) < shortest:
39 | shortest=len(dat)
40 |
41 | rewbuffer = deque(maxlen=100)
42 | real_dat=[]
43 | for d in dat:
44 | rewbuffer.append(d)
45 | real_dat.append(np.mean(rewbuffer))
46 | data.append(real_dat)
47 | for i in range(len(data)):
48 | data[i] = data[i][:shortest]
49 | axes.append(sns.tsplot(data=data,legend=True,condition='OC',color='blue'))
50 |
51 |
52 | plt.gcf().subplots_adjust(bottom=0.15)
53 | plt.xlabel('Iterations',fontsize=18)
54 | plt.ylabel('Average Rewards',fontsize=18)
55 | plt.legend()
56 | plt.title("Results on {}-v0".format(name))
57 | plt.savefig('plots/{}_notrans.png'.format(name))
58 | plt.clf()
59 |
--------------------------------------------------------------------------------
/control/baselines/ppoc_int/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from baselines.common import set_global_seeds, tf_util as U
3 | import gym, logging
4 | from baselines import logger
5 | from half_cheetah import *
6 |
7 |
8 | def train(env_id, num_timesteps, seed, num_options,app, saves ,wsaves, epoch,dc,plots,w_intfc,switch,mainlr,intlr,piolr,fewshot,k):
9 | from baselines.ppoc_int import mlp_policy, pposgd_simple
10 | U.make_session(num_cpu=1).__enter__()
11 | set_global_seeds(seed)
12 |
13 | if env_id=="TMaze":
14 | from twod_tmaze import TMaze
15 | env=TMaze()
16 | env.seed(seed)
17 | else:
18 | env = gym.make(env_id)
19 | env._seed(seed)
20 |
21 |
22 | def policy_fn(name, ob_space, ac_space):
23 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
24 | hid_size=64, num_hid_layers=2, num_options=num_options, dc=dc, w_intfc=w_intfc,k=k)
25 |
26 | gym.logger.setLevel(logging.WARN)
27 |
28 | if num_options ==1:
29 | optimsize=64
30 | elif num_options ==2:
31 | optimsize=32
32 | else:
33 | optimsize=int(64/num_options)
34 |
35 |
36 | num_timesteps = num_timesteps #if env_id!="TMaze" else 5e5
37 | tperbatch = 2048 if not epoch else int(1e4)
38 | pposgd_simple.learn(env, policy_fn,
39 | max_timesteps=num_timesteps,
40 | timesteps_per_batch=tperbatch,
41 | clip_param=0.2, entcoeff=0.0,
42 | optim_epochs=10, optim_stepsize=mainlr, optim_batchsize=optimsize,
43 | gamma=0.99, lam=0.95, schedule='constant', num_options=num_options,
44 | app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed,dc=dc,plots=plots,
45 | w_intfc=w_intfc,switch=switch,intlr=intlr,piolr=piolr,fewshot=fewshot,k=k
46 | )
47 | env.close()
48 |
49 | def main():
50 | import argparse
51 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
52 | parser.add_argument('--env', help='environment ID', default='TMaze')
53 | parser.add_argument('--timesteps', help='number of timesteps', type=int, default=1e6)
54 | parser.add_argument('--seed', help='RNG seed', type=int, default=1)
55 | parser.add_argument('--opt', help='number of options', type=int, default=2)
56 | parser.add_argument('--app', help='Append to folder name', type=str, default='')
57 | parser.add_argument('--saves', help='Save the returns at each iteration', dest='saves', action='store_true', default=False)
58 | parser.add_argument('--wsaves', help='Save the weights',dest='wsaves', action='store_true', default=False)
59 | parser.add_argument('--plots', help='Plot some visualization', dest='plots', action='store_true', default=False)
60 | parser.add_argument('--switch', help='Switch task after 150 iterations', dest='switch', action='store_true', default=False)
61 | parser.add_argument('--fewshot', help='Value learning after 150 iterations', dest='fewshot', action='store_true', default=False)
62 |     parser.add_argument('--nointfc', help='Disables interest functions', dest='w_intfc', action='store_false', default=True)
63 | parser.add_argument('--epoch', help='Load weights from a certain epoch', type=int, default=0)
64 | parser.add_argument('--dc', help='Deliberation cost (not used)', type=float, default=0.)
65 | parser.add_argument('--mainlr', type=float, default=3e-4)
66 | parser.add_argument('--intlr', type=float, default=1e-4)
67 | parser.add_argument('--piolr', type=float, default=1e-4)
68 | parser.add_argument('--k', type=float, default=0., help='threshold for interest function')
69 |
70 |
71 |
72 |
73 | args = parser.parse_args()
74 |
75 | train(args.env, num_timesteps=args.timesteps, seed=args.seed, num_options=args.opt, app=args.app,
76 | saves=args.saves, wsaves=args.wsaves, epoch=args.epoch,dc=args.dc,plots=args.plots,
77 | w_intfc=args.w_intfc,switch=args.switch,mainlr=args.mainlr,intlr=args.intlr,piolr=args.piolr,fewshot=args.fewshot,k=args.k)
78 |
79 |
80 | if __name__ == '__main__':
81 | main()
82 |
--------------------------------------------------------------------------------
/control/baselines/ppoc_int/seeding.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import numpy as np
3 | import os
4 | import random as _random
5 | import struct
6 | import sys
7 |
8 | from gym import error
9 |
10 | if sys.version_info < (3,):
11 | integer_types = (int, long)
12 | else:
13 | integer_types = (int,)
14 |
15 | # Fortunately not needed right now!
16 | #
17 | # def random(seed=None):
18 | # seed = _seed(seed)
19 | #
20 | # rng = _random.Random()
21 | # rng.seed(hash_seed(seed))
22 | # return rng, seed
23 |
24 | def np_random(seed=None):
25 | if seed is not None and not (isinstance(seed, integer_types) and 0 <= seed):
26 | raise error.Error('Seed must be a non-negative integer or omitted, not {}'.format(seed))
27 |
28 | seed = _seed(seed)
29 |
30 | rng = np.random.RandomState()
31 | rng.seed(_int_list_from_bigint(hash_seed(seed)))
32 | return rng, seed
33 |
34 | def hash_seed(seed=None, max_bytes=8):
35 | """Any given evaluation is likely to have many PRNG's active at
36 | once. (Most commonly, because the environment is running in
37 | multiple processes.) There's literature indicating that having
38 | linear correlations between seeds of multiple PRNG's can correlate
39 | the outputs:
40 |
41 | http://blogs.unity3d.com/2015/01/07/a-primer-on-repeatable-random-numbers/
42 | http://stackoverflow.com/questions/1554958/how-different-do-random-seeds-need-to-be
43 | http://dl.acm.org/citation.cfm?id=1276928
44 |
45 | Thus, for sanity we hash the seeds before using them. (This scheme
46 | is likely not crypto-strength, but it should be good enough to get
47 | rid of simple correlations.)
48 |
49 | Args:
50 | seed (Optional[int]): None seeds from an operating system specific randomness source.
51 | max_bytes: Maximum number of bytes to use in the hashed seed.
52 | """
53 | if seed is None:
54 | seed = _seed(max_bytes=max_bytes)
55 | hash = hashlib.sha512(str(seed).encode('utf8')).digest()
56 | return _bigint_from_bytes(hash[:max_bytes])
57 |
58 | def _seed(a=None, max_bytes=8):
59 | """Create a strong random seed. Otherwise, Python 2 would seed using
60 | the system time, which might be non-robust especially in the
61 | presence of concurrency.
62 |
63 | Args:
64 | a (Optional[int, str]): None seeds from an operating system specific randomness source.
65 | max_bytes: Maximum number of bytes to use in the seed.
66 | """
67 | # Adapted from https://svn.python.org/projects/python/tags/r32/Lib/random.py
68 | if a is None:
69 | a = _bigint_from_bytes(os.urandom(max_bytes))
70 | elif isinstance(a, str):
71 | a = a.encode('utf8')
72 | a += hashlib.sha512(a).digest()
73 | a = _bigint_from_bytes(a[:max_bytes])
74 | elif isinstance(a, integer_types):
75 | a = a % 2**(8 * max_bytes)
76 | else:
77 | raise error.Error('Invalid type for seed: {} ({})'.format(type(a), a))
78 |
79 | return a
80 |
81 | # TODO: don't hardcode sizeof_int here
82 | def _bigint_from_bytes(bytes):
83 | sizeof_int = 4
84 | padding = sizeof_int - len(bytes) % sizeof_int
85 | bytes += b'\0' * padding
86 | int_count = int(len(bytes) / sizeof_int)
87 | unpacked = struct.unpack("{}I".format(int_count), bytes)
88 | accum = 0
89 | for i, val in enumerate(unpacked):
90 | accum += 2 ** (sizeof_int * 8 * i) * val
91 | return accum
92 |
93 | def _int_list_from_bigint(bigint):
94 | # Special case 0
95 | if bigint < 0:
96 | raise error.Error('Seed must be non-negative, not {}'.format(bigint))
97 | elif bigint == 0:
98 | return [0]
99 |
100 | ints = []
101 | while bigint > 0:
102 | bigint, mod = divmod(bigint, 2 ** 32)
103 | ints.append(mod)
104 | return ints
105 |
--------------------------------------------------------------------------------
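A minimal usage sketch (not part of the repo; the import path assumes the package layout of this directory):

from baselines.ppoc_int import seeding

rng, seed = seeding.np_random(7)         # the seed is hashed before being fed to the RandomState
print(seed)                              # prints 7: the original (unhashed) seed is returned
print(rng.uniform(-1.0, 1.0, size=3))    # deterministic for a fixed seed

--------------------------------------------------------------------------------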
/control/baselines/ppoc_int/twod_tmaze.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco import mujoco_env
3 | from gym.spaces import Box
4 | import seeding
5 |
6 |
7 | class TwoDEnv(mujoco_env.MujocoEnv):
8 | def __init__(self, model_path, frame_skip, xbounds, ybounds):
9 | super(TwoDEnv, self).__init__(model_path=model_path, frame_skip=frame_skip)
10 | assert isinstance(self.observation_space, Box)
11 | assert self.observation_space.shape == (2,)
12 |
13 | def get_viewer(self):
14 | return self._get_viewer()
15 |
16 | import numpy as np
17 | from gym import utils
18 | import os
19 |
20 |
21 |
22 |
23 | def get_asset_xml(xml_name):
24 | return os.path.join(os.path.join(os.path.dirname(__file__), 'assets'), xml_name)
25 |
26 | class TMaze(TwoDEnv, utils.EzPickle):
27 | NAME='TMaze'
28 | def __init__(self, verbose=False,change_goal=None):
29 | self.verbose = verbose
30 | self.steps = 0
31 | self.change_goal = change_goal
32 | utils.EzPickle.__init__(self)
33 | TwoDEnv.__init__(self, get_asset_xml('twod_tmaze.xml'), 2, xbounds=[-0.3,0.3], ybounds=[-0.3,0.3])
34 |
35 |
36 | def _step(self, a):
37 | self.do_simulation(a, self.frame_skip)
38 | ob = self._get_obs()
39 | pos = ob[0:2]
40 |
41 | if not self.change_goal:
42 | target = self.model.body_pos.copy()[-1][:2]
43 | else:
44 | target = self.change_goal
45 | dist_thresh = 0.1
46 |
47 |
48 |
49 |         if pos[0]>target[0]-dist_thresh and pos[0]<target[0]+dist_thresh and \
50 |                 pos[1]<target[1]+dist_thresh and pos[1]>target[1]-dist_thresh:
51 | reward = 1.
52 | else:
53 | reward = 0.
54 |
55 | self.steps += 1
56 | if self.verbose:
57 | print(pos, reward)
58 | done = self.steps >= 500 or int(reward)
59 | return ob, reward, done, np.concatenate([self.model.data.qvel]).ravel()
60 |
61 | def reset_model(self):
62 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-0.01, high=0.01)
63 | qvel = self.init_qvel + self.np_random.uniform(size=self.model.nv, low=-0.01, high=0.01)
64 | self.set_state(qpos, qvel)
65 | self.steps = 0
66 | return self._get_obs()
67 |
68 | def _get_obs(self):
69 | init_pos = self.model.body_pos.copy()[1][:2]
70 | return np.concatenate([self.model.data.qpos]).ravel() + init_pos
71 |
72 | def viewer_setup(self):
73 | v = self.viewer
74 |
75 | def seed(self, seed=None):
76 | self.np_random, seed = seeding.np_random(seed)
77 |
--------------------------------------------------------------------------------
/control/baselines/results_plotter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib
3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
4 |
5 | import matplotlib.pyplot as plt
6 | plt.rcParams['svg.fonttype'] = 'none'
7 |
8 | from baselines.common import plot_util
9 |
10 | X_TIMESTEPS = 'timesteps'
11 | X_EPISODES = 'episodes'
12 | X_WALLTIME = 'walltime_hrs'
13 | Y_REWARD = 'reward'
14 | Y_TIMESTEPS = 'timesteps'
15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
16 | EPISODES_WINDOW = 100
17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
19 | 'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
20 |
21 | def rolling_window(a, window):
22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
23 | strides = a.strides + (a.strides[-1],)
24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
25 |
26 | def window_func(x, y, window, func):
27 | yw = rolling_window(y, window)
28 | yw_func = func(yw, axis=-1)
29 | return x[window-1:], yw_func
30 |
31 | def ts2xy(ts, xaxis, yaxis):
32 | if xaxis == X_TIMESTEPS:
33 | x = np.cumsum(ts.l.values)
34 | elif xaxis == X_EPISODES:
35 | x = np.arange(len(ts))
36 | elif xaxis == X_WALLTIME:
37 | x = ts.t.values / 3600.
38 | else:
39 | raise NotImplementedError
40 | if yaxis == Y_REWARD:
41 | y = ts.r.values
42 | elif yaxis == Y_TIMESTEPS:
43 | y = ts.l.values
44 | else:
45 | raise NotImplementedError
46 | return x, y
47 |
48 | def plot_curves(xy_list, xaxis, yaxis, title):
49 | fig = plt.figure(figsize=(8,2))
50 | maxx = max(xy[0][-1] for xy in xy_list)
51 | minx = 0
52 | for (i, (x, y)) in enumerate(xy_list):
53 | color = COLORS[i % len(COLORS)]
54 | plt.scatter(x, y, s=2)
55 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes
56 | plt.plot(x, y_mean, color=color)
57 | plt.xlim(minx, maxx)
58 | plt.title(title)
59 | plt.xlabel(xaxis)
60 | plt.ylabel(yaxis)
61 | plt.tight_layout()
62 | fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout())
63 | plt.grid(True)
64 |
65 |
66 | def split_by_task(taskpath):
67 | return taskpath['dirname'].split('/')[-1].split('-')[0]
68 |
69 | def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task):
70 | results = plot_util.load_results(dirs)
71 | plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6))
72 |
73 | # Example usage in jupyter-notebook
74 | # from baselines.results_plotter import plot_results
75 | # %matplotlib inline
76 | # plot_results("./log")
77 | # Here ./log is a directory containing the monitor.csv files
78 |
79 | def main():
80 | import argparse
81 | import os
82 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
83 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
84 | parser.add_argument('--num_timesteps', type=int, default=int(10e6))
85 |     parser.add_argument('--xaxis', help = 'Variable on X-axis', default = X_TIMESTEPS)
86 |     parser.add_argument('--yaxis', help = 'Variable on Y-axis', default = Y_REWARD)
87 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
88 | args = parser.parse_args()
89 | args.dirs = [os.path.abspath(dir) for dir in args.dirs]
90 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.yaxis, args.task_name)
91 | plt.show()
92 |
93 | if __name__ == '__main__':
94 | main()
95 |
--------------------------------------------------------------------------------
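A small numeric sketch (not part of the repo) of rolling_window and window_func, which produce the moving-average curve drawn by plot_curves:

import numpy as np
# note: importing results_plotter also imports matplotlib (TkAgg backend)
from baselines.results_plotter import rolling_window, window_func

y = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print(rolling_window(y, 3))                          # [[1. 2. 3.], [2. 3. 4.], [3. 4. 5.]]
x, y_mean = window_func(np.arange(5), y, 3, np.mean)
print(x, y_mean)                                     # [2 3 4] [2. 3. 4.]

--------------------------------------------------------------------------------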
/control/data/cartpole.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/data/cartpole.gif
--------------------------------------------------------------------------------
/control/data/fetchPickAndPlaceContrast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/data/fetchPickAndPlaceContrast.png
--------------------------------------------------------------------------------
/control/data/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/control/data/logo.jpg
--------------------------------------------------------------------------------
/control/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | select = F,E999,W291,W293
3 | exclude =
4 | .git,
5 | __pycache__,
6 | baselines/ppo1,
7 | baselines/bench,
8 |
--------------------------------------------------------------------------------
/control/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | from setuptools import setup, find_packages
3 | import sys
4 |
5 | if sys.version_info.major != 3:
6 | print('This Python is only compatible with Python 3, but you are running '
7 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major))
8 |
9 |
10 | extras = {
11 | 'test': [
12 | 'filelock',
13 | 'pytest',
14 | 'pytest-forked',
15 | 'atari-py'
16 | ],
17 | 'bullet': [
18 | 'pybullet',
19 | ],
20 | 'mpi': [
21 | 'mpi4py'
22 | ]
23 | }
24 |
25 | all_deps = []
26 | for group_name in extras:
27 | all_deps += extras[group_name]
28 |
29 | extras['all'] = all_deps
30 |
31 | setup(name='baselines',
32 | packages=[package for package in find_packages()
33 | if package.startswith('baselines')],
34 | install_requires=[
35 | 'gym',
36 | 'scipy',
37 | 'tqdm',
38 | 'joblib',
39 | 'dill',
40 | 'progressbar2',
41 | 'cloudpickle',
42 | 'click',
43 | 'opencv-python'
44 | ],
45 | extras_require=extras,
46 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
47 | author='OpenAI',
48 | url='https://github.com/openai/baselines',
49 | author_email='gym@openai.com',
50 | version='0.1.5')
51 |
52 |
53 | # ensure there is some tensorflow build with version above 1.4
54 | import pkg_resources
55 | tf_pkg = None
56 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']:
57 | try:
58 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name)
59 | except pkg_resources.DistributionNotFound:
60 | pass
61 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4'
62 | from distutils.version import LooseVersion
63 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0')
64 |
--------------------------------------------------------------------------------
/launcher_miniworld.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | main_lr=(1e-4) #(1e-4 3e-4 7e-4 5e-4)
4 | int_lr=(9e-5) #(1e-4 3e-3 8e-4 8e-5 5e-4 3e-4 9e-5) #(3e-3 8e-5 8e-4)
5 | seed=(0)
6 | port=({4000..4020})
7 | envname="MiniWorld-OneRoom-v0" #"MiniWorld-PickupObjs-v0" #MiniWorld-PutNext-v0
8 | numoption=2
9 |
10 | count=0
11 | for _main_lr in ${main_lr[@]}
12 | do
13 | for _int_lr in ${int_lr[@]}
14 | do
15 | for _seed in ${seed[@]}
16 | do
17 | if [ -f temprun.sh ] ; then
18 | rm temprun.sh
19 | fi
20 |
21 | echo "#!/bin/bash" >> temprun.sh
22 | echo "#SBATCH --account=addccaccounthere" >> temprun.sh
23 | echo "#SBATCH --output=\"/scratch/username/slurm-%j.out\"" >> temprun.sh
24 | echo "#SBATCH --gres=gpu:1" >> temprun.sh
25 | echo "#SBATCH --mem=30G" >> temprun.sh
26 | echo "#SBATCH --time=10:00:00" >> temprun.sh
27 | echo "source $HOME/intf/bin/activate" >> temprun.sh
28 | echo "cd $HOME/ioc/miniworld/baselines/ppoc_int/" >> temprun.sh
29 | k="xvfb-run -n "${port[$count]}" -s \"-screen 0 1024x768x24 -ac +extension GLX +render -noreset\" python run_miniw.py --env "$envname" --seed $_seed --opt $numoption --saves --mainlr $_main_lr --intlr $_int_lr --switch --wsaves"
30 | echo $k >> temprun.sh
31 | echo $k
32 | eval "sbatch temprun.sh"
33 | rm temprun.sh
34 | count=$((count + 1))
35 | done
36 | done
37 | done
38 |
--------------------------------------------------------------------------------
/launcher_mujoco.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | seed=(0 1 2 3 4)
4 | mainlr=(1e-4)
5 | #intfclr=(1e-4 3e-4 5e-4 7e-4 9e-4)
6 | intfclr=(5e-4)
7 | #piolr=(7e-4 9e-4 3e-4 5e-4)
8 | piolr=(3e-4)
9 |
10 | port=($(seq 4000 1 4100))
11 |
12 | envname="HalfCheetahDir-v1"
13 |
14 | count=0
15 |
16 | for _piolr in ${piolr[@]}
17 | do
18 | for _intfclr in ${intfclr[@]}
19 | do
20 | for _mainlr in ${mainlr[@]}
21 | do
22 | for _seed in ${seed[@]}
23 | do
24 | if [ -f temprun.sh ] ; then
25 | rm temprun.sh
26 | fi
27 | echo "#!/bin/bash" >> temprun.sh
28 | echo "#SBATCH --account=addaccounthere" >> temprun.sh
29 | echo "#SBATCH --output=\"/scratch/username/maml/Maml_seed${_seed}_mainlr${_mainlr}_intfclr_${_intfclr}_piolr_${_piolr}-%j.out\"" >> temprun.sh
30 | echo "#SBATCH --job-name=Maml_seed${_seed}_mainlr${_mainlr}_intfclr_${_intfclr}_piolr_${_piolr}" >> temprun.sh
31 | echo "#SBATCH --gres=gpu:0" >> temprun.sh
32 | echo "#SBATCH --mem=5G" >> temprun.sh
33 | echo "#SBATCH --time=1:00:00" >> temprun.sh
34 | echo "source $HOME/miniconda3/etc/profile.d/conda.sh" >> temprun.sh
35 | echo "conda activate intfc" >> temprun.sh
36 | echo "cd $HOME/ioc/control/baselines/ppoc_int/" >> temprun.sh
37 | k="xvfb-run -n "${port[$count]}" -s \"-screen 0 1024x768x24 -ac +extension GLX +render -noreset\" python run_mujoco.py --env "$envname" --saves --opt 2 --seed ${_seed} --mainlr ${_mainlr} --intlr ${_intfclr} --piolr ${_piolr} --switch --wsaves"
38 | echo $k >> temprun.sh
39 | echo $k
40 | eval "sbatch temprun.sh"
41 | rm temprun.sh
42 | count=$((count + 1))
43 | done
44 | done
45 | done
46 | done
47 |
--------------------------------------------------------------------------------
/miniworld/.benchmark_pattern:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/miniworld/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.pyc
3 | *.pkl
4 | *.py~
5 | .pytest_cache
6 | .DS_Store
7 | .idea
8 |
9 | # Setuptools distribution and build folders.
10 | /dist/
11 | /build
12 | keys/
13 |
14 | # Virtualenv
15 | /env
16 |
17 |
18 | *.sublime-project
19 | *.sublime-workspace
20 |
21 | .idea
22 |
23 | logs/
24 |
25 | .ipynb_checkpoints
26 | ghostdriver.log
27 |
28 | htmlcov
29 |
30 | junk
31 | src
32 |
33 | *.egg-info
34 | .cache
35 |
36 | MUJOCO_LOG.TXT
37 |
--------------------------------------------------------------------------------
/miniworld/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 |
5 | services:
6 | - docker
7 |
8 | install:
9 | - pip install flake8
10 | - docker build . -t baselines-test
11 |
12 | script:
13 | - flake8 . --show-source --statistics
14 | - docker run baselines-test pytest -v --forked .
15 |
--------------------------------------------------------------------------------
/miniworld/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6
2 |
3 | RUN apt-get -y update && apt-get -y install ffmpeg
4 | # RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
5 |
6 | ENV CODE_DIR /root/code
7 |
8 | COPY . $CODE_DIR/baselines
9 | WORKDIR $CODE_DIR/baselines
10 |
11 | # Clean up pycache and pyc files
12 | RUN rm -rf __pycache__ && \
13 | find . -name "*.pyc" -delete && \
14 | pip install tensorflow && \
15 | pip install -e .[test]
16 |
17 |
18 | CMD /bin/bash
19 |
--------------------------------------------------------------------------------
/miniworld/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017 OpenAI (http://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/miniworld/baselines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/__init__.py
--------------------------------------------------------------------------------
/miniworld/baselines/bench/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.bench.benchmarks import *
2 | from baselines.bench.monitor import *
3 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa F403
2 | from baselines.common.console_util import *
3 | from baselines.common.dataset import Dataset
4 | from baselines.common.math_util import *
5 | from baselines.common.misc_util import *
6 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/cg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
3 | """
  4 |     Conjugate gradient method for solving f_Ax(x) = b (see Demmel p 312)
5 | """
6 | p = b.copy()
7 | r = b.copy()
8 | x = np.zeros_like(b)
9 | rdotr = r.dot(r)
10 |
11 | fmtstr = "%10i %10.3g %10.3g"
12 | titlestr = "%10s %10s %10s"
13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm"))
14 |
15 | for i in range(cg_iters):
16 | if callback is not None:
17 | callback(x)
18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x)))
19 | z = f_Ax(p)
20 | v = rdotr / p.dot(z)
21 | x += v*p
22 | r -= v*z
23 | newrdotr = r.dot(r)
24 | mu = newrdotr/rdotr
25 | p = r + mu*p
26 |
27 | rdotr = newrdotr
28 | if rdotr < residual_tol:
29 | break
30 |
31 | if callback is not None:
32 | callback(x)
33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
34 | return x
35 |
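# Illustrative usage sketch (not part of the original file): cg() only needs a
# matrix-vector product closure, so the caller never has to form A explicitly;
# here a small symmetric positive-definite system is solved and checked.
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = cg(lambda p: A.dot(p), b, cg_iters=10)
assert np.allclose(x, np.linalg.solve(A, b), atol=1e-6)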
--------------------------------------------------------------------------------
/miniworld/baselines/common/console_util.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from contextlib import contextmanager
3 | import numpy as np
4 | import time
5 | import shlex
6 | import subprocess
7 |
8 | # ================================================================
9 | # Misc
10 | # ================================================================
11 |
12 | def fmt_row(width, row, header=False):
13 | out = " | ".join(fmt_item(x, width) for x in row)
14 | if header: out = out + "\n" + "-"*len(out)
15 | return out
16 |
17 | def fmt_item(x, l):
18 | if isinstance(x, np.ndarray):
19 | assert x.ndim==0
20 | x = x.item()
21 | if isinstance(x, (float, np.float32, np.float64)):
22 | v = abs(x)
23 | if (v < 1e-4 or v > 1e+4) and v > 0:
24 | rep = "%7.2e" % x
25 | else:
26 | rep = "%7.5f" % x
27 | else: rep = str(x)
28 | return " "*(l - len(rep)) + rep
29 |
30 | color2num = dict(
31 | gray=30,
32 | red=31,
33 | green=32,
34 | yellow=33,
35 | blue=34,
36 | magenta=35,
37 | cyan=36,
38 | white=37,
39 | crimson=38
40 | )
41 |
42 | def colorize(string, color='green', bold=False, highlight=False):
43 | attr = []
44 | num = color2num[color]
45 | if highlight: num += 10
46 | attr.append(str(num))
47 | if bold: attr.append('1')
48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
49 |
50 | def print_cmd(cmd, dry=False):
51 | if isinstance(cmd, str): # for shell=True
52 | pass
53 | else:
54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd)
55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd))
56 |
57 |
58 | def get_git_commit(cwd=None):
59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8')
60 |
61 | def get_git_commit_message(cwd=None):
62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8')
63 |
64 | def ccap(cmd, dry=False, env=None, **kwargs):
65 | print_cmd(cmd, dry)
66 | if not dry:
67 | subprocess.check_call(cmd, env=env, **kwargs)
68 |
69 |
70 | MESSAGE_DEPTH = 0
71 |
72 | @contextmanager
73 | def timed(msg):
74 | global MESSAGE_DEPTH #pylint: disable=W0603
75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
76 | tstart = time.time()
77 | MESSAGE_DEPTH += 1
78 | yield
79 | MESSAGE_DEPTH -= 1
80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta'))
81 |
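# Illustrative usage sketch (not part of the original file): fmt_row builds
# fixed-width table rows, colorize wraps text in ANSI escape codes, and timed
# reports wall-clock time for a block.
print(fmt_row(10, ["iter", "loss"], header=True))
print(fmt_row(10, [1, 0.12345]))
print(colorize("done", color='green', bold=True))
with timed("sleeping"):
    time.sleep(0.1)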
--------------------------------------------------------------------------------
/miniworld/baselines/common/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Dataset(object):
4 | def __init__(self, data_map, deterministic=False, shuffle=True):
5 | self.data_map = data_map
6 | self.deterministic = deterministic
7 | self.enable_shuffle = shuffle
8 | self.n = next(iter(data_map.values())).shape[0]
9 | self._next_id = 0
10 | self.shuffle()
11 |
12 | def shuffle(self):
13 | if self.deterministic:
14 | return
15 | perm = np.arange(self.n)
16 | np.random.shuffle(perm)
17 |
18 | for key in self.data_map:
19 | self.data_map[key] = self.data_map[key][perm]
20 |
21 | self._next_id = 0
22 |
23 | def next_batch(self, batch_size):
24 | if self._next_id >= self.n and self.enable_shuffle:
25 | self.shuffle()
26 |
27 | cur_id = self._next_id
28 | cur_batch_size = min(batch_size, self.n - self._next_id)
29 | self._next_id += cur_batch_size
30 |
31 | data_map = dict()
32 | for key in self.data_map:
33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size]
34 | return data_map
35 |
36 | def iterate_once(self, batch_size):
37 | if self.enable_shuffle: self.shuffle()
38 |
39 | while self._next_id <= self.n - batch_size:
40 | yield self.next_batch(batch_size)
41 | self._next_id = 0
42 |
43 | def subset(self, num_elements, deterministic=True):
44 | data_map = dict()
45 | for key in self.data_map:
46 | data_map[key] = self.data_map[key][:num_elements]
47 | return Dataset(data_map, deterministic)
48 |
49 |
50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True):
51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
52 | arrays = tuple(map(np.asarray, arrays))
53 | n = arrays[0].shape[0]
54 | assert all(a.shape[0] == n for a in arrays[1:])
55 | inds = np.arange(n)
56 | if shuffle: np.random.shuffle(inds)
57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
58 | for batch_inds in np.array_split(inds, sections):
59 | if include_final_partial_batch or len(batch_inds) == batch_size:
60 | yield tuple(a[batch_inds] for a in arrays)
61 |
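# Illustrative usage sketch (not part of the original file): iterbatches walks
# two aligned arrays in minibatches of 2, keeping the final partial batch.
xs = np.arange(5)
ys = np.arange(5) * 10
for bx, by in iterbatches((xs, ys), batch_size=2, shuffle=False):
    print(bx, by)   # [0 1] [ 0 10], then [2 3] [20 30], then [4] [40]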
--------------------------------------------------------------------------------
/miniworld/baselines/common/input.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from gym.spaces import Discrete, Box, MultiDiscrete
4 |
5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'):
6 | '''
  7 |     Create a placeholder, sized appropriately for the observation space, to feed observations into
8 |
9 | Parameters:
10 | ----------
11 |
12 | ob_space: gym.Space observation space
13 |
14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases.
15 |
16 | name: str name of the placeholder
17 |
18 | Returns:
19 | -------
20 |
21 | tensorflow placeholder tensor
22 | '''
23 |
24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \
 25 |         'Can only deal with Discrete, Box, and MultiDiscrete observation spaces for now'
26 |
27 | dtype = ob_space.dtype
28 | if dtype == np.int8:
29 | dtype = np.uint8
30 |
31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name)
32 |
33 |
34 | def observation_input(ob_space, batch_size=None, name='Ob'):
35 | '''
 36 |     Create a placeholder, sized appropriately for the observation space, to feed observations into, and add an
 37 |     input encoder of the appropriate type.
38 | '''
39 |
40 | placeholder = observation_placeholder(ob_space, batch_size, name)
41 | return placeholder, encode_observation(ob_space, placeholder)
42 |
43 | def encode_observation(ob_space, placeholder):
44 | '''
45 | Encode input in the way that is appropriate to the observation space
46 |
47 | Parameters:
48 | ----------
49 |
50 | ob_space: gym.Space observation space
51 |
52 | placeholder: tf.placeholder observation input placeholder
53 | '''
54 | if isinstance(ob_space, Discrete):
55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n))
56 | elif isinstance(ob_space, Box):
57 | return tf.to_float(placeholder)
58 | elif isinstance(ob_space, MultiDiscrete):
59 | placeholder = tf.cast(placeholder, tf.int32)
60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])]
61 | return tf.concat(one_hots, axis=-1)
62 | else:
63 | raise NotImplementedError
64 |
65 |
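# Illustrative usage sketch (not part of the original file), assuming a TF1-style
# default graph: build the placeholder/encoder pair for an 84x84x4 uint8 image space.
space = Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8)
ph, processed = observation_input(space)
# ph is a uint8 placeholder of shape (None, 84, 84, 4); processed is its float32 cast.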
--------------------------------------------------------------------------------
/miniworld/baselines/common/math_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 |
4 |
5 | def discount(x, gamma):
6 | """
7 | computes discounted sums along 0th dimension of x.
8 |
9 | inputs
10 | ------
11 | x: ndarray
12 | gamma: float
13 |
14 | outputs
15 | -------
16 | y: ndarray with same shape as x, satisfying
17 |
18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
19 | where k = len(x) - t - 1
20 |
21 | """
22 | assert x.ndim >= 1
23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
24 |
25 | def explained_variance(ypred,y):
26 | """
27 | Computes fraction of variance that ypred explains about y.
28 | Returns 1 - Var[y-ypred] / Var[y]
29 |
30 | interpretation:
31 | ev=0 => might as well have predicted zero
32 | ev=1 => perfect prediction
33 | ev<0 => worse than just predicting zero
34 |
35 | """
36 | assert y.ndim == 1 and ypred.ndim == 1
37 | vary = np.var(y)
38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary
39 |
40 | def explained_variance_2d(ypred, y):
41 | assert y.ndim == 2 and ypred.ndim == 2
42 | vary = np.var(y, axis=0)
 43 |     out = 1 - np.var(y-ypred, axis=0)/vary  # per-column variance, matching vary
44 | out[vary < 1e-10] = 0
45 | return out
46 |
47 | def ncc(ypred, y):
48 | return np.corrcoef(ypred, y)[1,0]
49 |
50 | def flatten_arrays(arrs):
51 | return np.concatenate([arr.flat for arr in arrs])
52 |
53 | def unflatten_vector(vec, shapes):
54 | i=0
55 | arrs = []
56 | for shape in shapes:
57 | size = np.prod(shape)
58 | arr = vec[i:i+size].reshape(shape)
59 | arrs.append(arr)
60 | i += size
61 | return arrs
62 |
63 | def discount_with_boundaries(X, New, gamma):
64 | """
65 | X: 2d array of floats, time x features
66 | New: 2d array of bools, indicating when a new episode has started
67 | """
68 | Y = np.zeros_like(X)
69 | T = X.shape[0]
70 | Y[T-1] = X[T-1]
71 | for t in range(T-2, -1, -1):
72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1])
73 | return Y
74 |
75 | def test_discount_with_boundaries():
76 | gamma=0.9
77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
78 | starts = [1.0, 0.0, 0.0, 1.0]
79 | y = discount_with_boundaries(x, starts, gamma)
80 | assert np.allclose(y, [
81 | 1 + gamma * 2 + gamma**2 * 3,
82 | 2 + gamma * 3,
83 | 3,
84 | 4
85 | ])
86 |
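# Illustrative usage sketch (not part of the original file): discounted returns
# along the time (0th) axis, y[t] = x[t] + gamma * y[t+1].
x = np.array([1.0, 1.0, 1.0])
print(discount(x, 0.9))   # -> [2.71, 1.9, 1.0]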
--------------------------------------------------------------------------------
/miniworld/baselines/common/mpi_adam.py:
--------------------------------------------------------------------------------
1 | import baselines.common.tf_util as U
2 | import tensorflow as tf
3 | import numpy as np
4 | try:
5 | from mpi4py import MPI
6 | except ImportError:
7 | MPI = None
8 |
9 |
10 | class MpiAdam(object):
11 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
12 | self.var_list = var_list
13 | self.beta1 = beta1
14 | self.beta2 = beta2
15 | self.epsilon = epsilon
16 | self.scale_grad_by_procs = scale_grad_by_procs
17 | size = sum(U.numel(v) for v in var_list)
18 | self.m = np.zeros(size, 'float32')
19 | self.v = np.zeros(size, 'float32')
20 | self.t = 0
21 | self.setfromflat = U.SetFromFlat(var_list)
22 | self.getflat = U.GetFlat(var_list)
23 | self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm
24 |
25 | def update(self, localg, stepsize):
26 | if self.t % 100 == 0:
27 | self.check_synced()
28 | localg = localg.astype('float32')
29 | if self.comm is not None:
30 | globalg = np.zeros_like(localg)
31 | self.comm.Allreduce(localg, globalg, op=MPI.SUM)
32 | if self.scale_grad_by_procs:
33 | globalg /= self.comm.Get_size()
34 | else:
35 | globalg = np.copy(localg)
36 |
37 | self.t += 1
38 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t)
39 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
40 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
41 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
42 | self.setfromflat(self.getflat() + step)
43 |
44 | def sync(self):
45 | if self.comm is None:
46 | return
47 | theta = self.getflat()
48 | self.comm.Bcast(theta, root=0)
49 | self.setfromflat(theta)
50 |
51 | def check_synced(self):
52 | if self.comm is None:
53 | return
54 | if self.comm.Get_rank() == 0: # this is root
55 | theta = self.getflat()
56 | self.comm.Bcast(theta, root=0)
57 | else:
58 | thetalocal = self.getflat()
59 | thetaroot = np.empty_like(thetalocal)
60 | self.comm.Bcast(thetaroot, root=0)
61 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)
62 |
63 | @U.in_session
64 | def test_MpiAdam():
65 | np.random.seed(0)
66 | tf.set_random_seed(0)
67 |
68 | a = tf.Variable(np.random.randn(3).astype('float32'))
69 | b = tf.Variable(np.random.randn(2,5).astype('float32'))
70 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
71 |
72 | stepsize = 1e-2
73 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
74 | do_update = U.function([], loss, updates=[update_op])
75 |
76 | tf.get_default_session().run(tf.global_variables_initializer())
77 | losslist_ref = []
78 | for i in range(10):
79 | l = do_update()
80 | print(i, l)
81 | losslist_ref.append(l)
82 |
83 |
84 |
85 | tf.set_random_seed(0)
86 | tf.get_default_session().run(tf.global_variables_initializer())
87 |
88 | var_list = [a,b]
89 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
90 | adam = MpiAdam(var_list)
91 |
92 | losslist_test = []
93 | for i in range(10):
94 | l,g = lossandgrad()
95 | adam.update(g, stepsize)
96 | print(i,l)
97 | losslist_test.append(l)
98 |
99 | np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
100 |
101 |
102 | if __name__ == '__main__':
103 | test_MpiAdam()
104 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/mpi_adam_optimizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from mpi4py import MPI
4 |
5 | class MpiAdamOptimizer(tf.train.AdamOptimizer):
6 | """Adam optimizer that averages gradients across mpi processes."""
7 | def __init__(self, comm, **kwargs):
8 | self.comm = comm
9 | tf.train.AdamOptimizer.__init__(self, **kwargs)
10 | def compute_gradients(self, loss, var_list, **kwargs):
11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs)
12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0)
14 | shapes = [v.shape.as_list() for g, v in grads_and_vars]
15 | sizes = [int(np.prod(s)) for s in shapes]
16 |
17 | num_tasks = self.comm.Get_size()
18 | buf = np.zeros(sum(sizes), np.float32)
19 |
20 | def _collect_grads(flat_grad):
21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
22 | np.divide(buf, float(num_tasks), out=buf)
23 | return buf
24 |
25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
26 | avg_flat_grad.set_shape(flat_grad.shape)
27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
29 | for g, (_, v) in zip(avg_grads, grads_and_vars)]
30 |
31 | return avg_grads_and_vars
32 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/mpi_fork.py:
--------------------------------------------------------------------------------
1 | import os, subprocess, sys
2 |
3 | def mpi_fork(n, bind_to_core=False):
  4 |     """Re-launches the current script with n MPI workers.
  5 |     Returns "parent" for the original parent, "child" for MPI children.
6 | """
7 | if n<=1:
8 | return "child"
9 | if os.getenv("IN_MPI") is None:
10 | env = os.environ.copy()
11 | env.update(
12 | MKL_NUM_THREADS="1",
13 | OMP_NUM_THREADS="1",
14 | IN_MPI="1"
15 | )
16 | args = ["mpirun", "-np", str(n)]
17 | if bind_to_core:
18 | args += ["-bind-to", "core"]
19 | args += [sys.executable] + sys.argv
20 | subprocess.check_call(args, env=env)
21 | return "parent"
22 | else:
23 | return "child"
24 |
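# Illustrative usage sketch (not part of the original file), assuming mpirun is
# on PATH: re-launch the calling script with 4 workers; the parent exits once
# the spawned children have finished.
if mpi_fork(4) == "parent":
    sys.exit(0)
print("hello from one of the MPI workers")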
--------------------------------------------------------------------------------
/miniworld/baselines/common/mpi_moments.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import numpy as np
3 | from baselines.common import zipsame
4 |
5 |
6 | def mpi_mean(x, axis=0, comm=None, keepdims=False):
7 | x = np.asarray(x)
8 | assert x.ndim > 0
9 | if comm is None: comm = MPI.COMM_WORLD
10 | xsum = x.sum(axis=axis, keepdims=keepdims)
11 | n = xsum.size
12 | localsum = np.zeros(n+1, x.dtype)
13 | localsum[:n] = xsum.ravel()
14 | localsum[n] = x.shape[axis]
15 | globalsum = np.zeros_like(localsum)
16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM)
17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
18 |
19 | def mpi_moments(x, axis=0, comm=None, keepdims=False):
20 | x = np.asarray(x)
21 | assert x.ndim > 0
22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
23 | sqdiffs = np.square(x - mean)
24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
25 | assert count1 == count
26 | std = np.sqrt(meansqdiff)
27 | if not keepdims:
28 | newshape = mean.shape[:axis] + mean.shape[axis+1:]
29 | mean = mean.reshape(newshape)
30 | std = std.reshape(newshape)
31 | return mean, std, count
32 |
33 |
34 | def test_runningmeanstd():
35 | import subprocess
36 | subprocess.check_call(['mpirun', '-np', '3',
37 | 'python','-c',
38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
39 |
40 | def _helper_runningmeanstd():
41 | comm = MPI.COMM_WORLD
42 | np.random.seed(0)
43 | for (triple,axis) in [
44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0),
45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0),
46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1),
47 | ]:
48 |
49 |
50 | x = np.concatenate(triple, axis=axis)
51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
52 |
53 |
54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis)
55 |
56 | for (a1,a2) in zipsame(ms1, ms2):
57 | print(a1, a2)
58 | assert np.allclose(a1, a2)
59 | print("ok!")
60 |
61 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/mpi_util.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from mpi4py import MPI
3 | import os, numpy as np
4 | import platform
5 | import shutil
6 | import subprocess
7 |
8 | def sync_from_root(sess, variables, comm=None):
9 | """
10 | Send the root node's parameters to every worker.
11 | Arguments:
12 | sess: the TensorFlow session.
13 | variables: all parameter variables including optimizer's
14 | """
15 | if comm is None: comm = MPI.COMM_WORLD
16 | rank = comm.Get_rank()
17 | for var in variables:
18 | if rank == 0:
19 | comm.Bcast(sess.run(var))
20 | else:
21 | import tensorflow as tf
22 | returned_var = np.empty(var.shape, dtype='float32')
23 | comm.Bcast(returned_var)
24 | sess.run(tf.assign(var, returned_var))
25 |
26 | def gpu_count():
27 | """
28 | Count the GPUs on this machine.
29 | """
30 | if shutil.which('nvidia-smi') is None:
31 | return 0
32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv'])
33 | return max(0, len(output.split(b'\n')) - 2)
34 |
35 | def setup_mpi_gpus():
36 | """
37 | Set CUDA_VISIBLE_DEVICES using MPI.
38 | """
39 | num_gpus = gpu_count()
40 | if num_gpus == 0:
41 | return
42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD)
43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus)
44 |
45 | def get_local_rank_size(comm):
46 | """
 47 |     Returns the rank of this process on its machine, and the process count there.
48 | The processes on a given machine will be assigned ranks
49 | 0, 1, 2, ..., N-1,
50 | where N is the number of processes on this machine.
51 |
52 | Useful if you want to assign one gpu per machine
53 | """
54 | this_node = platform.node()
55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node))
56 | node2rankssofar = defaultdict(int)
57 | local_rank = None
58 | for (rank, node) in ranks_nodes:
59 | if rank == comm.Get_rank():
60 | local_rank = node2rankssofar[node]
61 | node2rankssofar[node] += 1
62 | assert local_rank is not None
63 | return local_rank, node2rankssofar[this_node]
64 |
65 | def share_file(comm, path):
66 | """
67 | Copies the file from rank 0 to all other ranks
68 | Puts it in the same place on all machines
69 | """
70 | localrank, _ = get_local_rank_size(comm)
71 | if comm.Get_rank() == 0:
72 | with open(path, 'rb') as fh:
73 | data = fh.read()
74 | comm.bcast(data)
75 | else:
76 | data = comm.bcast(None)
77 | if localrank == 0:
78 | os.makedirs(os.path.dirname(path), exist_ok=True)
79 | with open(path, 'wb') as fh:
80 | fh.write(data)
81 | comm.Barrier()
82 |
83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True):
84 | if comm is None: return d
85 | alldicts = comm.allgather(d)
86 | size = comm.size
87 | k2li = defaultdict(list)
88 | for d in alldicts:
89 | for (k,v) in d.items():
90 | k2li[k].append(v)
91 | result = {}
92 | for (k,li) in k2li.items():
93 | if assert_all_have_data:
94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k)
95 | if op=='mean':
96 | result[k] = np.mean(li, axis=0)
97 | elif op=='sum':
98 | result[k] = np.sum(li, axis=0)
99 | else:
100 | assert 0, op
101 | return result
102 |
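# Illustrative usage sketch (not part of the original file): run under
# `mpirun -np 2 python this_script.py`; every rank receives the mean of the
# per-rank values gathered by dict_gather.
comm = MPI.COMM_WORLD
stats = dict_gather(comm, {'loss': float(comm.Get_rank())}, op='mean')
print(stats)   # with 2 ranks: {'loss': 0.5} on both ranks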
--------------------------------------------------------------------------------
/miniworld/baselines/common/runners.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import ABC, abstractmethod
3 |
4 | class AbstractEnvRunner(ABC):
5 | def __init__(self, *, env, model, nsteps):
6 | self.env = env
7 | self.model = model
8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
11 | self.obs[:] = env.reset()
12 | self.nsteps = nsteps
13 | self.states = model.initial_state
14 | self.dones = [False for _ in range(nenv)]
15 |
16 | @abstractmethod
17 | def run(self):
18 | raise NotImplementedError
19 |
20 |
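# Illustrative subclass sketch (not part of the original file): concrete runners
# implement run(), stepping the (vectorized) env for self.nsteps and returning
# whatever rollout data the algorithm needs; `model` only needs an
# `initial_state` attribute here.
class RandomActionRunner(AbstractEnvRunner):
    def run(self):
        mb_obs, mb_rewards = [], []
        for _ in range(self.nsteps):
            actions = [self.env.action_space.sample() for _ in range(self.nenv)]
            self.obs[:], rewards, self.dones, _ = self.env.step(actions)
            mb_obs.append(self.obs.copy())
            mb_rewards.append(rewards)
        return np.asarray(mb_obs), np.asarray(mb_rewards)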
--------------------------------------------------------------------------------
/miniworld/baselines/common/schedules.py:
--------------------------------------------------------------------------------
1 | """This file is used for specifying various schedules that evolve over
2 | time throughout the execution of the algorithm, such as:
3 | - learning rate for the optimizer
4 | - exploration epsilon for the epsilon greedy exploration strategy
5 | - beta parameter for beta parameter in prioritized replay
6 |
7 | Each schedule has a function `value(t)` which returns the current value
8 | of the parameter given the timestep t of the optimization procedure.
9 | """
10 |
11 |
12 | class Schedule(object):
13 | def value(self, t):
14 | """Value of the schedule at time t"""
15 | raise NotImplementedError()
16 |
17 |
18 | class ConstantSchedule(object):
19 | def __init__(self, value):
20 | """Value remains constant over time.
21 |
22 | Parameters
23 | ----------
24 | value: float
25 | Constant value of the schedule
26 | """
27 | self._v = value
28 |
29 | def value(self, t):
30 | """See Schedule.value"""
31 | return self._v
32 |
33 |
34 | def linear_interpolation(l, r, alpha):
35 | return l + alpha * (r - l)
36 |
37 |
38 | class PiecewiseSchedule(object):
39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
40 | """Piecewise schedule.
41 |
42 | endpoints: [(int, int)]
 43 |             list of pairs `(time, value)` meaning that the schedule should output
 44 |             `value` when `t==time`. All the times must be sorted in
 45 |             increasing order. When t is between two times, e.g. `(time_a, value_a)`
 46 |             and `(time_b, value_b)`, such that `time_a <= t < time_b`, the schedule outputs
 47 |             `interpolation(value_a, value_b, alpha)` where alpha is the fraction of
 48 |             time passed between `time_a` and `time_b` at time `t`.
49 | interpolation: lambda float, float, float: float
50 | a function that takes value to the left and to the right of t according
51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to
52 | right endpoint that t has covered. See linear_interpolation for example.
53 | outside_value: float
 54 |             if the value is requested outside of all the intervals specified in
 55 |             `endpoints`, this value is returned. If None, an AssertionError is
 56 |             raised when a value outside the intervals is requested.
57 | """
58 | idxes = [e[0] for e in endpoints]
59 | assert idxes == sorted(idxes)
60 | self._interpolation = interpolation
61 | self._outside_value = outside_value
62 | self._endpoints = endpoints
63 |
64 | def value(self, t):
65 | """See Schedule.value"""
66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
67 | if l_t <= t and t < r_t:
68 | alpha = float(t - l_t) / (r_t - l_t)
69 | return self._interpolation(l, r, alpha)
70 |
71 | # t does not belong to any of the pieces, so doom.
72 | assert self._outside_value is not None
73 | return self._outside_value
74 |
75 |
76 | class LinearSchedule(object):
77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
78 | """Linear interpolation between initial_p and final_p over
79 | schedule_timesteps. After this many timesteps pass final_p is
80 | returned.
81 |
82 | Parameters
83 | ----------
84 | schedule_timesteps: int
85 | Number of timesteps for which to linearly anneal initial_p
86 | to final_p
87 | initial_p: float
88 | initial output value
89 | final_p: float
90 | final output value
91 | """
92 | self.schedule_timesteps = schedule_timesteps
93 | self.final_p = final_p
94 | self.initial_p = initial_p
95 |
96 | def value(self, t):
97 | """See Schedule.value"""
98 | fraction = min(float(t) / self.schedule_timesteps, 1.0)
99 | return self.initial_p + fraction * (self.final_p - self.initial_p)
100 |
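# Illustrative usage sketch (not part of the original file): a learning rate
# annealed linearly from 1.0 to 0.1 over the first 100 optimization steps.
lr = LinearSchedule(schedule_timesteps=100, final_p=0.1, initial_p=1.0)
print(lr.value(0))    # 1.0
print(lr.value(50))   # 0.55
print(lr.value(200))  # 0.1 (clamped once schedule_timesteps have passed)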
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/common/tests/__init__.py
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/common/tests/envs/__init__.py
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/envs/fixed_sequence_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import Env
3 | from gym.spaces import Discrete
4 |
5 |
6 | class FixedSequenceEnv(Env):
7 | def __init__(
8 | self,
9 | n_actions=10,
10 | seed=0,
11 | episode_len=100
12 | ):
13 | self.np_random = np.random.RandomState()
14 | self.np_random.seed(seed)
15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)]
16 |
17 | self.action_space = Discrete(n_actions)
18 | self.observation_space = Discrete(1)
19 |
20 | self.episode_len = episode_len
21 | self.time = 0
22 | self.reset()
23 |
24 | def reset(self):
25 | self.time = 0
26 | return 0
27 |
28 | def step(self, actions):
29 | rew = self._get_reward(actions)
30 | self._choose_next_state()
31 | done = False
32 | if self.episode_len and self.time >= self.episode_len:
33 | rew = 0
34 | done = True
35 |
36 | return 0, rew, done, {}
37 |
38 | def _choose_next_state(self):
39 | self.time += 1
40 |
41 | def _get_reward(self, actions):
42 | return 1 if actions == self.sequence[self.time] else 0
43 |
44 |
45 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/envs/identity_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import abstractmethod
3 | from gym import Env
4 | from gym.spaces import MultiDiscrete, Discrete, Box
5 |
6 |
7 | class IdentityEnv(Env):
8 | def __init__(
9 | self,
10 | episode_len=None
11 | ):
12 |
13 | self.episode_len = episode_len
14 | self.time = 0
15 | self.reset()
16 |
17 | def reset(self):
18 | self._choose_next_state()
19 | self.time = 0
20 | self.observation_space = self.action_space
21 |
22 | return self.state
23 |
24 | def step(self, actions):
25 | rew = self._get_reward(actions)
26 | self._choose_next_state()
27 | done = False
28 | if self.episode_len and self.time >= self.episode_len:
29 | rew = 0
30 | done = True
31 |
32 | return self.state, rew, done, {}
33 |
34 | def _choose_next_state(self):
35 | self.state = self.action_space.sample()
36 | self.time += 1
37 |
38 | @abstractmethod
39 | def _get_reward(self, actions):
40 | raise NotImplementedError
41 |
42 |
43 | class DiscreteIdentityEnv(IdentityEnv):
44 | def __init__(
45 | self,
46 | dim,
47 | episode_len=None,
48 | ):
49 |
50 | self.action_space = Discrete(dim)
51 | super().__init__(episode_len=episode_len)
52 |
53 | def _get_reward(self, actions):
54 | return 1 if self.state == actions else 0
55 |
56 | class MultiDiscreteIdentityEnv(IdentityEnv):
57 | def __init__(
58 | self,
59 | dims,
60 | episode_len=None,
61 | ):
62 |
63 | self.action_space = MultiDiscrete(dims)
64 | super().__init__(episode_len=episode_len)
65 |
66 | def _get_reward(self, actions):
67 | return 1 if all(self.state == actions) else 0
68 |
69 |
70 | class BoxIdentityEnv(IdentityEnv):
71 | def __init__(
72 | self,
73 | shape,
74 | episode_len=None,
75 | ):
76 |
77 | self.action_space = Box(low=-1.0, high=1.0, shape=shape)
78 | super().__init__(episode_len=episode_len)
79 |
80 | def _get_reward(self, actions):
81 | diff = actions - self.state
82 | diff = diff[:]
83 | return -0.5 * np.dot(diff, diff)
84 |
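# Illustrative usage sketch (not part of the original file): the reward is 1
# only when the action matches the current state, so echoing the observation
# back as the action is the optimal policy these test environments check for.
env = DiscreteIdentityEnv(10, episode_len=5)
ob = env.reset()
_, rew, _, _ = env.step(ob)   # rew == 1 because the action equals the old state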
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/envs/mnist_env.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import numpy as np
3 | import tempfile
4 | from gym import Env
5 | from gym.spaces import Discrete, Box
6 |
7 |
8 |
9 | class MnistEnv(Env):
10 | def __init__(
11 | self,
12 | seed=0,
13 | episode_len=None,
14 | no_images=None
15 | ):
16 | import filelock
17 | from tensorflow.examples.tutorials.mnist import input_data
 18 |         # we could use a temporary directory here with a context manager and
 19 |         # TemporaryDirectory, but then each test that uses mnist would re-download the data.
 20 |         # This way the data is not cleaned up, but we only download it once per machine.
21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
22 | with filelock.FileLock(mnist_path + '.lock'):
23 | self.mnist = input_data.read_data_sets(mnist_path)
24 |
25 | self.np_random = np.random.RandomState()
26 | self.np_random.seed(seed)
27 |
28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1))
29 | self.action_space = Discrete(10)
30 | self.episode_len = episode_len
31 | self.time = 0
32 | self.no_images = no_images
33 |
34 | self.train_mode()
35 | self.reset()
36 |
37 | def reset(self):
38 | self._choose_next_state()
39 | self.time = 0
40 |
41 | return self.state[0]
42 |
43 | def step(self, actions):
44 | rew = self._get_reward(actions)
45 | self._choose_next_state()
46 | done = False
47 | if self.episode_len and self.time >= self.episode_len:
48 | rew = 0
49 | done = True
50 |
51 | return self.state[0], rew, done, {}
52 |
53 | def train_mode(self):
54 | self.dataset = self.mnist.train
55 |
56 | def test_mode(self):
57 | self.dataset = self.mnist.test
58 |
59 | def _choose_next_state(self):
60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1
61 | index = self.np_random.randint(0, max_index)
62 | image = self.dataset.images[index].reshape(28,28,1)*255
63 | label = self.dataset.labels[index]
64 | self.state = (image, label)
65 | self.time += 1
66 |
67 | def _get_reward(self, actions):
68 | return 1 if self.state[1] == actions else 0
69 |
70 |
71 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_cartpole.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 |
7 | common_kwargs = dict(
8 | total_timesteps=30000,
9 | network='mlp',
10 | gamma=1.0,
11 | seed=0,
12 | )
13 |
14 | learn_kwargs = {
15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
16 | 'acer': dict(value_network='copy'),
17 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False),
18 | 'deepq': dict(total_timesteps=20000),
19 | 'ppo2': dict(value_network='copy'),
20 | 'trpo_mpi': {}
21 | }
22 |
23 | @pytest.mark.slow
24 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
25 | def test_cartpole(alg):
26 | '''
27 | Test if the algorithm (with an mlp policy)
28 | can learn to balance the cartpole
29 | '''
30 |
31 | kwargs = common_kwargs.copy()
32 | kwargs.update(learn_kwargs[alg])
33 |
34 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
35 | def env_fn():
36 |
37 | env = gym.make('CartPole-v0')
38 | env.seed(0)
39 | return env
40 |
41 | reward_per_episode_test(env_fn, learn_fn, 100)
42 |
43 | if __name__ == '__main__':
44 | test_cartpole('acer')
45 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_doc_examples.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | try:
3 | import mujoco_py
4 | _mujoco_present = True
5 | except BaseException:
6 | mujoco_py = None
7 | _mujoco_present = False
8 |
9 |
10 | @pytest.mark.skipif(
11 | not _mujoco_present,
12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
13 | )
14 | def test_lstm_example():
15 | import tensorflow as tf
16 | from baselines.common import policies, models, cmd_util
17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
18 |
19 | # create vectorized environment
20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])
21 |
22 | with tf.Session() as sess:
23 | # build policy based on lstm network with 128 units
24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)
25 |
26 | # initialize tensorflow variables
27 | sess.run(tf.global_variables_initializer())
28 |
29 | # prepare environment variables
30 | ob = venv.reset()
31 | state = policy.initial_state
32 | done = [False]
33 | step_counter = 0
34 |
35 | # run a single episode until the end (i.e. until done)
36 | while True:
37 | action, _, state, _ = policy.step(ob, S=state, M=done)
38 | ob, reward, done, _ = venv.step(action)
39 | step_counter += 1
40 | if done:
41 | break
42 |
43 |
44 | assert step_counter > 5
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_env_after_learn.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 | import tensorflow as tf
4 |
5 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
6 | from baselines.run import get_learn_function
7 | from baselines.common.tf_util import make_session
8 |
9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
10 |
11 | @pytest.mark.parametrize('algo', algos)
12 | def test_env_after_learn(algo):
13 | def make_env():
14 | # acktr requires too much RAM, fails on travis
15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
16 | return env
17 |
18 | make_session(make_default=True, graph=tf.Graph())
19 | env = SubprocVecEnv([make_env])
20 |
21 | learn = get_learn_function(algo)
22 |
23 | # Commenting out the following line resolves the issue, though crash happens at env.reset().
24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)
25 |
26 | env.reset()
27 | env.close()
28 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_fetchreach.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 |
7 | pytest.importorskip('mujoco_py')
8 |
9 | common_kwargs = dict(
10 | network='mlp',
11 | seed=0,
12 | )
13 |
14 | learn_kwargs = {
15 | 'her': dict(total_timesteps=2000)
16 | }
17 |
18 | @pytest.mark.slow
19 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
20 | def test_fetchreach(alg):
21 | '''
22 | Test if the algorithm (with an mlp policy)
23 | can learn the FetchReach task
24 | '''
25 |
26 | kwargs = common_kwargs.copy()
27 | kwargs.update(learn_kwargs[alg])
28 |
29 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
30 | def env_fn():
31 |
32 | env = gym.make('FetchReach-v1')
33 | env.seed(0)
34 | return env
35 |
36 | reward_per_episode_test(env_fn, learn_fn, -15)
37 |
38 | if __name__ == '__main__':
39 | test_fetchreach('her')
40 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_fixed_sequence.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv
3 |
4 | from baselines.common.tests.util import simple_test
5 | from baselines.run import get_learn_function
6 |
7 | common_kwargs = dict(
8 | seed=0,
9 | total_timesteps=50000,
10 | )
11 |
12 | learn_kwargs = {
13 | 'a2c': {},
14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),
15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)
16 | # github issue: https://github.com/openai/baselines/issues/188
17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)
18 | }
19 |
20 |
21 | alg_list = learn_kwargs.keys()
22 | rnn_list = ['lstm']
23 |
24 | @pytest.mark.slow
25 | @pytest.mark.parametrize("alg", alg_list)
26 | @pytest.mark.parametrize("rnn", rnn_list)
27 | def test_fixed_sequence(alg, rnn):
28 | '''
29 | Test if the algorithm (with a given policy)
 30 |     can learn to reproduce a fixed sequence of actions (which requires memory)
31 | '''
32 |
33 | kwargs = learn_kwargs[alg]
34 | kwargs.update(common_kwargs)
35 |
36 | episode_len = 5
37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
38 | learn = lambda e: get_learn_function(alg)(
39 | env=e,
40 | network=rnn,
41 | **kwargs
42 | )
43 |
44 | simple_test(env_fn, learn, 0.7)
45 |
46 |
47 | if __name__ == '__main__':
48 | test_fixed_sequence('ppo2', 'lstm')
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_identity.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv
3 | from baselines.run import get_learn_function
4 | from baselines.common.tests.util import simple_test
5 |
6 | common_kwargs = dict(
7 | total_timesteps=30000,
8 | network='mlp',
9 | gamma=0.9,
10 | seed=0,
11 | )
12 |
13 | learn_kwargs = {
14 | 'a2c' : {},
15 | 'acktr': {},
16 | 'deepq': {},
17 | 'ddpg': dict(layer_norm=True),
18 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0),
19 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01)
20 | }
21 |
22 |
23 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
24 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi']
25 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi']
26 |
27 | @pytest.mark.slow
28 | @pytest.mark.parametrize("alg", algos_disc)
29 | def test_discrete_identity(alg):
30 | '''
31 | Test if the algorithm (with an mlp policy)
32 | can learn an identity transformation (i.e. return observation as an action)
33 | '''
34 |
35 | kwargs = learn_kwargs[alg]
36 | kwargs.update(common_kwargs)
37 |
38 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
39 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100)
40 | simple_test(env_fn, learn_fn, 0.9)
41 |
42 | @pytest.mark.slow
43 | @pytest.mark.parametrize("alg", algos_multidisc)
44 | def test_multidiscrete_identity(alg):
45 | '''
46 | Test if the algorithm (with an mlp policy)
47 | can learn an identity transformation (i.e. return observation as an action)
48 | '''
49 |
50 | kwargs = learn_kwargs[alg]
51 | kwargs.update(common_kwargs)
52 |
53 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
54 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100)
55 | simple_test(env_fn, learn_fn, 0.9)
56 |
57 | @pytest.mark.slow
58 | @pytest.mark.parametrize("alg", algos_cont)
59 | def test_continuous_identity(alg):
60 | '''
61 | Test if the algorithm (with an mlp policy)
62 | can learn an identity transformation (i.e. return observation as an action)
63 | to a required precision
64 | '''
65 |
66 | kwargs = learn_kwargs[alg]
67 | kwargs.update(common_kwargs)
68 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
69 |
70 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100)
71 | simple_test(env_fn, learn_fn, -0.1)
72 |
73 | if __name__ == '__main__':
74 | test_multidiscrete_identity('acktr')
75 |
76 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_mnist.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | # from baselines.acer import acer_simple as acer
4 | from baselines.common.tests.envs.mnist_env import MnistEnv
5 | from baselines.common.tests.util import simple_test
6 | from baselines.run import get_learn_function
7 |
8 |
9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
10 | # GitHub issue https://github.com/openai/baselines/issues/189
11 | common_kwargs = {
12 | 'seed': 0,
13 | 'network':'cnn',
14 | 'gamma':0.9,
15 | 'pad':'SAME'
16 | }
17 |
18 | learn_args = {
19 | 'a2c': dict(total_timesteps=50000),
20 | 'acer': dict(total_timesteps=20000),
21 | 'deepq': dict(total_timesteps=5000),
22 | 'acktr': dict(total_timesteps=30000),
23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0),
24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
25 | }
26 |
27 |
28 | #tests pass, but are too slow on travis. Same algorithms are covered
29 | # by other tests with less compute-hungry nn's and by benchmarks
30 | @pytest.mark.skip
31 | @pytest.mark.slow
32 | @pytest.mark.parametrize("alg", learn_args.keys())
33 | def test_mnist(alg):
34 | '''
35 | Test if the algorithm can learn to classify MNIST digits.
36 | Uses CNN policy.
37 | '''
38 |
39 | learn_kwargs = learn_args[alg]
40 | learn_kwargs.update(common_kwargs)
41 |
42 | learn = get_learn_function(alg)
43 | learn_fn = lambda e: learn(env=e, **learn_kwargs)
44 | env_fn = lambda: MnistEnv(seed=0, episode_len=100)
45 |
46 | simple_test(env_fn, learn_fn, 0.6)
47 |
48 | if __name__ == '__main__':
49 | test_mnist('acer')
50 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_schedules.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule
4 |
5 |
6 | def test_piecewise_schedule():
7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500)
8 |
9 | assert np.isclose(ps.value(-10), 500)
10 | assert np.isclose(ps.value(0), 150)
11 | assert np.isclose(ps.value(5), 200)
12 | assert np.isclose(ps.value(9), 80)
13 | assert np.isclose(ps.value(50), 50)
14 | assert np.isclose(ps.value(80), 50)
15 | assert np.isclose(ps.value(150), 0)
16 | assert np.isclose(ps.value(175), -25)
17 | assert np.isclose(ps.value(201), 500)
18 | assert np.isclose(ps.value(500), 500)
19 |
20 | assert np.isclose(ps.value(200 - 1e-10), -50)
21 |
22 |
23 | def test_constant_schedule():
24 | cs = ConstantSchedule(5)
25 | for i in range(-100, 100):
26 | assert np.isclose(cs.value(i), 5)
27 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_segment_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
4 |
5 |
6 | def test_tree_set():
7 | tree = SumSegmentTree(4)
8 |
9 | tree[2] = 1.0
10 | tree[3] = 3.0
11 |
12 | assert np.isclose(tree.sum(), 4.0)
13 | assert np.isclose(tree.sum(0, 2), 0.0)
14 | assert np.isclose(tree.sum(0, 3), 1.0)
15 | assert np.isclose(tree.sum(2, 3), 1.0)
16 | assert np.isclose(tree.sum(2, -1), 1.0)
17 | assert np.isclose(tree.sum(2, 4), 4.0)
18 |
19 |
20 | def test_tree_set_overlap():
21 | tree = SumSegmentTree(4)
22 |
23 | tree[2] = 1.0
24 | tree[2] = 3.0
25 |
26 | assert np.isclose(tree.sum(), 3.0)
27 | assert np.isclose(tree.sum(2, 3), 3.0)
28 | assert np.isclose(tree.sum(2, -1), 3.0)
29 | assert np.isclose(tree.sum(2, 4), 3.0)
30 | assert np.isclose(tree.sum(1, 2), 0.0)
31 |
32 |
33 | def test_prefixsum_idx():
34 | tree = SumSegmentTree(4)
35 |
36 | tree[2] = 1.0
37 | tree[3] = 3.0
38 |
39 | assert tree.find_prefixsum_idx(0.0) == 2
40 | assert tree.find_prefixsum_idx(0.5) == 2
41 | assert tree.find_prefixsum_idx(0.99) == 2
42 | assert tree.find_prefixsum_idx(1.01) == 3
43 | assert tree.find_prefixsum_idx(3.00) == 3
44 | assert tree.find_prefixsum_idx(4.00) == 3
45 |
46 |
47 | def test_prefixsum_idx2():
48 | tree = SumSegmentTree(4)
49 |
50 | tree[0] = 0.5
51 | tree[1] = 1.0
52 | tree[2] = 1.0
53 | tree[3] = 3.0
54 |
55 | assert tree.find_prefixsum_idx(0.00) == 0
56 | assert tree.find_prefixsum_idx(0.55) == 1
57 | assert tree.find_prefixsum_idx(0.99) == 1
58 | assert tree.find_prefixsum_idx(1.51) == 2
59 | assert tree.find_prefixsum_idx(3.00) == 3
60 | assert tree.find_prefixsum_idx(5.50) == 3
61 |
62 |
63 | def test_max_interval_tree():
64 | tree = MinSegmentTree(4)
65 |
66 | tree[0] = 1.0
67 | tree[2] = 0.5
68 | tree[3] = 3.0
69 |
70 | assert np.isclose(tree.min(), 0.5)
71 | assert np.isclose(tree.min(0, 2), 1.0)
72 | assert np.isclose(tree.min(0, 3), 0.5)
73 | assert np.isclose(tree.min(0, -1), 0.5)
74 | assert np.isclose(tree.min(2, 4), 0.5)
75 | assert np.isclose(tree.min(3, 4), 3.0)
76 |
77 | tree[2] = 0.7
78 |
79 | assert np.isclose(tree.min(), 0.7)
80 | assert np.isclose(tree.min(0, 2), 1.0)
81 | assert np.isclose(tree.min(0, 3), 0.7)
82 | assert np.isclose(tree.min(0, -1), 0.7)
83 | assert np.isclose(tree.min(2, 4), 0.7)
84 | assert np.isclose(tree.min(3, 4), 3.0)
85 |
86 | tree[2] = 4.0
87 |
88 | assert np.isclose(tree.min(), 1.0)
89 | assert np.isclose(tree.min(0, 2), 1.0)
90 | assert np.isclose(tree.min(0, 3), 1.0)
91 | assert np.isclose(tree.min(0, -1), 1.0)
92 | assert np.isclose(tree.min(2, 4), 3.0)
93 | assert np.isclose(tree.min(2, 3), 4.0)
94 | assert np.isclose(tree.min(2, -1), 4.0)
95 | assert np.isclose(tree.min(3, 4), 3.0)
96 |
97 |
98 | if __name__ == '__main__':
99 | test_tree_set()
100 | test_tree_set_overlap()
101 | test_prefixsum_idx()
102 | test_prefixsum_idx2()
103 | test_max_interval_tree()
104 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/test_tf_util.py:
--------------------------------------------------------------------------------
1 | # tests for tf_util
2 | import tensorflow as tf
3 | from baselines.common.tf_util import (
4 | function,
5 | initialize,
6 | single_threaded_session
7 | )
8 |
9 |
10 | def test_function():
11 | with tf.Graph().as_default():
12 | x = tf.placeholder(tf.int32, (), name="x")
13 | y = tf.placeholder(tf.int32, (), name="y")
14 | z = 3 * x + 2 * y
15 | lin = function([x, y], z, givens={y: 0})
16 |
17 | with single_threaded_session():
18 | initialize()
19 |
20 | assert lin(2) == 6
21 | assert lin(2, 2) == 10
22 |
23 |
24 | def test_multikwargs():
25 | with tf.Graph().as_default():
26 | x = tf.placeholder(tf.int32, (), name="x")
27 | with tf.variable_scope("other"):
28 | x2 = tf.placeholder(tf.int32, (), name="x")
29 | z = 3 * x + 2 * x2
30 |
31 | lin = function([x, x2], z, givens={x2: 0})
32 | with single_threaded_session():
33 | initialize()
34 | assert lin(2) == 6
35 | assert lin(2, 2) == 10
36 |
37 |
38 | if __name__ == '__main__':
39 | test_function()
40 | test_multikwargs()
41 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tests/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from gym.spaces import np_random
4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
5 |
6 | N_TRIALS = 10000
7 | N_EPISODES = 100
8 |
9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
10 | np.random.seed(0)
11 | np_random.seed(0)
12 |
13 | env = DummyVecEnv([env_fn])
14 |
15 |
16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
17 | tf.set_random_seed(0)
18 |
19 | model = learn_fn(env)
20 |
21 | sum_rew = 0
22 | done = True
23 |
24 | for i in range(n_trials):
25 | if done:
26 | obs = env.reset()
27 | state = model.initial_state
28 |
29 | if state is not None:
30 | a, v, state, _ = model.step(obs, S=state, M=[False])
31 | else:
32 | a, v, _, _ = model.step(obs)
33 |
34 | obs, rew, done, _ = env.step(a)
35 | sum_rew += float(rew)
36 |
37 | print("Reward in {} trials is {}".format(n_trials, sum_rew))
38 | assert sum_rew > min_reward_fraction * n_trials, \
39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)
40 |
41 |
42 |
43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
44 | env = DummyVecEnv([env_fn])
45 |
46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
47 | model = learn_fn(env)
48 |
 49 |         # run n_trials episodes (N_EPISODES by default) rather than a shadowed local constant
 50 | 
 51 |         observations, actions, rewards = rollout(env, model, n_trials)
 52 |         rewards = [sum(r) for r in rewards]
 53 | 
 54 |         avg_rew = sum(rewards) / n_trials
55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
56 | assert avg_rew > min_avg_reward, \
57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)
58 |
59 | def rollout(env, model, n_trials):
60 | rewards = []
61 | actions = []
62 | observations = []
63 |
64 | for i in range(n_trials):
65 | obs = env.reset()
66 | state = model.initial_state if hasattr(model, 'initial_state') else None
67 | episode_rew = []
68 | episode_actions = []
69 | episode_obs = []
70 |
71 | while True:
72 | if state is not None:
73 | a, v, state, _ = model.step(obs, S=state, M=[False])
74 | else:
75 | a,v, _, _ = model.step(obs)
76 |
77 | obs, rew, done, _ = env.step(a)
78 |
79 | episode_rew.append(rew)
80 | episode_actions.append(a)
81 | episode_obs.append(obs)
82 |
83 | if done:
84 | break
85 |
86 | rewards.append(episode_rew)
87 | actions.append(episode_actions)
88 | observations.append(episode_obs)
89 |
90 | return observations, actions, rewards
91 |
92 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/tile_images.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def tile_images(img_nhwc):
4 | """
5 | Tile N images into one big PxQ image
6 | (P,Q) are chosen to be as close as possible, and if N
  7 |     is a perfect square, then P=Q.
8 |
9 | input: img_nhwc, list or array of images, ndim=4 once turned into array
10 | n = batch index, h = height, w = width, c = channel
11 | returns:
12 | bigim_HWc, ndarray with ndim=3
13 | """
14 | img_nhwc = np.asarray(img_nhwc)
15 | N, h, w, c = img_nhwc.shape
16 | H = int(np.ceil(np.sqrt(N)))
17 | W = int(np.ceil(float(N)/H))
18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
22 | return img_Hh_Ww_c
23 |
24 |
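# Illustrative usage sketch (not part of the original file): 5 RGB frames of
# size 8x8 are laid out on a 3x2 grid, with one blank padding tile.
frames = np.random.randint(0, 255, size=(5, 8, 8, 3), dtype=np.uint8)
print(tile_images(frames).shape)   # (24, 16, 3)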
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/dummy_vec_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import spaces
3 | from . import VecEnv
4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info
5 |
6 | class DummyVecEnv(VecEnv):
7 | """
  8 |     VecEnv that runs multiple environments sequentially, that is,
  9 |     the step and reset commands are sent to one environment at a time.
10 | Useful when debugging and when num_env == 1 (in the latter case,
11 | avoids communication overhead)
12 | """
13 | def __init__(self, env_fns):
14 | """
15 | Arguments:
16 |
 17 |         env_fns: iterable of callables that build environments
18 | """
19 | self.envs = [fn() for fn in env_fns]
20 | env = self.envs[0]
21 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
22 | obs_space = env.observation_space
23 | self.keys, shapes, dtypes = obs_space_info(obs_space)
24 |
25 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
26 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
27 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
28 | self.buf_infos = [{} for _ in range(self.num_envs)]
29 | self.actions = None
30 | self.specs = [e.spec for e in self.envs]
31 |
32 | def step_async(self, actions):
33 | listify = True
34 | try:
35 | if len(actions) == self.num_envs:
36 | listify = False
37 | except TypeError:
38 | pass
39 |
40 | if not listify:
41 | self.actions = actions
42 | else:
43 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
44 | self.actions = [actions]
45 |
46 | def step_wait(self):
47 | for e in range(self.num_envs):
48 | action = self.actions[e]
49 | if isinstance(self.envs[e].action_space, spaces.Discrete):
50 | action = int(action)
51 |
52 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
53 | if self.buf_dones[e]:
54 | obs = self.envs[e].reset()
55 | self._save_obs(e, obs)
56 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
57 | self.buf_infos.copy())
58 |
59 | def reset(self):
60 | for e in range(self.num_envs):
61 | obs = self.envs[e].reset()
62 | self._save_obs(e, obs)
63 | return self._obs_from_buf()
64 |
65 | def _save_obs(self, e, obs):
66 | for k in self.keys:
67 | if k is None:
68 | self.buf_obs[k][e] = obs
69 | else:
70 | self.buf_obs[k][e] = obs[k]
71 |
72 | def _obs_from_buf(self):
73 | return dict_to_obs(copy_obs_dict(self.buf_obs))
74 |
75 | def get_images(self):
76 | return [env.render(mode='rgb_array') for env in self.envs]
77 |
78 | def render(self, mode='human'):
79 | if self.num_envs == 1:
80 | return self.envs[0].render(mode=mode)
81 | else:
82 | return super().render(mode=mode)
83 |
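A minimal usage sketch (not part of the repository), assuming gym's CartPole-v1 is available: two environment copies are stepped sequentially through the vectorized interface, one action per environment.

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(2)])
obs = venv.reset()                                      # shape (2, 4): one row per environment
obs, rews, dones, infos = venv.step(np.array([0, 1]))   # one discrete action per environment
venv.close()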
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/test_vec_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for asynchronous vectorized environments.
3 | """
4 |
5 | import gym
6 | import numpy as np
7 | import pytest
8 | from .dummy_vec_env import DummyVecEnv
9 | from .shmem_vec_env import ShmemVecEnv
10 | from .subproc_vec_env import SubprocVecEnv
11 |
12 |
13 | def assert_envs_equal(env1, env2, num_steps):
14 | """
15 | Compare two environments over num_steps steps and make sure
16 | that the observations produced by each are the same when given
17 | the same actions.
18 | """
19 | assert env1.num_envs == env2.num_envs
20 | assert env1.action_space.shape == env2.action_space.shape
21 | assert env1.action_space.dtype == env2.action_space.dtype
22 | joint_shape = (env1.num_envs,) + env1.action_space.shape
23 |
24 | try:
25 | obs1, obs2 = env1.reset(), env2.reset()
26 | assert np.array(obs1).shape == np.array(obs2).shape
27 | assert np.array(obs1).shape == joint_shape
28 | assert np.allclose(obs1, obs2)
29 | np.random.seed(1337)
30 | for _ in range(num_steps):
31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape),
32 | dtype=env1.action_space.dtype)
33 | for env in [env1, env2]:
34 | env.step_async(actions)
35 | outs1 = env1.step_wait()
36 | outs2 = env2.step_wait()
37 | for out1, out2 in zip(outs1[:3], outs2[:3]):
38 | assert np.array(out1).shape == np.array(out2).shape
39 | assert np.allclose(out1, out2)
40 | assert list(outs1[3]) == list(outs2[3])
41 | finally:
42 | env1.close()
43 | env2.close()
44 |
45 |
46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv))
47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32'))
48 | def test_vec_env(klass, dtype): # pylint: disable=R0914
49 | """
50 | Test that a vectorized environment is equivalent to
51 | DummyVecEnv, since DummyVecEnv is simpler and therefore
52 | less error-prone.
53 | """
54 | num_envs = 3
55 | num_steps = 100
56 | shape = (3, 8)
57 |
58 | def make_fn(seed):
59 | """
60 | Get an environment constructor with a seed.
61 | """
62 | return lambda: SimpleEnv(seed, shape, dtype)
63 | fns = [make_fn(i) for i in range(num_envs)]
64 | env1 = DummyVecEnv(fns)
65 | env2 = klass(fns)
66 | assert_envs_equal(env1, env2, num_steps=num_steps)
67 |
68 |
69 | class SimpleEnv(gym.Env):
70 | """
71 | An environment with a pre-determined observation space
72 | and RNG seed.
73 | """
74 |
75 | def __init__(self, seed, shape, dtype):
76 | np.random.seed(seed)
77 | self._dtype = dtype
78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape),
79 | dtype=dtype)
80 | self._max_steps = seed + 1
81 | self._cur_obs = None
82 | self._cur_step = 0
83 | # this is 0xFF instead of 0x100 because the Box space includes
84 | # the high end, while randint does not
85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype)
86 | self.observation_space = self.action_space
87 |
88 | def step(self, action):
89 | self._cur_obs += np.array(action, dtype=self._dtype)
90 | self._cur_step += 1
91 | done = self._cur_step >= self._max_steps
92 | reward = self._cur_step / self._max_steps
93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)}
94 |
95 | def reset(self):
96 | self._cur_obs = self._start_obs
97 | self._cur_step = 0
98 | return self._cur_obs
99 |
100 | def render(self, mode=None):
101 | raise NotImplementedError
102 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/test_video_recorder.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the vectorized video recorder wrapper.
3 | """
4 |
5 | import gym
6 | import pytest
7 | import os
8 | import glob
9 | import tempfile
10 |
11 | from .dummy_vec_env import DummyVecEnv
12 | from .shmem_vec_env import ShmemVecEnv
13 | from .subproc_vec_env import SubprocVecEnv
14 | from .vec_video_recorder import VecVideoRecorder
15 |
16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv))
17 | @pytest.mark.parametrize('num_envs', (1, 4))
18 | @pytest.mark.parametrize('video_length', (10, 100))
19 | @pytest.mark.parametrize('video_interval', (1, 50))
20 | def test_video_recorder(klass, num_envs, video_length, video_interval):
21 | """
22 | Wrap an existing VecEnv with VecVideoRecorder,
23 | make (video_interval + video_length + 1) steps,
24 | then check that the recorded video files are present.
25 | """
26 |
27 | def make_fn():
28 | env = gym.make('PongNoFrameskip-v4')
29 | return env
30 | fns = [make_fn for _ in range(num_envs)]
31 | env = klass(fns)
32 |
33 | with tempfile.TemporaryDirectory() as video_path:
34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length)
35 |
36 | env.reset()
37 | for _ in range(video_interval + video_length + 1):
38 | env.step([0] * num_envs)
39 | env.close()
40 |
41 |
42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4"))
43 |
44 | # one file for each of the two recordings that were triggered
45 | assert len(recorded_video) == 2
46 | # Files are not empty
47 | assert all(os.stat(p).st_size != 0 for p in recorded_video)
48 |
49 |
50 |
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for dealing with vectorized environments.
3 | """
4 |
5 | from collections import OrderedDict
6 |
7 | import gym
8 | import numpy as np
9 |
10 |
11 | def copy_obs_dict(obs):
12 | """
13 | Deep-copy an observation dict.
14 | """
15 | return {k: np.copy(v) for k, v in obs.items()}
16 |
17 |
18 | def dict_to_obs(obs_dict):
19 | """
20 | Convert an observation dict into a raw array if the
21 | original observation space was not a Dict space.
22 | """
23 | if set(obs_dict.keys()) == {None}:
24 | return obs_dict[None]
25 | return obs_dict
26 |
27 |
28 | def obs_space_info(obs_space):
29 | """
30 | Get dict-structured information about a gym.Space.
31 |
32 | Returns:
33 | A tuple (keys, shapes, dtypes):
34 | keys: a list of dict keys.
35 | shapes: a dict mapping keys to shapes.
36 | dtypes: a dict mapping keys to dtypes.
37 | """
38 | if isinstance(obs_space, gym.spaces.Dict):
39 | assert isinstance(obs_space.spaces, OrderedDict)
40 | subspaces = obs_space.spaces
41 | else:
42 | subspaces = {None: obs_space}
43 | keys = []
44 | shapes = {}
45 | dtypes = {}
46 | for key, box in subspaces.items():
47 | keys.append(key)
48 | shapes[key] = box.shape
49 | dtypes[key] = box.dtype
50 | return keys, shapes, dtypes
51 |
52 |
53 | def obs_to_dict(obs):
54 | """
55 | Convert an observation into a dict.
56 | """
57 | if isinstance(obs, dict):
58 | return obs
59 | return {None: obs}
60 |
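A brief sketch (not part of the repository) of how obs_space_info flattens both plain and Dict observation spaces into per-key shapes and dtypes; non-Dict spaces are keyed by None. The space sizes below are arbitrary illustrations.

import gym
import numpy as np
from baselines.common.vec_env.util import obs_space_info

box = gym.spaces.Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8)
keys, shapes, dtypes = obs_space_info(box)
# keys == [None], shapes[None] == (84, 84, 3), dtypes[None] == np.uint8

dict_space = gym.spaces.Dict({'image': box})
keys, shapes, dtypes = obs_space_info(dict_space)
# keys == ['image'], shapes['image'] == (84, 84, 3)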
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/vec_frame_stack.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | import numpy as np
3 | from gym import spaces
4 |
5 |
6 | class VecFrameStack(VecEnvWrapper):
7 | def __init__(self, venv, nstack):
8 | self.venv = venv
9 | self.nstack = nstack
10 | wos = venv.observation_space # wrapped ob space
11 | low = np.repeat(wos.low, self.nstack, axis=-1)
12 | high = np.repeat(wos.high, self.nstack, axis=-1)
13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
16 |
17 | def step_wait(self):
18 | obs, rews, news, infos = self.venv.step_wait()
19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
20 | for (i, new) in enumerate(news):
21 | if new:
22 | self.stackedobs[i] = 0
23 | self.stackedobs[..., -obs.shape[-1]:] = obs
24 | return self.stackedobs, rews, news, infos
25 |
26 | def reset(self):
27 | obs = self.venv.reset()
28 | self.stackedobs[...] = 0
29 | self.stackedobs[..., -obs.shape[-1]:] = obs
30 | return self.stackedobs
31 |
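A minimal sketch (not part of the repository), assuming the Atari dependencies are installed: stacking multiplies the channel dimension of the observation space by nstack, and the newest frame always occupies the last channels.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

venv = VecFrameStack(DummyVecEnv([lambda: gym.make('PongNoFrameskip-v4')]), nstack=4)
obs = venv.reset()
print(obs.shape)  # (1, 210, 160, 12): raw (210, 160, 3) frames stacked 4 deep along the channel axis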
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/vec_monitor.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | from baselines.bench.monitor import ResultsWriter
3 | import numpy as np
4 | import time
5 |
6 |
7 | class VecMonitor(VecEnvWrapper):
8 | def __init__(self, venv, filename=None):
9 | VecEnvWrapper.__init__(self, venv)
10 | self.eprets = None
11 | self.eplens = None
12 | self.tstart = time.time()
13 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart})
14 |
15 | def reset(self):
16 | obs = self.venv.reset()
17 | self.eprets = np.zeros(self.num_envs, 'f')
18 | self.eplens = np.zeros(self.num_envs, 'i')
19 | return obs
20 |
21 | def step_wait(self):
22 | obs, rews, dones, infos = self.venv.step_wait()
23 | self.eprets += rews
24 | self.eplens += 1
25 | newinfos = []
26 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)):
27 | info = info.copy()
28 | if done:
29 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)}
30 | info['episode'] = epinfo
31 | self.eprets[i] = 0
32 | self.eplens[i] = 0
33 | self.results_writer.write_row(epinfo)
34 |
35 | newinfos.append(info)
36 |
37 | return obs, rews, dones, newinfos
38 |
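A minimal sketch (not part of the repository): VecMonitor injects an 'episode' dict (return 'r', length 'l', elapsed time 't') into the info returned on the step at which an episode ends, while the underlying DummyVecEnv auto-resets. CartPole-v1 is just an illustrative environment choice.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

venv = VecMonitor(DummyVecEnv([lambda: gym.make('CartPole-v1')]))
obs = venv.reset()
for _ in range(500):
    obs, rews, dones, infos = venv.step([venv.action_space.sample()])
    for info in infos:
        if 'episode' in info:          # present only when an episode just finished
            print(info['episode'])     # e.g. {'r': 21.0, 'l': 21, 't': 0.0123}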
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/vec_normalize.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | from baselines.common.running_mean_std import RunningMeanStd
3 | import numpy as np
4 |
5 |
6 | class VecNormalize(VecEnvWrapper):
7 | """
8 | A vectorized wrapper that normalizes the observations
9 | and returns from an environment.
10 | """
11 |
12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
13 | VecEnvWrapper.__init__(self, venv)
14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None
16 | self.clipob = clipob
17 | self.cliprew = cliprew
18 | self.ret = np.zeros(self.num_envs)
19 | self.gamma = gamma
20 | self.epsilon = epsilon
21 |
22 | def step_wait(self):
23 | obs, rews, news, infos = self.venv.step_wait()
24 | self.ret = self.ret * self.gamma + rews
25 | obs = self._obfilt(obs)
26 | if self.ret_rms:
27 | self.ret_rms.update(self.ret)
28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
29 | self.ret[news] = 0.
30 | return obs, rews, news, infos
31 |
32 | def _obfilt(self, obs):
33 | if self.ob_rms:
34 | self.ob_rms.update(obs)
35 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
36 | return obs
37 | else:
38 | return obs
39 |
40 | def reset(self):
41 | self.ret = np.zeros(self.num_envs)
42 | obs = self.venv.reset()
43 | return self._obfilt(obs)
44 |
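A minimal sketch (not part of the repository): observations are whitened with a running mean/std and clipped to [-clipob, clipob], and rewards are rescaled by the standard deviation of a running discounted return before being clipped to [-cliprew, cliprew].

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

venv = VecNormalize(DummyVecEnv([lambda: gym.make('CartPole-v1')]))
obs = venv.reset()                                   # normalized observation, clipped to [-10, 10]
obs, rews, dones, infos = venv.step(np.array([0]))   # rews rescaled by running return std, clipped to [-10, 10]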
--------------------------------------------------------------------------------
/miniworld/baselines/common/vec_env/vec_video_recorder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from baselines import logger
3 | from baselines.common.vec_env import VecEnvWrapper
4 | from gym.wrappers.monitoring import video_recorder
5 |
6 |
7 | class VecVideoRecorder(VecEnvWrapper):
8 | """
9 | Wrap VecEnv to record rendered image as mp4 video.
10 | """
11 |
12 | def __init__(self, venv, directory, record_video_trigger, video_length=200):
13 | """
14 | # Arguments
15 | venv: VecEnv to wrap
16 | directory: Where to save videos
17 | record_video_trigger:
18 | Function that defines when to start recording.
19 | The function takes the current step number,
20 | and returns whether we should start recording or not.
21 | video_length: Length of recorded video
22 | """
23 |
24 | VecEnvWrapper.__init__(self, venv)
25 | self.record_video_trigger = record_video_trigger
26 | self.video_recorder = None
27 |
28 | self.directory = os.path.abspath(directory)
29 | if not os.path.exists(self.directory): os.mkdir(self.directory)
30 |
31 | self.file_prefix = "vecenv"
32 | self.file_infix = '{}'.format(os.getpid())
33 | self.step_id = 0
34 | self.video_length = video_length
35 |
36 | self.recording = False
37 | self.recorded_frames = 0
38 |
39 | def reset(self):
40 | obs = self.venv.reset()
41 |
42 | self.start_video_recorder()
43 |
44 | return obs
45 |
46 | def start_video_recorder(self):
47 | self.close_video_recorder()
48 |
49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id))
50 | self.video_recorder = video_recorder.VideoRecorder(
51 | env=self.venv,
52 | base_path=base_path,
53 | metadata={'step_id': self.step_id}
54 | )
55 |
56 | self.video_recorder.capture_frame()
57 | self.recorded_frames = 1
58 | self.recording = True
59 |
60 | def _video_enabled(self):
61 | return self.record_video_trigger(self.step_id)
62 |
63 | def step_wait(self):
64 | obs, rews, dones, infos = self.venv.step_wait()
65 |
66 | self.step_id += 1
67 | if self.recording:
68 | self.video_recorder.capture_frame()
69 | self.recorded_frames += 1
70 | if self.recorded_frames > self.video_length:
71 | logger.info("Saving video to ", self.video_recorder.path)
72 | self.close_video_recorder()
73 | elif self._video_enabled():
74 | self.start_video_recorder()
75 |
76 | return obs, rews, dones, infos
77 |
78 | def close_video_recorder(self):
79 | if self.recording:
80 | self.video_recorder.close()
81 | self.recording = False
82 | self.recorded_frames = 0
83 |
84 | def close(self):
85 | VecEnvWrapper.close(self)
86 | self.close_video_recorder()
87 |
88 | def __del__(self):
89 | self.close()
90 |
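A minimal sketch (not part of the repository); the output directory and trigger below are arbitrary choices for illustration. reset() always starts a recording; afterwards a new clip of up to video_length frames begins whenever record_video_trigger(step_id) returns True and nothing is currently being recorded.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder

venv = DummyVecEnv([lambda: gym.make('PongNoFrameskip-v4')])
venv = VecVideoRecorder(venv, directory='/tmp/vec_videos',
                        record_video_trigger=lambda step: step % 2000 == 0,
                        video_length=200)
obs = venv.reset()        # first clip starts here
for _ in range(300):
    obs, rews, dones, infos = venv.step([0])
venv.close()              # flushes and closes any open recorder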
--------------------------------------------------------------------------------
/miniworld/baselines/ppoc_int/README.md:
--------------------------------------------------------------------------------
1 | # PPOSGD
2 |
3 | - Original paper: https://arxiv.org/abs/1707.06347
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
7 |
8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`
10 |
--------------------------------------------------------------------------------
/miniworld/baselines/ppoc_int/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/baselines/ppoc_int/__init__.py
--------------------------------------------------------------------------------
/miniworld/baselines/ppoc_int/muj.py:
--------------------------------------------------------------------------------
1 |
2 | # from rllab.envs.box2d.cartpole_swingup_env import CartpoleSwingupEnv
3 | # from rllab.envs.mujoco.maze.point_maze_env import PointMazeEnv
4 | # from rllab.envs.mujoco.maze.ant_maze_env import AntMazeEnv
5 |
6 | # from rllab.envs.mujoco.hill.half_cheetah_hill_env import HalfCheetahHillEnv
7 | # from rllab.envs.mujoco.hill.swimmer3d_hill_env import Swimmer3DHillEnv
8 | import pdb
9 | import time
10 | import gym
11 | import numpy as np
12 | # import my_gym;
13 | #from rllab.envs.mujoco.gather.swimmer_gather_env import SwimmerGatherEnv
14 | # from rllab.envs.mujoco.gather.ant_gather_env import AntGatherEnv
15 | # from rllab.envs.mujoco.gather.point_gather_env import PointGatherEnv
16 | # from rllab.envs.box2d.mountain_car_env import MountainCarEnv
17 | #from twod_tmaze2 import TMaze2
18 | #from antwalls import AntWallsEnv
19 |
20 | import time
21 | import gym_miniworld
22 | from gym_miniworld.entity import Box as miniBox
23 | from gym_miniworld.envs.oneroom import OneRoom
24 |
25 |
26 | #from antmaze import AntMazeEnv
27 |
28 | # from wheeled import WheeledEnv
29 | # from wheeled_maze import WheeledMazeEnv
30 | # from blockplaypen import BlockPlayPen
31 | # from twod_multi import TwoDMultiEnv
32 | # env = BlockPlayPen()
33 | # env = TwoDMaze()
34 | # env = TwoDMultiEnv()
35 |
36 |
37 | #env = SwimmerGatherEnv()
38 | #env = AntMazeEnv()
39 |
40 | #env = gym.make('MiniWorld-Hallway-v0')
41 | #env = gym.make('MiniWorld-OneRoom-v0')
42 | #env = gym.make('MiniWorld-PutNext-v0')
43 | env = gym.make('MiniWorld-PickupObjs-v0')
44 |
45 |
46 | #env=AntWallsEnv()
47 | #env= TMaze2()
48 | # env= gym.make("Reacher-v1")
49 | # env.seed(0)
50 | # pdb.set_trace()
51 | # env= PointMazeEnv()
52 | # env = gym.make("Acrobot-v1")
53 | #env.reset()
54 | # env.render()
55 | # state,reward, done, _ = env.step(np.array([0.,10.]))
56 | # env.render()
57 | # state,reward, done, _ = env.step(np.array([0.,10.]))
58 | # env.render()
59 | # state,reward, done, _ = env.step(np.array([0.,10.]))
60 |
61 | episodes = 0
62 |
63 | for step in range(500):
64 | env.render()
65 | time.sleep(1)
66 | # pdb.set_trace()
67 | # print(t)
68 | # if True:
69 | # continue
70 | # print("aaa")
71 | # state,reward, done, _ = env.step(np.array([0.,0.]))
72 | # pdb.set_trace()
73 | state,reward, done, _ = env.step(env.action_space.sample())
74 | #print(env.box.pos)
75 |
76 | done = True
77 | if done:
78 | #pdb.set_trace()
79 | env.reset()
80 | episodes += 1
81 |
82 | # if episodes == 10:
83 | # env = OneRoom(change_goal=True)
84 | #
85 | # time.sleep(0.1)
--------------------------------------------------------------------------------
/miniworld/baselines/ppoc_int/oneroom.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | from ..miniworld import MiniWorldEnv, Room
4 | from ..entity import Box
5 |
6 | class OneRoom(MiniWorldEnv):
7 | """
8 | Environment in which the goal is to go to a red box
9 | placed randomly in one big room.
10 | """
11 |
12 | def __init__(self, size=10, change_goal=None, **kwargs):
13 | assert size >= 2
14 | self.size = size
15 | self.change_goal = change_goal
16 |
17 | super().__init__(
18 | max_episode_steps=180,
19 | **kwargs
20 | )
21 |
22 | def _gen_world(self):
23 | room = self.add_rect_room(
24 | min_x=0,
25 | max_x=self.size,
26 | min_z=0,
27 | max_z=self.size
28 | )
29 |
30 | if not self.change_goal:
31 | self.box = self.place_entity(Box(color='red'))
32 | else:
33 | self.box = self.place_entity(Box(color='blue'))
34 | self.place_agent()
35 |
36 | def step(self, action):
37 | obs, reward, done, info = super().step(action)
38 |
39 | if self.near(self.box):
40 | reward += self._reward()
41 | done = True
42 |
43 | return obs, reward, done, info
44 |
--------------------------------------------------------------------------------
/miniworld/baselines/ppoc_int/run_miniw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from baselines.common import set_global_seeds, tf_util as U
3 | from mpi4py import MPI
4 | from baselines import bench
5 | import os.path as osp
6 | import gym, logging
7 | import gym_miniworld
8 | from baselines import logger
9 |
10 |
11 |
12 | def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc, plots, w_intfc, switch, mainlr, intlr, piolr, fewshot):
13 | from baselines.ppoc_int import cnn_policy, pposgd_simple
14 | rank = MPI.COMM_WORLD.Get_rank()
15 | sess = U.single_threaded_session()
16 | sess.__enter__()
17 | if rank == 0:
18 | logger.configure()
19 | else:
20 | logger.configure(format_strs=[])
21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
22 | set_global_seeds(workerseed)
23 |
24 | env = gym.make(env_id)
25 | env.seed(workerseed)
26 |
27 |
28 | def policy_fn(name, ob_space, ac_space):
29 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, num_options=num_options, dc=dc, w_intfc=w_intfc)
30 |
31 | env = bench.Monitor(env, logger.get_dir() and
32 | osp.join(logger.get_dir(), str(rank)))
33 |
34 | optimsize = int(64 / num_options)
35 |
36 |
37 | num_timesteps = num_timesteps
38 | tperbatch = 2048 if not epoch else int(1e4)
39 | pposgd_simple.learn(env, policy_fn,
40 | max_timesteps=num_timesteps,
41 | timesteps_per_batch=tperbatch,
42 | clip_param=0.2, entcoeff=0.01,
43 | optim_epochs=4, optim_stepsize=mainlr, optim_batchsize=optimsize,
44 | gamma=0.99, lam=0.95, schedule='linear', num_options=num_options,
45 | app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed, dc=dc, plots=plots,
46 | w_intfc=w_intfc, switch=switch, intlr=intlr, piolr=piolr, fewshot=fewshot
47 | )
48 | env.close()
49 |
50 |
51 | def main():
52 | import argparse
53 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
54 | parser.add_argument('--env', help='environment ID', default='MiniWorld-OneRoom-v0')
55 | parser.add_argument('--timesteps', help='number of timesteps', type=int, default=1000000)
56 | parser.add_argument('--seed', help='RNG seed', type=int, default=1)
57 | parser.add_argument('--opt', help='number of options', type=int, default=2)
58 | parser.add_argument('--app', help='Append to folder name', type=str, default='')
59 | parser.add_argument('--saves', dest='saves', action='store_true', default=False)
60 | parser.add_argument('--wsaves', dest='wsaves', action='store_true', default=False)
61 | parser.add_argument('--plots', dest='plots', action='store_true', default=False)
62 | parser.add_argument('--switch', dest='switch', help='switch task after 150 iterations', action='store_true', default=False)
63 | parser.add_argument('--fewshot', dest='fewshot', help='value learning after 150 iterations', action='store_true', default=False)
64 | parser.add_argument('--nointfc', dest='w_intfc', help='disables interest functions', action='store_false', default=True)
65 | parser.add_argument('--epoch', help='Epoch', type=int, default=0)
66 | parser.add_argument('--dc', type=float, default=0.)
67 | parser.add_argument('--mainlr', type=float, default=3e-4)
68 | parser.add_argument('--intlr', type=float, default=1e-4)
69 | parser.add_argument('--piolr', type=float, default=1e-4)
70 |
71 |
72 | args = parser.parse_args()
73 |
74 | train(args.env, num_timesteps=args.timesteps, seed=args.seed, num_options=args.opt, app=args.app, saves=args.saves,
75 | wsaves=args.wsaves, epoch=args.epoch, dc=args.dc, plots=args.plots, w_intfc=args.w_intfc, switch=args.switch,
76 | mainlr=args.mainlr, intlr=args.intlr, piolr=args.piolr, fewshot=args.fewshot)
77 |
78 |
79 | if __name__ == '__main__':
80 | main()
--------------------------------------------------------------------------------
/miniworld/baselines/ppoc_int/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from baselines.common import set_global_seeds, tf_util as U
3 | from baselines import bench
4 | import os.path as osp
5 | import gym, logging
6 | # import gym_miniworld
7 | import pdb
8 | from baselines import logger
9 | import sys
10 | # from gym_miniworld.wrappers import GreyscaleWrapper
11 |
12 | def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc, plots, w_intfc, switch, mainlr, intlr, fewshot):
13 | from baselines.ppoc_int import mlp_policy, pposgd_simple
14 | U.make_session(num_cpu=1).__enter__()
15 | set_global_seeds(seed)
16 |
17 | if env_id=="TMaze":
18 | from twod_tmaze import TMaze
19 | env=TMaze()
20 | env.seed(seed)
21 | elif env_id=="TMaze2":
22 | from twod_tmaze2 import TMaze2
23 | env=TMaze2()
24 | env.seed(seed)
25 | elif env_id=="AntWalls":
26 | from antwalls import AntWallsEnv
27 | env=AntWallsEnv()
28 | env.seed(seed)
29 | elif env_id=="AntMaze":
30 | from ant_maze_env import AntMazeEnv
31 | mazeid = 'Maze'
32 | env = AntMazeEnv(mazeid)
33 | env.seed(seed)
34 | else:
35 | env = gym.make(env_id)
36 | env._seed(seed)
37 |
38 |
39 | def policy_fn(name, ob_space, ac_space):
40 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
41 | hid_size=64, num_hid_layers=2, num_options=num_options, dc=dc, w_intfc=w_intfc)
42 |
43 | gym.logger.setLevel(logging.WARN)
44 |
45 | optimsize=int(64/num_options)
46 |
47 | # pdb.set_trace()
48 | num_timesteps = num_timesteps if env_id!="TMaze" else 5e5
49 | tperbatch = 2048 if not epoch else int(1e4)
50 | pposgd_simple.learn(env, policy_fn,
51 | max_timesteps=num_timesteps,
52 | timesteps_per_batch=tperbatch,
53 | clip_param=0.2, entcoeff=0.0,
54 | optim_epochs=10, optim_stepsize=mainlr, optim_batchsize=optimsize,
55 | gamma=0.99, lam=0.95, schedule='constant', num_options=num_options,
56 | app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed,dc=dc,plots=plots,
57 | w_intfc=w_intfc,switch=switch,intlr=intlr,fewshot=fewshot
58 | )
59 | env.close()
60 |
61 | def main():
62 | import argparse
63 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
64 | parser.add_argument('--env', help='environment ID', default='TMaze')
65 | parser.add_argument('--timesteps', help='number of timesteps', type=int, default=1000000)
66 | parser.add_argument('--seed', help='RNG seed', type=int, default=1)
67 | parser.add_argument('--opt', help='number of options', type=int, default=2)
68 | parser.add_argument('--app', help='Append to folder name', type=str, default='')
69 | parser.add_argument('--saves', dest='saves', action='store_true', default=False)
70 | parser.add_argument('--wsaves', dest='wsaves', action='store_true', default=False)
71 | parser.add_argument('--plots', dest='plots', action='store_true', default=False)
72 | parser.add_argument('--switch', dest='switch', action='store_true', default=False)
73 | parser.add_argument('--fewshot', dest='fewshot', action='store_true', default=False)
74 | parser.add_argument('--nointfc', dest='w_intfc', action='store_false', default=True)
75 | parser.add_argument('--epoch', help='Epoch', type=int, default=0)
76 | parser.add_argument('--dc', type=float, default=0.)
77 | parser.add_argument('--mainlr', type=float, default=3e-4)
78 | parser.add_argument('--intlr', type=float, default=1e-4)
79 |
80 | # pdb.set_trace()
81 | args = parser.parse_args()
82 |
83 | train(args.env, num_timesteps=args.timesteps, seed=args.seed, num_options=args.opt, app=args.app,
84 | saves=args.saves, wsaves=args.wsaves, epoch=args.epoch,dc=args.dc,plots=args.plots,
85 | w_intfc=args.w_intfc,switch=args.switch,mainlr=args.mainlr,intlr=args.intlr,fewshot=args.fewshot)
86 |
87 |
88 | if __name__ == '__main__':
89 | main()
--------------------------------------------------------------------------------
/miniworld/baselines/results_plotter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib
3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
4 |
5 | import matplotlib.pyplot as plt
6 | plt.rcParams['svg.fonttype'] = 'none'
7 |
8 | from baselines.common import plot_util
9 |
10 | X_TIMESTEPS = 'timesteps'
11 | X_EPISODES = 'episodes'
12 | X_WALLTIME = 'walltime_hrs'
13 | Y_REWARD = 'reward'
14 | Y_TIMESTEPS = 'timesteps'
15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
16 | EPISODES_WINDOW = 100
17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
19 | 'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
20 |
21 | def rolling_window(a, window):
22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
23 | strides = a.strides + (a.strides[-1],)
24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
25 |
26 | def window_func(x, y, window, func):
27 | yw = rolling_window(y, window)
28 | yw_func = func(yw, axis=-1)
29 | return x[window-1:], yw_func
30 |
31 | def ts2xy(ts, xaxis, yaxis):
32 | if xaxis == X_TIMESTEPS:
33 | x = np.cumsum(ts.l.values)
34 | elif xaxis == X_EPISODES:
35 | x = np.arange(len(ts))
36 | elif xaxis == X_WALLTIME:
37 | x = ts.t.values / 3600.
38 | else:
39 | raise NotImplementedError
40 | if yaxis == Y_REWARD:
41 | y = ts.r.values
42 | elif yaxis == Y_TIMESTEPS:
43 | y = ts.l.values
44 | else:
45 | raise NotImplementedError
46 | return x, y
47 |
48 | def plot_curves(xy_list, xaxis, yaxis, title):
49 | fig = plt.figure(figsize=(8,2))
50 | maxx = max(xy[0][-1] for xy in xy_list)
51 | minx = 0
52 | for (i, (x, y)) in enumerate(xy_list):
53 | color = COLORS[i % len(COLORS)]
54 | plt.scatter(x, y, s=2)
55 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean)  # average of the last EPISODES_WINDOW episodes
56 | plt.plot(x, y_mean, color=color)
57 | plt.xlim(minx, maxx)
58 | plt.title(title)
59 | plt.xlabel(xaxis)
60 | plt.ylabel(yaxis)
61 | plt.tight_layout()
62 | fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout())
63 | plt.grid(True)
64 |
65 |
66 | def split_by_task(taskpath):
67 | return taskpath['dirname'].split('/')[-1].split('-')[0]
68 |
69 | def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task):
70 | results = plot_util.load_results(dirs)
71 | plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6))
72 |
73 | # Example usage in jupyter-notebook
74 | # from baselines.results_plotter import plot_results
75 | # %matplotlib inline
76 | # plot_results("./log")
77 | # Here ./log is a directory containing the monitor.csv files
78 |
79 | def main():
80 | import argparse
81 | import os
82 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
83 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
84 | parser.add_argument('--num_timesteps', type=int, default=int(10e6))
85 | parser.add_argument('--xaxis', help='Variable on X-axis', default=X_TIMESTEPS)
86 | parser.add_argument('--yaxis', help='Variable on Y-axis', default=Y_REWARD)
87 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
88 | args = parser.parse_args()
89 | args.dirs = [os.path.abspath(dir) for dir in args.dirs]
90 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.yaxis, args.task_name)
91 | plt.show()
92 |
93 | if __name__ == '__main__':
94 | main()
95 |
--------------------------------------------------------------------------------
/miniworld/data/cartpole.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/data/cartpole.gif
--------------------------------------------------------------------------------
/miniworld/data/fetchPickAndPlaceContrast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/data/fetchPickAndPlaceContrast.png
--------------------------------------------------------------------------------
/miniworld/data/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/miniworld/data/logo.jpg
--------------------------------------------------------------------------------
/miniworld/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | select = F,E999,W291,W293
3 | exclude =
4 | .git,
5 | __pycache__,
6 | baselines/ppo1,
7 | baselines/bench,
8 |
--------------------------------------------------------------------------------
/miniworld/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | from setuptools import setup, find_packages
3 | import sys
4 |
5 | if sys.version_info.major != 3:
6 | print('This Python is only compatible with Python 3, but you are running '
7 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major))
8 |
9 |
10 | extras = {
11 | 'test': [
12 | 'filelock',
13 | 'pytest',
14 | 'pytest-forked',
15 | 'atari-py'
16 | ],
17 | 'bullet': [
18 | 'pybullet',
19 | ],
20 | 'mpi': [
21 | 'mpi4py'
22 | ]
23 | }
24 |
25 | all_deps = []
26 | for group_name in extras:
27 | all_deps += extras[group_name]
28 |
29 | extras['all'] = all_deps
30 |
31 | setup(name='baselines',
32 | packages=[package for package in find_packages()
33 | if package.startswith('baselines')],
34 | install_requires=[
35 | 'gym',
36 | 'scipy',
37 | 'tqdm',
38 | 'joblib',
39 | 'dill',
40 | 'progressbar2',
41 | 'cloudpickle',
42 | 'click',
43 | 'opencv-python'
44 | ],
45 | extras_require=extras,
46 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
47 | author='OpenAI',
48 | url='https://github.com/openai/baselines',
49 | author_email='gym@openai.com',
50 | version='0.1.5')
51 |
52 |
53 | # ensure there is some tensorflow build with version above 1.4
54 | import pkg_resources
55 | tf_pkg = None
56 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']:
57 | try:
58 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name)
59 | except pkg_resources.DistributionNotFound:
60 | pass
61 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4'
62 | from distutils.version import LooseVersion
63 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0')
64 |
--------------------------------------------------------------------------------
/tabular/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/.DS_Store
--------------------------------------------------------------------------------
/tabular/FR_Return_10Runs_Smooth_MisspecifiedPiO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/FR_Return_10Runs_Smooth_MisspecifiedPiO.pdf
--------------------------------------------------------------------------------
/tabular/FR_Steps_10Runs_Smooth_MisspecifiedPiO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/FR_Steps_10Runs_Smooth_MisspecifiedPiO.pdf
--------------------------------------------------------------------------------
/tabular/GoalG62.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/GoalG62.png
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/History.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/History.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_0.png
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_1.png
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_2.png
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/IOC_Task1_IntraOptionPolicy_Opt_3.png
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Params.txt:
--------------------------------------------------------------------------------
1 | baseline:True
2 | discount:0.99
3 | epsilon:0.01
4 | lr_critic:0.5
5 | lr_interestfn:0.15
6 | lr_intra:0.25
7 | lr_reg:0.0
8 | lr_term:0.25
9 | nepisodes:500
10 | noptions:4
11 | nruns:10
12 | nsteps:2000
13 | primitive:False
14 | regularize:False
15 | seed:7200
16 | seed_startstate:10
17 | temperature:0.01
18 |
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/StateFreq.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/StateFreq.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_ActionValueFunction.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_ActionValueFunction.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_InterestFunction.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_InterestFunction.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_IntraOption.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_IntraOption.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_OptionValueFunction.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_OptionValueFunction.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Policy.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Policy.npy
--------------------------------------------------------------------------------
/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Termination.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/InterestOptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_IF0.15_LReg0.0_seed7200/Weights_Termination.npy
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/History.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/History.npy
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Params.txt:
--------------------------------------------------------------------------------
1 | baseline:True
2 | discount:0.99
3 | epsilon:0.01
4 | lr_critic:0.5
5 | lr_intra:0.25
6 | lr_term:0.25
7 | nepisodes:500
8 | noptions:4
9 | nruns:10
10 | nsteps:2000
11 | primitive:False
12 | seed:7200
13 | seed_startstate:10
14 | temperature:0.01
15 |
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/StateFreq.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/StateFreq.npy
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_ActionValueFunction.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_ActionValueFunction.npy
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_IntraOption.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_IntraOption.npy
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_OptionValueFunction.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_OptionValueFunction.npy
--------------------------------------------------------------------------------
/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_Termination.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/OptionCritic/Runs10_Epsds500_Eps0.01_NOpt4_LRT0.25_LRI0.25_LRC0.5_temp0.01_seed7200/Weights_Termination.npy
--------------------------------------------------------------------------------
/tabular/TransferVisual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/TransferVisual.png
--------------------------------------------------------------------------------
/tabular/__pycache__/fourrooms.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kkhetarpal/ioc/fb88d5b881e0b5c317020b23874495e614b3ddc7/tabular/__pycache__/fourrooms.cpython-36.pyc
--------------------------------------------------------------------------------
/tabular/fourrooms.py:
--------------------------------------------------------------------------------
1 | # Environment file for the classic Fourrooms grid world
2 | import numpy as np
3 | import gym
4 | from gym import core, spaces
5 | from gym.envs.registration import register
6 | from random import uniform
7 |
8 | #class Fourrooms(gym.Env):
9 | class Fourrooms():
10 | def __init__(self,initstate_seed):
11 | layout = """\
12 | wwwwwwwwwwwww
13 | w w w
14 | w w w
15 | w w
16 | w w w
17 | w w w
18 | ww wwww w
19 | w www www
20 | w w w
21 | w w w
22 | w w
23 | w w w
24 | wwwwwwwwwwwww
25 | """
26 |
27 |
28 | self.occupancy = np.array([list(map(lambda c: 1 if c=='w' else 0, line)) for line in layout.splitlines()])
29 |
30 | # Action Space: from any state the agent can perform one of the four actions; Up, Down, Left and Right
31 | self.action_space = spaces.Discrete(4)
32 |
33 | # Observation Space
34 | self.observation_space = spaces.Discrete(np.sum(self.occupancy == 0))
35 |
36 | self.directions = [np.array((-1,0)), np.array((1,0)), np.array((0,-1)), np.array((0,1))]
37 |
38 | self.rng = np.random.RandomState(1234)
39 |
40 | self.initstate_seed = initstate_seed
41 | self.rng_init_state = np.random.RandomState(self.initstate_seed)
42 |
43 | self.tostate = {}
44 |
45 | self.occ_dict = dict(zip(range(self.observation_space.n),
46 | np.argwhere(self.occupancy.flatten() == 0).squeeze()))
47 |
48 |
49 | statenum = 0
50 | for i in range(13):
51 | for j in range(13):
52 | if self.occupancy[i, j] == 0:
53 | self.tostate[(i, j)] = statenum
54 | statenum += 1
55 |
56 | self.tocell = {v:k for k,v in self.tostate.items()}
57 |
58 | self.goal = 62
59 | self.init_states = list(range(self.observation_space.n))
60 | self.init_states.remove(self.goal)
61 |
62 |
63 | def empty_around(self, cell):
64 | avail = []
65 | for action in range(self.action_space.n):
66 | nextcell = tuple(cell + self.directions[action])
67 | if not self.occupancy[nextcell]:
68 | avail.append(nextcell)
69 | return avail
70 |
71 | # def reset(self):
72 | # state = self.rng.choice(self.init_states)
73 | # self.currentcell = self.tocell[state]
74 | # return state
75 |
76 |
77 | def reset(self):
78 | state = self.rng_init_state.choice(self.init_states)
79 | self.currentcell = self.tocell[state]
80 | return state
81 |
82 | def step(self, action):
83 | """
84 | The agent can perform one of four actions,
85 | up, down, left or right, which have a stochastic effect.
86 | We consider a case in which rewards are zero on all state transitions
87 | except transitions into the goal state, which yield a reward of +50.
88 | """
89 |
90 | reward = 0
91 | if self.rng.uniform() < 1/3:
92 | empty_cells = self.empty_around(self.currentcell)
93 | nextcell = empty_cells[self.rng.randint(len(empty_cells))]
94 | else:
95 | nextcell = tuple(self.currentcell + self.directions[action])
96 |
97 | if not self.occupancy[nextcell]:
98 | self.currentcell = nextcell
99 |
100 | state = self.tostate[self.currentcell]
101 |
102 | if state == self.goal:
103 | reward = 50
104 |
105 | done = state == self.goal
106 | return state, reward, float(done), None
107 |
108 | register(
109 | id='Fourrooms-v0',
110 | entry_point='fourrooms:Fourrooms',
111 | timestep_limit=20000,
112 | reward_threshold=1,
113 | )
114 |
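A brief sketch (not part of the repository) of interacting with the environment directly: an action succeeds with probability 2/3 and otherwise moves the agent to a random empty neighbouring cell, and the episode ends with a reward of +50 at goal state 62. The seed and step cap below are arbitrary.

from fourrooms import Fourrooms

env = Fourrooms(initstate_seed=10)
state = env.reset()
done, steps = False, 0
while not done and steps < 2000:
    state, reward, done, _ = env.step(env.action_space.sample())
    steps += 1
print(state, reward, steps)   # ends at state 62 with reward 50 if the goal was reached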
--------------------------------------------------------------------------------