├── __init__.py
├── roboenvs
│   ├── __init__.py
│   ├── roboschool_hockey.xml
│   ├── roboschool_pong.xml
│   ├── joint_hockey.py
│   └── joint_pong.py
├── mpi_utils.py
├── .gitignore
├── README.md
├── recorder.py
├── cnn_policy.py
├── dynamics.py
├── vec_env.py
├── auxiliary_tasks.py
├── utils.py
├── rollouts.py
├── run.py
├── cppo_agent.py
└── wrappers.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/roboenvs/__init__.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from gym.envs.registration import register
3 |
4 | from .joint_pong import DiscretizeActionWrapper, MultiDiscreteToUsual
5 |
6 | register(
7 | id='RoboschoolPong-v2',
8 | entry_point='.joint_pong:RoboschoolPongJoint',
9 | max_episode_steps=10000,
10 | tags={"pg_complexity": 20 * 1000000},
11 | )
12 |
13 | register(
14 | id='RoboschoolHockey-v1',
15 | entry_point='.joint_hockey:RoboschoolHockeyJoint',
16 | max_episode_steps=1000,
17 | tags={"pg_complexity": 20 * 1000000},
18 | )
19 |
20 |
21 | def make_robopong():
22 | return gym.make("RoboschoolPong-v2")
23 |
24 |
25 | def make_robohockey():
26 | return gym.make("RoboschoolHockey-v1")
27 |
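# Hypothetical usage sketch (not part of the original module): the ids registered above
# behave like any other gym environment, assuming roboschool is importable.
if __name__ == "__main__":
    env = make_robopong()  # equivalent to gym.make("RoboschoolPong-v2")
    obs = env.reset()
    obs, rew, done, info = env.step(env.action_space.sample())
    env.close()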
--------------------------------------------------------------------------------
/mpi_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from mpi4py import MPI
4 |
5 |
6 | class MpiAdamOptimizer(tf.train.AdamOptimizer):
7 | """Adam optimizer that averages gradients across mpi processes."""
8 |
9 | def __init__(self, comm, **kwargs):
10 | self.comm = comm
11 | tf.train.AdamOptimizer.__init__(self, **kwargs)
12 |
13 | def compute_gradients(self, loss, var_list, **kwargs):
14 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs)
15 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
16 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0)
17 | shapes = [v.shape.as_list() for g, v in grads_and_vars]
18 | sizes = [int(np.prod(s)) for s in shapes]
19 |
20 | _task_id, num_tasks = self.comm.Get_rank(), self.comm.Get_size()
21 | buf = np.zeros(sum(sizes), np.float32)
22 |
23 | def _collect_grads(flat_grad):
24 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
25 | np.divide(buf, float(num_tasks), out=buf)
26 | return buf
27 |
28 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
29 | avg_flat_grad.set_shape(flat_grad.shape)
30 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
31 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
32 | for g, (_, v) in zip(avg_grads, grads_and_vars)]
33 |
34 | return avg_grads_and_vars
35 |
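# Hypothetical usage sketch (not part of the original module): minimize a toy quadratic
# with gradients averaged across ranks, e.g. run as `mpiexec -n 2 python mpi_utils.py`.
if __name__ == "__main__":
    x = tf.get_variable("x", initializer=np.float32(1.0))
    loss = tf.square(x - 3.0)
    opt = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-2)
    train_op = opt.minimize(loss, var_list=[x])  # compute_gradients all-reduces internally
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(200):
            sess.run(train_op)
        print("rank %d: x = %.3f" % (MPI.COMM_WORLD.Get_rank(), sess.run(x)))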
--------------------------------------------------------------------------------
/roboenvs/roboschool_hockey.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/roboenvs/roboschool_pong.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | .idea/
109 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Large-Scale Study of Curiosity-Driven Learning ##
2 | #### [[Project Website]](https://pathak22.github.io/large-scale-curiosity/) [[Demo Video]](https://youtu.be/l1FqtAHfJLI)
3 |
4 | [Yuri Burda*](https://sites.google.com/site/yburda/), [Harri Edwards*](https://github.com/harri-edwards/), [Deepak Pathak*](https://people.eecs.berkeley.edu/~pathak/), [Amos Storkey](http://homepages.inf.ed.ac.uk/amos/), [Trevor Darrell](https://people.eecs.berkeley.edu/~trevor/), [Alexei A. Efros](https://people.eecs.berkeley.edu/~efros/)
5 | (* alphabetical ordering, equal contribution)
6 |
7 | University of California, Berkeley
8 | OpenAI
9 | University of Edinburgh
10 |
11 |
12 |
13 |
14 |
15 | This is a TensorFlow-based implementation for our [paper on large-scale study of curiosity-driven learning](https://pathak22.github.io/large-scale-curiosity/) across
16 | 54 environments. Curiosity is a type of intrinsic reward function that uses prediction error as the reward signal. In this paper, we perform the first large-scale study of purely curiosity-driven learning, i.e. without any extrinsic rewards, across 54 standard benchmark environments. We further investigate the effect of using different feature spaces for computing prediction error and show that random features are sufficient for many popular RL game benchmarks, but learned features appear to generalize better (e.g. to novel game levels in Super Mario Bros.). If you find this work useful in your research, please cite:
17 |
18 | @inproceedings{largeScaleCuriosity2018,
19 | Author = {Burda, Yuri and Edwards, Harri and
20 | Pathak, Deepak and Storkey, Amos and
21 | Darrell, Trevor and Efros, Alexei A.},
22 | Title = {Large-Scale Study of Curiosity-Driven Learning},
23 | Booktitle = {arXiv:1808.04355},
24 | Year = {2018}
25 | }
26 |
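The intrinsic reward used throughout this code is the forward model's prediction error in feature space (see `dynamics.py`). A minimal sketch of that signal, with illustrative array names:

```python
import numpy as np

def intrinsic_reward(next_features, predicted_next_features):
    # mean squared error between the features of the next observation and the
    # forward dynamics model's prediction of them (cf. Dynamics.get_loss in dynamics.py)
    return np.mean((next_features - predicted_next_features) ** 2, axis=-1)
```
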
27 | ### Installation and Usage
28 | The following command should train a pure exploration agent on Breakout with default experiment parameters.
29 | ```bash
30 | python run.py
31 | ```
32 | To use more than one GPU/machine, use MPI (e.g. `mpiexec -n 8 python run.py` should use 1024 parallel environments to collect experience instead of the default 128 on an 8-GPU machine).
33 |
34 | ### Other helpful pointers
35 | - [Paper](https://pathak22.github.io/large-scale-curiosity/resources/largeScaleCuriosity2018.pdf)
36 | - [Project Website](https://pathak22.github.io/large-scale-curiosity/)
37 | - [Demo Video](https://youtu.be/l1FqtAHfJLI)
38 |
--------------------------------------------------------------------------------
/recorder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | from baselines import logger
5 | from mpi4py import MPI
6 |
7 |
8 | class Recorder(object):
9 | def __init__(self, nenvs, nlumps):
10 | self.nenvs = nenvs
11 | self.nlumps = nlumps
12 | self.nenvs_per_lump = nenvs // nlumps
13 | self.acs = [[] for _ in range(nenvs)]
14 | self.int_rews = [[] for _ in range(nenvs)]
15 | self.ext_rews = [[] for _ in range(nenvs)]
16 | self.ep_infos = [{} for _ in range(nenvs)]
17 | self.filenames = [self.get_filename(i) for i in range(nenvs)]
18 | if MPI.COMM_WORLD.Get_rank() == 0:
19 | logger.info("episode recordings saved to ", self.filenames[0])
20 |
21 | def record(self, timestep, lump, acs, infos, int_rew, ext_rew, news):
22 | for out_index in range(self.nenvs_per_lump):
23 | in_index = out_index + lump * self.nenvs_per_lump
24 | if timestep == 0:
25 | self.acs[in_index].append(acs[out_index])
26 | else:
27 | if self.is_first_episode_step(in_index):
28 |                     try:
29 |                         self.ep_infos[in_index]['random_state'] = infos[out_index]['random_state']
30 |                     except KeyError:
31 |                         pass
32 |
33 | self.int_rews[in_index].append(int_rew[out_index])
34 | self.ext_rews[in_index].append(ext_rew[out_index])
35 |
36 | if news[out_index]:
37 | self.ep_infos[in_index]['ret'] = infos[out_index]['episode']['r']
38 | self.ep_infos[in_index]['len'] = infos[out_index]['episode']['l']
39 | self.dump_episode(in_index)
40 |
41 | self.acs[in_index].append(acs[out_index])
42 |
43 | def dump_episode(self, i):
44 | episode = {'acs': self.acs[i],
45 | 'int_rew': self.int_rews[i],
46 | 'info': self.ep_infos[i]}
47 | filename = self.filenames[i]
48 | if self.episode_worth_saving(i):
49 | with open(filename, 'ab') as f:
50 | pickle.dump(episode, f, protocol=-1)
51 | self.acs[i].clear()
52 | self.int_rews[i].clear()
53 | self.ext_rews[i].clear()
54 | self.ep_infos[i].clear()
55 |
56 | def episode_worth_saving(self, i):
57 | return (i == 0 and MPI.COMM_WORLD.Get_rank() == 0)
58 |
59 | def is_first_episode_step(self, i):
60 | return len(self.int_rews[i]) == 0
61 |
62 | def get_filename(self, i):
63 | filename = os.path.join(logger.get_dir(), 'env{}_{}.pk'.format(MPI.COMM_WORLD.Get_rank(), i))
64 | return filename
65 |
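# Hypothetical reader sketch (not part of the original module): dump_episode appends one
# pickle per saved episode to the same file, so they can be read back until EOF.
def load_episodes(filename):
    episodes = []
    with open(filename, 'rb') as f:
        while True:
            try:
                episodes.append(pickle.load(f))
            except EOFError:
                return episodes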
--------------------------------------------------------------------------------
/cnn_policy.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from baselines.common.distributions import make_pdtype
3 |
4 | from utils import getsess, small_convnet, activ, fc, flatten_two_dims, unflatten_first_dim
5 |
6 |
7 | class CnnPolicy(object):
8 | def __init__(self, ob_space, ac_space, hidsize,
9 | ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
10 | if layernormalize:
11 | print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")
12 | self.layernormalize = layernormalize
13 | self.nl = nl
14 | self.ob_mean = ob_mean
15 | self.ob_std = ob_std
16 | with tf.variable_scope(scope):
17 | self.ob_space = ob_space
18 | self.ac_space = ac_space
19 | self.ac_pdtype = make_pdtype(ac_space)
20 | self.ph_ob = tf.placeholder(dtype=tf.int32,
21 | shape=(None, None) + ob_space.shape, name='ob')
22 | self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
23 | self.pd = self.vpred = None
24 | self.hidsize = hidsize
25 | self.feat_dim = feat_dim
26 | self.scope = scope
27 | pdparamsize = self.ac_pdtype.param_shape()[0]
28 |
29 | sh = tf.shape(self.ph_ob)
30 | x = flatten_two_dims(self.ph_ob)
31 | self.flat_features = self.get_features(x, reuse=False)
32 | self.features = unflatten_first_dim(self.flat_features, sh)
33 |
34 | with tf.variable_scope(scope, reuse=False):
35 | x = fc(self.flat_features, units=hidsize, activation=activ)
36 | x = fc(x, units=hidsize, activation=activ)
37 | pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
38 | vpred = fc(x, name='value_function_output', units=1, activation=None)
39 | pdparam = unflatten_first_dim(pdparam, sh)
40 | self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
41 | self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
42 | self.a_samp = pd.sample()
43 | self.entropy = pd.entropy()
44 | self.nlp_samp = pd.neglogp(self.a_samp)
45 |
46 | def get_features(self, x, reuse):
47 | x_has_timesteps = (x.get_shape().ndims == 5)
48 | if x_has_timesteps:
49 | sh = tf.shape(x)
50 | x = flatten_two_dims(x)
51 |
52 | with tf.variable_scope(self.scope + "_features", reuse=reuse):
53 | x = (tf.to_float(x) - self.ob_mean) / self.ob_std
54 | x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)
55 |
56 | if x_has_timesteps:
57 | x = unflatten_first_dim(x, sh)
58 | return x
59 |
60 | def get_ac_value_nlp(self, ob):
61 | a, vpred, nlp = \
62 | getsess().run([self.a_samp, self.vpred, self.nlp_samp],
63 | feed_dict={self.ph_ob: ob[:, None]})
64 | return a[:, 0], vpred[:, 0], nlp[:, 0]
65 |
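# Note (not part of the original file): ph_ob has shape (n_envs, n_timesteps) + ob_space.shape.
# At action-selection time, get_ac_value_nlp feeds a single timestep as ob[:, None] and strips
# the time axis again with [:, 0].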
--------------------------------------------------------------------------------
/dynamics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from auxiliary_tasks import JustPixels
5 | from utils import small_convnet, flatten_two_dims, unflatten_first_dim, getsess, unet
6 |
7 |
8 | class Dynamics(object):
9 | def __init__(self, auxiliary_task, predict_from_pixels, feat_dim=None, scope='dynamics'):
10 | self.scope = scope
11 | self.auxiliary_task = auxiliary_task
12 | self.hidsize = self.auxiliary_task.hidsize
13 | self.feat_dim = feat_dim
14 | self.obs = self.auxiliary_task.obs
15 | self.last_ob = self.auxiliary_task.last_ob
16 | self.ac = self.auxiliary_task.ac
17 | self.ac_space = self.auxiliary_task.ac_space
18 | self.ob_mean = self.auxiliary_task.ob_mean
19 | self.ob_std = self.auxiliary_task.ob_std
20 | if predict_from_pixels:
21 | self.features = self.get_features(self.obs, reuse=False)
22 | else:
23 | self.features = tf.stop_gradient(self.auxiliary_task.features)
24 |
25 | self.out_features = self.auxiliary_task.next_features
26 |
27 | with tf.variable_scope(self.scope + "_loss"):
28 | self.loss = self.get_loss()
29 |
30 | def get_features(self, x, reuse):
31 | nl = tf.nn.leaky_relu
32 | x_has_timesteps = (x.get_shape().ndims == 5)
33 | if x_has_timesteps:
34 | sh = tf.shape(x)
35 | x = flatten_two_dims(x)
36 | with tf.variable_scope(self.scope + "_features", reuse=reuse):
37 | x = (tf.to_float(x) - self.ob_mean) / self.ob_std
38 | x = small_convnet(x, nl=nl, feat_dim=self.feat_dim, last_nl=nl, layernormalize=False)
39 | if x_has_timesteps:
40 | x = unflatten_first_dim(x, sh)
41 | return x
42 |
43 | def get_loss(self):
44 | ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
45 | sh = tf.shape(ac)
46 | ac = flatten_two_dims(ac)
47 |
48 | def add_ac(x):
49 | return tf.concat([x, ac], axis=-1)
50 |
51 | with tf.variable_scope(self.scope):
52 | x = flatten_two_dims(self.features)
53 | x = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)
54 |
55 | def residual(x):
56 | res = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)
57 | res = tf.layers.dense(add_ac(res), self.hidsize, activation=None)
58 | return x + res
59 |
60 | for _ in range(4):
61 | x = residual(x)
62 | n_out_features = self.out_features.get_shape()[-1].value
63 | x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
64 | x = unflatten_first_dim(x, sh)
65 | return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, -1)
66 |
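    # Note (not part of the original file): the per-timestep prediction error returned by
    # calculate_loss below is what rollouts.py uses as the intrinsic (curiosity) reward;
    # it is evaluated in chunks of the batch, presumably to keep memory use bounded.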
67 | def calculate_loss(self, ob, last_ob, acs):
68 | n_chunks = 8
69 | n = ob.shape[0]
70 | chunk_size = n // n_chunks
71 | assert n % n_chunks == 0
72 | sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
73 | return np.concatenate([getsess().run(self.loss,
74 | {self.obs: ob[sli(i)], self.last_ob: last_ob[sli(i)],
75 | self.ac: acs[sli(i)]}) for i in range(n_chunks)], 0)
76 |
77 |
78 | class UNet(Dynamics):
79 | def __init__(self, auxiliary_task, predict_from_pixels, feat_dim=None, scope='pixel_dynamics'):
80 | assert isinstance(auxiliary_task, JustPixels)
81 | assert not predict_from_pixels, "predict from pixels must be False, it's set up to predict from features that are normalized pixels."
82 | super(UNet, self).__init__(auxiliary_task=auxiliary_task,
83 | predict_from_pixels=predict_from_pixels,
84 | feat_dim=feat_dim,
85 | scope=scope)
86 |
87 | def get_features(self, x, reuse):
88 | raise NotImplementedError
89 |
90 | def get_loss(self):
91 | nl = tf.nn.leaky_relu
92 | ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
93 | sh = tf.shape(ac)
94 | ac = flatten_two_dims(ac)
95 | ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)
96 |
97 | def add_ac(x):
98 | if x.get_shape().ndims == 2:
99 | return tf.concat([x, ac], axis=-1)
100 | elif x.get_shape().ndims == 4:
101 | sh = tf.shape(x)
102 | return tf.concat(
103 | [x, ac_four_dim + tf.zeros([sh[0], sh[1], sh[2], ac_four_dim.get_shape()[3].value], tf.float32)],
104 | axis=-1)
105 |
106 | with tf.variable_scope(self.scope):
107 | x = flatten_two_dims(self.features)
108 | x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
109 | x = unflatten_first_dim(x, sh)
110 | self.prediction_pixels = x * self.ob_std + self.ob_mean
111 | return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, [2, 3, 4])
112 |
--------------------------------------------------------------------------------
/vec_env.py:
--------------------------------------------------------------------------------
1 | """
2 | An interface for asynchronous vectorized environments.
3 | """
4 |
5 | import ctypes
6 | from abc import ABC, abstractmethod
7 | from multiprocessing import Pipe, Array, Process
8 |
9 | import gym
10 | import numpy as np
11 | from baselines import logger
12 |
13 | _NP_TO_CT = {np.float32: ctypes.c_float,
14 | np.int32: ctypes.c_int32,
15 | np.int8: ctypes.c_int8,
16 | np.uint8: ctypes.c_char,
17 | np.bool: ctypes.c_bool}
18 | _CT_TO_NP = {v: k for k, v in _NP_TO_CT.items()}
19 |
20 |
21 | class CloudpickleWrapper(object):
22 | """
23 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
24 | """
25 |
26 | def __init__(self, x):
27 | self.x = x
28 |
29 | def __getstate__(self):
30 | import cloudpickle
31 | return cloudpickle.dumps(self.x)
32 |
33 | def __setstate__(self, ob):
34 | import pickle
35 | self.x = pickle.loads(ob)
36 |
37 |
38 | class VecEnv(ABC):
39 | """
40 | An abstract asynchronous, vectorized environment.
41 | """
42 |
43 | def __init__(self, num_envs, observation_space, action_space):
44 | self.num_envs = num_envs
45 | self.observation_space = observation_space
46 | self.action_space = action_space
47 |
48 | @abstractmethod
49 | def reset(self):
50 | """
51 | Reset all the environments and return an array of
52 | observations, or a tuple of observation arrays.
53 |
54 | If step_async is still doing work, that work will
55 | be cancelled and step_wait() should not be called
56 | until step_async() is invoked again.
57 | """
58 | pass
59 |
60 | @abstractmethod
61 | def step_async(self, actions):
62 | """
63 | Tell all the environments to start taking a step
64 | with the given actions.
65 | Call step_wait() to get the results of the step.
66 |
67 | You should not call this if a step_async run is
68 | already pending.
69 | """
70 | pass
71 |
72 | @abstractmethod
73 | def step_wait(self):
74 | """
75 | Wait for the step taken with step_async().
76 |
77 | Returns (obs, rews, dones, infos):
78 | - obs: an array of observations, or a tuple of
79 | arrays of observations.
80 | - rews: an array of rewards
81 | - dones: an array of "episode done" booleans
82 | - infos: a sequence of info objects
83 | """
84 | pass
85 |
86 | @abstractmethod
87 | def close(self):
88 | """
89 | Clean up the environments' resources.
90 | """
91 | pass
92 |
93 | def step(self, actions):
94 | self.step_async(actions)
95 | return self.step_wait()
96 |
97 | def render(self):
98 | logger.warn('Render not defined for %s' % self)
99 |
100 |
101 | class ShmemVecEnv(VecEnv):
102 | """
103 |     A VecEnv that uses multiprocessing and shared memory to run
104 |     multiple environments in parallel.
105 | """
106 |
107 | def __init__(self, env_fns, spaces=None):
108 | """
109 | If you don't specify observation_space, we'll have to create a dummy
110 | environment to get it.
111 | """
112 | if spaces:
113 | observation_space, action_space = spaces
114 | else:
115 | logger.log('Creating dummy env object to get spaces')
116 | with logger.scoped_configure(format_strs=[]):
117 | dummy = env_fns[0]()
118 | observation_space, action_space = dummy.observation_space, dummy.action_space
119 | dummy.close()
120 | del dummy
121 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
122 |
123 | obs_spaces = observation_space.spaces if isinstance(self.observation_space, gym.spaces.Tuple) else (
124 | self.observation_space,)
125 | self.obs_bufs = [tuple(Array(_NP_TO_CT[s.dtype.type], int(np.prod(s.shape))) for s in obs_spaces) for _ in
126 | env_fns]
127 | self.obs_shapes = [s.shape for s in obs_spaces]
128 | self.obs_dtypes = [s.dtype for s in obs_spaces]
129 |
130 | self.parent_pipes = []
131 | self.procs = []
132 | for env_fn, obs_buf in zip(env_fns, self.obs_bufs):
133 | wrapped_fn = CloudpickleWrapper(env_fn)
134 | parent_pipe, child_pipe = Pipe()
135 | proc = Process(target=_subproc_worker,
136 | args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes))
137 | proc.daemon = True
138 | self.procs.append(proc)
139 | self.parent_pipes.append(parent_pipe)
140 | proc.start()
141 | child_pipe.close()
142 | self.waiting_step = False
143 |
144 | def reset(self):
145 | if self.waiting_step:
146 | logger.warn('Called reset() while waiting for the step to complete')
147 | self.step_wait()
148 | for pipe in self.parent_pipes:
149 | pipe.send(('reset', None))
150 | return self._decode_obses([pipe.recv() for pipe in self.parent_pipes])
151 |
152 | def step_async(self, actions):
153 | assert len(actions) == len(self.parent_pipes)
154 | for pipe, act in zip(self.parent_pipes, actions):
155 | pipe.send(('step', act))
156 |
157 | def step_wait(self):
158 | outs = [pipe.recv() for pipe in self.parent_pipes]
159 | obs, rews, dones, infos = zip(*outs)
160 | return self._decode_obses(obs), np.array(rews), np.array(dones), infos
161 |
162 | def close(self):
163 | if self.waiting_step:
164 | self.step_wait()
165 | for pipe in self.parent_pipes:
166 | pipe.send(('close', None))
167 | for pipe in self.parent_pipes:
168 | pipe.recv()
169 | pipe.close()
170 | for proc in self.procs:
171 | proc.join()
172 |
173 | def _decode_obses(self, obs):
174 | """
175 | Turn the observation responses into a single numpy
176 | array, possibly via shared memory.
177 | """
178 | obs = []
179 | for i, shape in enumerate(self.obs_shapes):
180 | bufs = [b[i] for b in self.obs_bufs]
181 | o = [np.frombuffer(b.get_obj(), dtype=self.obs_dtypes[i]).reshape(shape) for b in bufs]
182 | obs.append(np.array(o))
183 | return tuple(obs) if len(obs) > 1 else obs[0]
184 |
185 |
186 | def _subproc_worker(pipe, parent_pipe, env_fn_wrapper, obs_buf, obs_shape):
187 | """
188 | Control a single environment instance using IPC and
189 | shared memory.
190 |
191 | If obs_buf is not None, it is a shared-memory buffer
192 | for communicating observations.
193 | """
194 |
195 | def _write_obs(obs):
196 | if not isinstance(obs, tuple):
197 | obs = (obs,)
198 | for o, b, s in zip(obs, obs_buf, obs_shape):
199 | dst = b.get_obj()
200 | dst_np = np.frombuffer(dst, dtype=_CT_TO_NP[dst._type_]).reshape(s) # pylint: disable=W0212
201 | np.copyto(dst_np, o)
202 |
203 | env = env_fn_wrapper.x()
204 | parent_pipe.close()
205 | try:
206 | while True:
207 | cmd, data = pipe.recv()
208 | if cmd == 'reset':
209 | pipe.send(_write_obs(env.reset()))
210 | elif cmd == 'step':
211 | obs, reward, done, info = env.step(data)
212 | if done:
213 | obs = env.reset()
214 | pipe.send((_write_obs(obs), reward, done, info))
215 | elif cmd == 'close':
216 | pipe.send(None)
217 | break
218 | else:
219 | raise RuntimeError('Got unrecognized cmd %s' % cmd)
220 | finally:
221 | env.close()
222 |
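# Hypothetical usage sketch (not part of the original module): step a few shared-memory
# environments in parallel (assumes gym's Atari extras are installed).
if __name__ == "__main__":
    venv = ShmemVecEnv([lambda: gym.make("PongNoFrameskip-v4") for _ in range(4)])
    obs = venv.reset()  # batched uint8 observations, shape (4, 210, 160, 3)
    for _ in range(10):
        acs = [venv.action_space.sample() for _ in range(venv.num_envs)]
        obs, rews, dones, infos = venv.step(acs)
    venv.close()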
--------------------------------------------------------------------------------
/auxiliary_tasks.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from utils import small_convnet, fc, activ, flatten_two_dims, unflatten_first_dim, small_deconvnet
4 |
5 |
6 | class FeatureExtractor(object):
7 | def __init__(self, policy, features_shared_with_policy, feat_dim=None, layernormalize=None,
8 | scope='feature_extractor'):
9 | self.scope = scope
10 | self.features_shared_with_policy = features_shared_with_policy
11 | self.feat_dim = feat_dim
12 | self.layernormalize = layernormalize
13 | self.policy = policy
14 | self.hidsize = policy.hidsize
15 | self.ob_space = policy.ob_space
16 | self.ac_space = policy.ac_space
17 | self.obs = self.policy.ph_ob
18 | self.ob_mean = self.policy.ob_mean
19 | self.ob_std = self.policy.ob_std
20 | with tf.variable_scope(scope):
21 | self.last_ob = tf.placeholder(dtype=tf.int32,
22 | shape=(None, 1) + self.ob_space.shape, name='last_ob')
23 | self.next_ob = tf.concat([self.obs[:, 1:], self.last_ob], 1)
24 |
25 | if features_shared_with_policy:
26 | self.features = self.policy.features
27 | self.last_features = self.policy.get_features(self.last_ob, reuse=True)
28 | else:
29 | self.features = self.get_features(self.obs, reuse=False)
30 | self.last_features = self.get_features(self.last_ob, reuse=True)
31 | self.next_features = tf.concat([self.features[:, 1:], self.last_features], 1)
32 |
33 | self.ac = self.policy.ph_ac
34 | self.scope = scope
35 |
36 | self.loss = self.get_loss()
37 |
38 | def get_features(self, x, reuse):
39 | nl = tf.nn.leaky_relu
40 | x_has_timesteps = (x.get_shape().ndims == 5)
41 | if x_has_timesteps:
42 | sh = tf.shape(x)
43 | x = flatten_two_dims(x)
44 | with tf.variable_scope(self.scope + "_features", reuse=reuse):
45 | x = (tf.to_float(x) - self.ob_mean) / self.ob_std
46 | x = small_convnet(x, nl=nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)
47 | if x_has_timesteps:
48 | x = unflatten_first_dim(x, sh)
49 | return x
50 |
51 | def get_loss(self):
52 | return tf.zeros((), dtype=tf.float32)
53 |
54 |
55 | class InverseDynamics(FeatureExtractor):
56 | def __init__(self, policy, features_shared_with_policy, feat_dim=None, layernormalize=None):
57 | super(InverseDynamics, self).__init__(scope="inverse_dynamics", policy=policy,
58 | features_shared_with_policy=features_shared_with_policy,
59 | feat_dim=feat_dim, layernormalize=layernormalize)
60 |
61 | def get_loss(self):
62 | with tf.variable_scope(self.scope):
63 | x = tf.concat([self.features, self.next_features], 2)
64 | sh = tf.shape(x)
65 | x = flatten_two_dims(x)
66 | x = fc(x, units=self.policy.hidsize, activation=activ)
67 | x = fc(x, units=self.ac_space.n, activation=None)
68 | param = unflatten_first_dim(x, sh)
69 | idfpd = self.policy.ac_pdtype.pdfromflat(param)
70 | return idfpd.neglogp(self.ac)
71 |
72 |
73 | class VAE(FeatureExtractor):
74 | def __init__(self, policy, features_shared_with_policy, feat_dim=None, layernormalize=False, spherical_obs=False):
75 | assert not layernormalize, "VAE features should already have reasonable size, no need to layer normalize them"
76 | self.spherical_obs = spherical_obs
77 | super(VAE, self).__init__(scope="vae", policy=policy,
78 | features_shared_with_policy=features_shared_with_policy,
79 | feat_dim=feat_dim, layernormalize=False)
80 | self.features = tf.split(self.features, 2, -1)[0] # use mean only for features exposed to the dynamics
81 | self.next_features = tf.split(self.next_features, 2, -1)[0]
82 |
83 | def get_features(self, x, reuse):
84 | nl = tf.nn.leaky_relu
85 | x_has_timesteps = (x.get_shape().ndims == 5)
86 | if x_has_timesteps:
87 | sh = tf.shape(x)
88 | x = flatten_two_dims(x)
89 | with tf.variable_scope(self.scope + "_features", reuse=reuse):
90 | x = (tf.to_float(x) - self.ob_mean) / self.ob_std
91 | x = small_convnet(x, nl=nl, feat_dim=2 * self.feat_dim, last_nl=None, layernormalize=False)
92 | if x_has_timesteps:
93 | x = unflatten_first_dim(x, sh)
94 | return x
95 |
96 | def get_loss(self):
97 | with tf.variable_scope(self.scope):
98 | posterior_mean, posterior_scale = tf.split(self.features, 2, -1)
99 | posterior_scale = tf.nn.softplus(posterior_scale)
100 | posterior_distribution = tf.distributions.Normal(loc=posterior_mean, scale=posterior_scale)
101 |
102 | sh = tf.shape(posterior_mean)
103 | prior = tf.distributions.Normal(loc=tf.zeros(sh), scale=tf.ones(sh))
104 |
105 | posterior_kl = tf.distributions.kl_divergence(posterior_distribution, prior)
106 |
107 | posterior_kl = tf.reduce_sum(posterior_kl, [-1])
108 | assert posterior_kl.get_shape().ndims == 2
109 |
110 | posterior_sample = posterior_distribution.sample()
111 | reconstruction_distribution = self.decoder(posterior_sample)
112 | norm_obs = self.add_noise_and_normalize(self.obs)
113 | reconstruction_likelihood = reconstruction_distribution.log_prob(norm_obs)
114 | assert reconstruction_likelihood.get_shape().as_list()[2:] == [84, 84, 4]
115 | reconstruction_likelihood = tf.reduce_sum(reconstruction_likelihood, [2, 3, 4])
116 |
117 | likelihood_lower_bound = reconstruction_likelihood - posterior_kl
118 | return - likelihood_lower_bound
119 |
120 | def add_noise_and_normalize(self, x):
121 | x = tf.to_float(x) + tf.random_uniform(shape=tf.shape(x), minval=0., maxval=1.)
122 | x = (x - self.ob_mean) / self.ob_std
123 | return x
124 |
125 | def decoder(self, z):
126 | nl = tf.nn.leaky_relu
127 | z_has_timesteps = (z.get_shape().ndims == 3)
128 | if z_has_timesteps:
129 | sh = tf.shape(z)
130 | z = flatten_two_dims(z)
131 | with tf.variable_scope(self.scope + "decoder"):
132 | z = small_deconvnet(z, nl=nl, ch=4 if self.spherical_obs else 8, positional_bias=True)
133 | if z_has_timesteps:
134 | z = unflatten_first_dim(z, sh)
135 | if self.spherical_obs:
136 | scale = tf.get_variable(name="scale", shape=(), dtype=tf.float32,
137 | initializer=tf.ones_initializer())
138 | scale = tf.maximum(scale, -4.)
139 | scale = tf.nn.softplus(scale)
140 | scale = scale * tf.ones_like(z)
141 | else:
142 | z, scale = tf.split(z, 2, -1)
143 | scale = tf.nn.softplus(scale)
144 | # scale = tf.Print(scale, [scale])
145 | return tf.distributions.Normal(loc=z, scale=scale)
146 |
147 |
148 | class JustPixels(FeatureExtractor):
149 | def __init__(self, policy, features_shared_with_policy, feat_dim=None, layernormalize=None,
150 | scope='just_pixels'):
151 | assert not layernormalize
152 | assert not features_shared_with_policy
153 | super(JustPixels, self).__init__(scope=scope, policy=policy,
154 | features_shared_with_policy=False,
155 | feat_dim=None, layernormalize=None)
156 |
157 | def get_features(self, x, reuse):
158 | with tf.variable_scope(self.scope + "_features", reuse=reuse):
159 | x = (tf.to_float(x) - self.ob_mean) / self.ob_std
160 | return x
161 |
162 | def get_loss(self):
163 | return tf.zeros((), dtype=tf.float32)
164 |
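# Note (not part of the original file), summarizing how run.py's --feat_learning flag maps
# onto these classes:
#   "none"      -> FeatureExtractor (zero auxiliary loss, so the conv features stay at their
#                  random initialization, i.e. "random features")
#   "idf"       -> InverseDynamics (inverse-dynamics features)
#   "vaesph"    -> VAE(spherical_obs=True)
#   "vaenonsph" -> VAE(spherical_obs=False)
#   "pix2pix"   -> JustPixels (normalized raw pixels, paired with the UNet dynamics model)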
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import os
3 | import platform
4 | from functools import partial
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 | from baselines.common.tf_util import normc_initializer
9 | from mpi4py import MPI
10 |
11 |
12 | def bcast_tf_vars_from_root(sess, vars):
13 | """
14 | Send the root node's parameters to every worker.
15 |
16 | Arguments:
17 | sess: the TensorFlow session.
18 | vars: all parameter variables including optimizer's
19 | """
20 | rank = MPI.COMM_WORLD.Get_rank()
21 | for var in vars:
22 | if rank == 0:
23 | MPI.COMM_WORLD.bcast(sess.run(var))
24 | else:
25 | sess.run(tf.assign(var, MPI.COMM_WORLD.bcast(None)))
26 |
27 |
28 | def get_mean_and_std(array):
29 | comm = MPI.COMM_WORLD
30 | task_id, num_tasks = comm.Get_rank(), comm.Get_size()
31 | local_mean = np.array(np.mean(array))
32 | sum_of_means = np.zeros((), dtype=np.float32)
33 | comm.Allreduce(local_mean, sum_of_means, op=MPI.SUM)
34 | mean = sum_of_means / num_tasks
35 |
36 | n_array = array - mean
37 | sqs = n_array ** 2
38 | local_mean = np.array(np.mean(sqs))
39 | sum_of_means = np.zeros((), dtype=np.float32)
40 | comm.Allreduce(local_mean, sum_of_means, op=MPI.SUM)
41 | var = sum_of_means / num_tasks
42 | std = var ** 0.5
43 | return mean, std
44 |
45 |
46 | def guess_available_gpus(n_gpus=None):
47 | if n_gpus is not None:
48 | return list(range(n_gpus))
49 | if 'CUDA_VISIBLE_DEVICES' in os.environ:
50 |         cuda_visible_devices = os.environ['CUDA_VISIBLE_DEVICES']
51 |         cuda_visible_devices = cuda_visible_devices.split(',')
52 |         return [int(n) for n in cuda_visible_devices]
53 | nvidia_dir = '/proc/driver/nvidia/gpus/'
54 | if os.path.exists(nvidia_dir):
55 | n_gpus = len(os.listdir(nvidia_dir))
56 | return list(range(n_gpus))
57 | raise Exception("Couldn't guess the available gpus on this machine")
58 |
59 |
60 | def setup_mpi_gpus():
61 | """
62 | Set CUDA_VISIBLE_DEVICES using MPI.
63 | """
64 | available_gpus = guess_available_gpus()
65 |
66 | node_id = platform.node()
67 | nodes_ordered_by_rank = MPI.COMM_WORLD.allgather(node_id)
68 | processes_outranked_on_this_node = [n for n in nodes_ordered_by_rank[:MPI.COMM_WORLD.Get_rank()] if n == node_id]
69 | local_rank = len(processes_outranked_on_this_node)
70 | os.environ['CUDA_VISIBLE_DEVICES'] = str(available_gpus[local_rank])
71 |
72 |
73 | def guess_available_cpus():
74 | return int(multiprocessing.cpu_count())
75 |
76 |
77 | def setup_tensorflow_session():
78 | num_cpu = guess_available_cpus()
79 |
80 | tf_config = tf.ConfigProto(
81 | inter_op_parallelism_threads=num_cpu,
82 | intra_op_parallelism_threads=num_cpu
83 | )
84 | return tf.Session(config=tf_config)
85 |
86 |
87 | def random_agent_ob_mean_std(env, nsteps=10000):
88 | ob = np.asarray(env.reset())
89 | if MPI.COMM_WORLD.Get_rank() == 0:
90 | obs = [ob]
91 | for _ in range(nsteps):
92 | ac = env.action_space.sample()
93 | ob, _, done, _ = env.step(ac)
94 | if done:
95 | ob = env.reset()
96 | obs.append(np.asarray(ob))
97 | mean = np.mean(obs, 0).astype(np.float32)
98 | std = np.std(obs, 0).mean().astype(np.float32)
99 | else:
100 | mean = np.empty(shape=ob.shape, dtype=np.float32)
101 | std = np.empty(shape=(), dtype=np.float32)
102 | MPI.COMM_WORLD.Bcast(mean, root=0)
103 | MPI.COMM_WORLD.Bcast(std, root=0)
104 | return mean, std
105 |
106 |
107 | def layernorm(x):
108 | m, v = tf.nn.moments(x, -1, keep_dims=True)
109 | return (x - m) / (tf.sqrt(v) + 1e-8)
110 |
111 |
112 | getsess = tf.get_default_session
113 |
114 | fc = partial(tf.layers.dense, kernel_initializer=normc_initializer(1.))
115 | activ = tf.nn.relu
116 |
117 |
118 | def flatten_two_dims(x):
119 | return tf.reshape(x, [-1] + x.get_shape().as_list()[2:])
120 |
121 |
122 | def unflatten_first_dim(x, sh):
123 | return tf.reshape(x, [sh[0], sh[1]] + x.get_shape().as_list()[1:])
124 |
125 |
126 | def add_pos_bias(x):
127 | with tf.variable_scope(name_or_scope=None, default_name="pos_bias"):
128 | b = tf.get_variable(name="pos_bias", shape=[1] + x.get_shape().as_list()[1:], dtype=tf.float32,
129 | initializer=tf.zeros_initializer())
130 | return x + b
131 |
132 |
133 | def small_convnet(x, nl, feat_dim, last_nl, layernormalize, batchnorm=False):
134 | bn = tf.layers.batch_normalization if batchnorm else lambda x: x
135 | x = bn(tf.layers.conv2d(x, filters=32, kernel_size=8, strides=(4, 4), activation=nl))
136 | x = bn(tf.layers.conv2d(x, filters=64, kernel_size=4, strides=(2, 2), activation=nl))
137 | x = bn(tf.layers.conv2d(x, filters=64, kernel_size=3, strides=(1, 1), activation=nl))
138 | x = tf.reshape(x, (-1, np.prod(x.get_shape().as_list()[1:])))
139 | x = bn(fc(x, units=feat_dim, activation=None))
140 | if last_nl is not None:
141 | x = last_nl(x)
142 | if layernormalize:
143 | x = layernorm(x)
144 | return x
145 |
146 |
147 | def small_deconvnet(z, nl, ch, positional_bias):
148 | sh = (8, 8, 64)
149 | z = fc(z, np.prod(sh), activation=nl)
150 | z = tf.reshape(z, (-1, *sh))
151 | z = tf.layers.conv2d_transpose(z, 128, kernel_size=4, strides=(2, 2), activation=nl, padding='same')
152 | assert z.get_shape().as_list()[1:3] == [16, 16]
153 | z = tf.layers.conv2d_transpose(z, 64, kernel_size=8, strides=(2, 2), activation=nl, padding='same')
154 | assert z.get_shape().as_list()[1:3] == [32, 32]
155 | z = tf.layers.conv2d_transpose(z, ch, kernel_size=8, strides=(3, 3), activation=None, padding='same')
156 | assert z.get_shape().as_list()[1:3] == [96, 96]
157 | z = z[:, 6:-6, 6:-6]
158 | assert z.get_shape().as_list()[1:3] == [84, 84]
159 | if positional_bias:
160 | z = add_pos_bias(z)
161 | return z
162 |
163 |
164 | def unet(x, nl, feat_dim, cond, batchnorm=False):
165 | bn = tf.layers.batch_normalization if batchnorm else lambda x: x
166 | layers = []
167 | x = tf.pad(x, [[0, 0], [6, 6], [6, 6], [0, 0]])
168 | x = bn(tf.layers.conv2d(cond(x), filters=32, kernel_size=8, strides=(3, 3), activation=nl, padding='same'))
169 | assert x.get_shape().as_list()[1:3] == [32, 32]
170 | layers.append(x)
171 | x = bn(tf.layers.conv2d(cond(x), filters=64, kernel_size=8, strides=(2, 2), activation=nl, padding='same'))
172 | layers.append(x)
173 | assert x.get_shape().as_list()[1:3] == [16, 16]
174 | x = bn(tf.layers.conv2d(cond(x), filters=64, kernel_size=4, strides=(2, 2), activation=nl, padding='same'))
175 | layers.append(x)
176 | assert x.get_shape().as_list()[1:3] == [8, 8]
177 |
178 | x = tf.reshape(x, (-1, np.prod(x.get_shape().as_list()[1:])))
179 | x = fc(cond(x), units=feat_dim, activation=nl)
180 |
181 | def residual(x):
182 | res = bn(tf.layers.dense(cond(x), feat_dim, activation=tf.nn.leaky_relu))
183 | res = tf.layers.dense(cond(res), feat_dim, activation=None)
184 | return x + res
185 |
186 | for _ in range(4):
187 | x = residual(x)
188 |
189 | sh = (8, 8, 64)
190 | x = fc(cond(x), np.prod(sh), activation=nl)
191 | x = tf.reshape(x, (-1, *sh))
192 | x += layers.pop()
193 | x = bn(tf.layers.conv2d_transpose(cond(x), 64, kernel_size=4, strides=(2, 2), activation=nl, padding='same'))
194 | assert x.get_shape().as_list()[1:3] == [16, 16]
195 | x += layers.pop()
196 | x = bn(tf.layers.conv2d_transpose(cond(x), 32, kernel_size=8, strides=(2, 2), activation=nl, padding='same'))
197 | assert x.get_shape().as_list()[1:3] == [32, 32]
198 | x += layers.pop()
199 | x = tf.layers.conv2d_transpose(cond(x), 4, kernel_size=8, strides=(3, 3), activation=None, padding='same')
200 | assert x.get_shape().as_list()[1:3] == [96, 96]
201 | x = x[:, 6:-6, 6:-6]
202 | assert x.get_shape().as_list()[1:3] == [84, 84]
203 | assert layers == []
204 | return x
205 |
206 |
207 | def tile_images(array, n_cols=None, max_images=None, div=1):
208 | if max_images is not None:
209 | array = array[:max_images]
210 | if len(array.shape) == 4 and array.shape[3] == 1:
211 | array = array[:, :, :, 0]
212 | assert len(array.shape) in [3, 4], "wrong number of dimensions - shape {}".format(array.shape)
213 | if len(array.shape) == 4:
214 |         assert array.shape[3] == 3, "wrong number of channels - shape {}".format(array.shape)
215 | if n_cols is None:
216 | n_cols = max(int(np.sqrt(array.shape[0])) // div * div, div)
217 | n_rows = int(np.ceil(float(array.shape[0]) / n_cols))
218 |
219 | def cell(i, j):
220 | ind = i * n_cols + j
221 | return array[ind] if ind < array.shape[0] else np.zeros(array[0].shape)
222 |
223 | def row(i):
224 | return np.concatenate([cell(i, j) for j in range(n_cols)], axis=1)
225 |
226 | return np.concatenate([row(i) for i in range(n_rows)], axis=0)
227 |
228 |
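# Hypothetical example (not part of the original module): flatten_two_dims and
# unflatten_first_dim round-trip a (batch, time, ...) tensor, the pattern used by the
# policy, auxiliary tasks and dynamics networks above.
if __name__ == "__main__":
    x = tf.zeros([4, 8, 84, 84, 4])           # (batch, time, H, W, C)
    sh = tf.shape(x)
    flat = flatten_two_dims(x)                 # -> (batch * time, H, W, C)
    restored = unflatten_first_dim(flat, sh)   # -> (batch, time, H, W, C)
    print(flat.get_shape().as_list(), restored.get_shape().as_list())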
--------------------------------------------------------------------------------
/rollouts.py:
--------------------------------------------------------------------------------
1 | from collections import deque, defaultdict
2 |
3 | import numpy as np
4 | from mpi4py import MPI
5 |
6 | from recorder import Recorder
7 |
8 |
9 | class Rollout(object):
10 | def __init__(self, ob_space, ac_space, nenvs, nsteps_per_seg, nsegs_per_env, nlumps, envs, policy,
11 | int_rew_coeff, ext_rew_coeff, record_rollouts, dynamics):
12 | self.nenvs = nenvs
13 | self.nsteps_per_seg = nsteps_per_seg
14 | self.nsegs_per_env = nsegs_per_env
15 | self.nsteps = self.nsteps_per_seg * self.nsegs_per_env
16 | self.ob_space = ob_space
17 | self.ac_space = ac_space
18 | self.nlumps = nlumps
19 | self.lump_stride = nenvs // self.nlumps
20 | self.envs = envs
21 | self.policy = policy
22 | self.dynamics = dynamics
23 |
24 | self.reward_fun = lambda ext_rew, int_rew: ext_rew_coeff * np.clip(ext_rew, -1., 1.) + int_rew_coeff * int_rew
25 |
26 | self.buf_vpreds = np.empty((nenvs, self.nsteps), np.float32)
27 | self.buf_nlps = np.empty((nenvs, self.nsteps), np.float32)
28 | self.buf_rews = np.empty((nenvs, self.nsteps), np.float32)
29 | self.buf_ext_rews = np.empty((nenvs, self.nsteps), np.float32)
30 | self.buf_acs = np.empty((nenvs, self.nsteps, *self.ac_space.shape), self.ac_space.dtype)
31 | self.buf_obs = np.empty((nenvs, self.nsteps, *self.ob_space.shape), self.ob_space.dtype)
32 | self.buf_obs_last = np.empty((nenvs, self.nsegs_per_env, *self.ob_space.shape), np.float32)
33 |
34 | self.buf_news = np.zeros((nenvs, self.nsteps), np.float32)
35 | self.buf_new_last = self.buf_news[:, 0, ...].copy()
36 | self.buf_vpred_last = self.buf_vpreds[:, 0, ...].copy()
37 |
38 | self.env_results = [None] * self.nlumps
39 | # self.prev_feat = [None for _ in range(self.nlumps)]
40 | # self.prev_acs = [None for _ in range(self.nlumps)]
41 | self.int_rew = np.zeros((nenvs,), np.float32)
42 |
43 | self.recorder = Recorder(nenvs=self.nenvs, nlumps=self.nlumps) if record_rollouts else None
44 | self.statlists = defaultdict(lambda: deque([], maxlen=100))
45 | self.stats = defaultdict(float)
46 | self.best_ext_ret = None
47 | self.all_visited_rooms = []
48 | self.all_scores = []
49 |
50 | self.step_count = 0
51 |
52 | def collect_rollout(self):
53 | self.ep_infos_new = []
54 | for t in range(self.nsteps):
55 | self.rollout_step()
56 | self.calculate_reward()
57 | self.update_info()
58 |
59 | def calculate_reward(self):
60 | int_rew = self.dynamics.calculate_loss(ob=self.buf_obs,
61 | last_ob=self.buf_obs_last,
62 | acs=self.buf_acs)
63 | self.buf_rews[:] = self.reward_fun(int_rew=int_rew, ext_rew=self.buf_ext_rews)
64 |
65 | def rollout_step(self):
66 | t = self.step_count % self.nsteps
67 | s = t % self.nsteps_per_seg
68 | for l in range(self.nlumps):
69 | obs, prevrews, news, infos = self.env_get(l)
70 | # if t > 0:
71 | # prev_feat = self.prev_feat[l]
72 | # prev_acs = self.prev_acs[l]
73 | for info in infos:
74 | epinfo = info.get('episode', {})
75 | mzepinfo = info.get('mz_episode', {})
76 | retroepinfo = info.get('retro_episode', {})
77 | epinfo.update(mzepinfo)
78 | epinfo.update(retroepinfo)
79 | if epinfo:
80 | if "n_states_visited" in info:
81 | epinfo["n_states_visited"] = info["n_states_visited"]
82 | epinfo["states_visited"] = info["states_visited"]
83 | self.ep_infos_new.append((self.step_count, epinfo))
84 |
85 | sli = slice(l * self.lump_stride, (l + 1) * self.lump_stride)
86 |
87 | acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
88 | self.env_step(l, acs)
89 |
90 | # self.prev_feat[l] = dyn_feat
91 | # self.prev_acs[l] = acs
92 | self.buf_obs[sli, t] = obs
93 | self.buf_news[sli, t] = news
94 | self.buf_vpreds[sli, t] = vpreds
95 | self.buf_nlps[sli, t] = nlps
96 | self.buf_acs[sli, t] = acs
97 | if t > 0:
98 | self.buf_ext_rews[sli, t - 1] = prevrews
99 | # if t > 0:
100 | # dyn_logp = self.policy.call_reward(prev_feat, pol_feat, prev_acs)
101 | #
102 | # int_rew = dyn_logp.reshape(-1, )
103 | #
104 | # self.int_rew[sli] = int_rew
105 | # self.buf_rews[sli, t - 1] = self.reward_fun(ext_rew=prevrews, int_rew=int_rew)
106 | if self.recorder is not None:
107 | self.recorder.record(timestep=self.step_count, lump=l, acs=acs, infos=infos, int_rew=self.int_rew[sli],
108 | ext_rew=prevrews, news=news)
109 | self.step_count += 1
110 | if s == self.nsteps_per_seg - 1:
111 | for l in range(self.nlumps):
112 | sli = slice(l * self.lump_stride, (l + 1) * self.lump_stride)
113 | nextobs, ext_rews, nextnews, _ = self.env_get(l)
114 | self.buf_obs_last[sli, t // self.nsteps_per_seg] = nextobs
115 | if t == self.nsteps - 1:
116 | self.buf_new_last[sli] = nextnews
117 | self.buf_ext_rews[sli, t] = ext_rews
118 | _, self.buf_vpred_last[sli], _ = self.policy.get_ac_value_nlp(nextobs)
119 | # dyn_logp = self.policy.call_reward(self.prev_feat[l], last_pol_feat, prev_acs)
120 | # dyn_logp = dyn_logp.reshape(-1, )
121 | # int_rew = dyn_logp
122 | #
123 | # self.int_rew[sli] = int_rew
124 | # self.buf_rews[sli, t] = self.reward_fun(ext_rew=ext_rews, int_rew=int_rew)
125 |
126 | def update_info(self):
127 | all_ep_infos = MPI.COMM_WORLD.allgather(self.ep_infos_new)
128 | all_ep_infos = sorted(sum(all_ep_infos, []), key=lambda x: x[0])
129 | if all_ep_infos:
130 | all_ep_infos = [i_[1] for i_ in all_ep_infos] # remove the step_count
131 | keys_ = all_ep_infos[0].keys()
132 | all_ep_infos = {k: [i[k] for i in all_ep_infos] for k in keys_}
133 |
134 | self.statlists['eprew'].extend(all_ep_infos['r'])
135 | self.stats['eprew_recent'] = np.mean(all_ep_infos['r'])
136 | self.statlists['eplen'].extend(all_ep_infos['l'])
137 | self.stats['epcount'] += len(all_ep_infos['l'])
138 | self.stats['tcount'] += sum(all_ep_infos['l'])
139 | if 'visited_rooms' in keys_:
140 | # Montezuma specific logging.
141 | self.stats['visited_rooms'] = sorted(list(set.union(*all_ep_infos['visited_rooms'])))
142 | self.stats['pos_count'] = np.mean(all_ep_infos['pos_count'])
143 | self.all_visited_rooms.extend(self.stats['visited_rooms'])
144 | self.all_scores.extend(all_ep_infos["r"])
145 | self.all_scores = sorted(list(set(self.all_scores)))
146 | self.all_visited_rooms = sorted(list(set(self.all_visited_rooms)))
147 | if MPI.COMM_WORLD.Get_rank() == 0:
148 | print("All visited rooms")
149 | print(self.all_visited_rooms)
150 | print("All scores")
151 | print(self.all_scores)
152 | if 'levels' in keys_:
153 | # Retro logging
154 | temp = sorted(list(set.union(*all_ep_infos['levels'])))
155 | self.all_visited_rooms.extend(temp)
156 | self.all_visited_rooms = sorted(list(set(self.all_visited_rooms)))
157 | if MPI.COMM_WORLD.Get_rank() == 0:
158 | print("All visited levels")
159 | print(self.all_visited_rooms)
160 |
161 | current_max = np.max(all_ep_infos['r'])
162 | else:
163 | current_max = None
164 | self.ep_infos_new = []
165 |
166 | if current_max is not None:
167 | if (self.best_ext_ret is None) or (current_max > self.best_ext_ret):
168 | self.best_ext_ret = current_max
169 | self.current_max = current_max
170 |
171 | def env_step(self, l, acs):
172 | self.envs[l].step_async(acs)
173 | self.env_results[l] = None
174 |
175 | def env_get(self, l):
176 | if self.step_count == 0:
177 | ob = self.envs[l].reset()
178 | out = self.env_results[l] = (ob, None, np.ones(self.lump_stride, bool), {})
179 | else:
180 | if self.env_results[l] is None:
181 | out = self.env_results[l] = self.envs[l].step_wait()
182 | else:
183 | out = self.env_results[l]
184 | return out
185 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | try:
3 | from OpenGL import GLU
4 | except ImportError:
5 | print("no OpenGL.GLU")
6 | import functools
7 | import os.path as osp
8 | from functools import partial
9 |
10 | import gym
11 | import tensorflow as tf
12 | from baselines import logger
13 | from baselines.bench import Monitor
14 | from baselines.common.atari_wrappers import NoopResetEnv, FrameStack
15 | from mpi4py import MPI
16 |
17 | from auxiliary_tasks import FeatureExtractor, InverseDynamics, VAE, JustPixels
18 | from cnn_policy import CnnPolicy
19 | from cppo_agent import PpoOptimizer
20 | from dynamics import Dynamics, UNet
21 | from utils import random_agent_ob_mean_std
22 | from wrappers import MontezumaInfoWrapper, make_mario_env, make_robo_pong, make_robo_hockey, \
23 | make_multi_pong, AddRandomStateToInfo, MaxAndSkipEnv, ProcessFrame84, ExtraTimeLimit
24 |
25 |
26 | def start_experiment(**args):
27 | make_env = partial(make_env_all_params, add_monitor=True, args=args)
28 |
29 | trainer = Trainer(make_env=make_env,
30 | num_timesteps=args['num_timesteps'], hps=args,
31 | envs_per_process=args['envs_per_process'])
32 | log, tf_sess = get_experiment_environment(**args)
33 | with log, tf_sess:
34 | logdir = logger.get_dir()
35 | print("results will be saved to ", logdir)
36 | trainer.train()
37 |
38 |
39 | class Trainer(object):
40 | def __init__(self, make_env, hps, num_timesteps, envs_per_process):
41 | self.make_env = make_env
42 | self.hps = hps
43 | self.envs_per_process = envs_per_process
44 | self.num_timesteps = num_timesteps
45 | self._set_env_vars()
46 |
47 | self.policy = CnnPolicy(
48 | scope='pol',
49 | ob_space=self.ob_space,
50 | ac_space=self.ac_space,
51 | hidsize=512,
52 | feat_dim=512,
53 | ob_mean=self.ob_mean,
54 | ob_std=self.ob_std,
55 | layernormalize=False,
56 | nl=tf.nn.leaky_relu)
57 |
58 | self.feature_extractor = {"none": FeatureExtractor,
59 | "idf": InverseDynamics,
60 | "vaesph": partial(VAE, spherical_obs=True),
61 | "vaenonsph": partial(VAE, spherical_obs=False),
62 | "pix2pix": JustPixels}[hps['feat_learning']]
63 | self.feature_extractor = self.feature_extractor(policy=self.policy,
64 | features_shared_with_policy=False,
65 | feat_dim=512,
66 | layernormalize=hps['layernorm'])
67 |
68 | self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
69 | self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
70 | predict_from_pixels=hps['dyn_from_pixels'],
71 | feat_dim=512)
72 |
73 | self.agent = PpoOptimizer(
74 | scope='ppo',
75 | ob_space=self.ob_space,
76 | ac_space=self.ac_space,
77 | stochpol=self.policy,
78 | use_news=hps['use_news'],
79 | gamma=hps['gamma'],
80 | lam=hps["lambda"],
81 | nepochs=hps['nepochs'],
82 | nminibatches=hps['nminibatches'],
83 | lr=hps['lr'],
84 | cliprange=0.1,
85 | nsteps_per_seg=hps['nsteps_per_seg'],
86 | nsegs_per_env=hps['nsegs_per_env'],
87 | ent_coef=hps['ent_coeff'],
88 | normrew=hps['norm_rew'],
89 | normadv=hps['norm_adv'],
90 | ext_coeff=hps['ext_coeff'],
91 | int_coeff=hps['int_coeff'],
92 | dynamics=self.dynamics
93 | )
94 |
95 | self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
96 | self.agent.total_loss += self.agent.to_report['aux']
97 | self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
98 | self.agent.total_loss += self.agent.to_report['dyn_loss']
99 | self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
100 |
101 | def _set_env_vars(self):
102 | env = self.make_env(0, add_monitor=False)
103 | self.ob_space, self.ac_space = env.observation_space, env.action_space
104 | self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
105 | del env
106 | self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]
107 |
108 | def train(self):
109 | self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
110 | while True:
111 | info = self.agent.step()
112 | if info['update']:
113 | logger.logkvs(info['update'])
114 | logger.dumpkvs()
115 | if self.agent.rollout.stats['tcount'] > self.num_timesteps:
116 | break
117 |
118 | self.agent.stop_interaction()
119 |
120 |
121 | def make_env_all_params(rank, add_monitor, args):
122 | if args["env_kind"] == 'atari':
123 | env = gym.make(args['env'])
124 | assert 'NoFrameskip' in env.spec.id
125 | env = NoopResetEnv(env, noop_max=args['noop_max'])
126 | env = MaxAndSkipEnv(env, skip=4)
127 | env = ProcessFrame84(env, crop=False)
128 | env = FrameStack(env, 4)
129 | env = ExtraTimeLimit(env, args['max_episode_steps'])
130 | if 'Montezuma' in args['env']:
131 | env = MontezumaInfoWrapper(env)
132 | env = AddRandomStateToInfo(env)
133 | elif args["env_kind"] == 'mario':
134 | env = make_mario_env()
135 | elif args["env_kind"] == "retro_multi":
136 | env = make_multi_pong()
137 | elif args["env_kind"] == 'robopong':
138 | if args["env"] == "pong":
139 | env = make_robo_pong()
140 | elif args["env"] == "hockey":
141 | env = make_robo_hockey()
142 |
143 | if add_monitor:
144 | env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
145 | return env
146 |
147 |
148 | def get_experiment_environment(**args):
149 | from utils import setup_mpi_gpus, setup_tensorflow_session
150 | from baselines.common import set_global_seeds
151 | from gym.utils.seeding import hash_seed
152 | process_seed = args["seed"] + 1000 * MPI.COMM_WORLD.Get_rank()
153 | process_seed = hash_seed(process_seed, max_bytes=4)
154 | set_global_seeds(process_seed)
155 | setup_mpi_gpus()
156 |
157 | logger_context = logger.scoped_configure(dir=None,
158 | format_strs=['stdout', 'log',
159 | 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else ['log'])
160 | tf_context = setup_tensorflow_session()
161 | return logger_context, tf_context
162 |
163 |
164 | def add_environments_params(parser):
165 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4',
166 | type=str)
167 | parser.add_argument('--max-episode-steps', help='maximum number of timesteps for episode', default=4500, type=int)
168 | parser.add_argument('--env_kind', type=str, default="atari")
169 | parser.add_argument('--noop_max', type=int, default=30)
170 |
171 |
172 | def add_optimization_params(parser):
173 | parser.add_argument('--lambda', type=float, default=0.95)
174 | parser.add_argument('--gamma', type=float, default=0.99)
175 | parser.add_argument('--nminibatches', type=int, default=8)
176 | parser.add_argument('--norm_adv', type=int, default=1)
177 | parser.add_argument('--norm_rew', type=int, default=1)
178 | parser.add_argument('--lr', type=float, default=1e-4)
179 | parser.add_argument('--ent_coeff', type=float, default=0.001)
180 | parser.add_argument('--nepochs', type=int, default=3)
181 | parser.add_argument('--num_timesteps', type=int, default=int(1e8))
182 |
183 |
184 | def add_rollout_params(parser):
185 | parser.add_argument('--nsteps_per_seg', type=int, default=128)
186 | parser.add_argument('--nsegs_per_env', type=int, default=1)
187 | parser.add_argument('--envs_per_process', type=int, default=128)
188 | parser.add_argument('--nlumps', type=int, default=1)
189 |
190 |
191 | if __name__ == '__main__':
192 | import argparse
193 |
194 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
195 | add_environments_params(parser)
196 | add_optimization_params(parser)
197 | add_rollout_params(parser)
198 |
199 | parser.add_argument('--exp_name', type=str, default='')
200 | parser.add_argument('--seed', help='RNG seed', type=int, default=0)
201 | parser.add_argument('--dyn_from_pixels', type=int, default=0)
202 | parser.add_argument('--use_news', type=int, default=0)
203 | parser.add_argument('--ext_coeff', type=float, default=0.)
204 | parser.add_argument('--int_coeff', type=float, default=1.)
205 | parser.add_argument('--layernorm', type=int, default=0)
206 | parser.add_argument('--feat_learning', type=str, default="none",
207 | choices=["none", "idf", "vaesph", "vaenonsph", "pix2pix"])
208 |
209 | args = parser.parse_args()
210 |
211 | start_experiment(**args.__dict__)
212 |
--------------------------------------------------------------------------------
/roboenvs/joint_hockey.py:
--------------------------------------------------------------------------------
1 | # Make a basic version of hockey, run it with a random agent.
2 |
3 | import os
4 | import sys
5 |
6 | import gym
7 | import gym.spaces
8 | import gym.utils
9 | import gym.utils.seeding
10 | import numpy as np
11 | import roboschool
12 | from roboschool.scene_abstract import Scene, cpp_household
13 |
14 |
15 | class HockeyScene(Scene):
16 | # multiplayer = False
17 | # players_count = 1
18 | VIDEO_W = 84
19 | VIDEO_H = 84
20 | TIMEOUT = 300
21 |
22 | def __init__(self):
23 | Scene.__init__(self, gravity=9.8, timestep=0.0165 / 4, frame_skip=8)
24 | self.score_left = 0
25 | self.score_right = 0
26 |
27 | def actor_introduce(self, robot):
28 | i = robot.player_n - 1
29 |
30 | def episode_restart(self):
31 | Scene.episode_restart(self)
32 | if self.score_right + self.score_left > 0:
33 | sys.stdout.write("%i:%i " % (self.score_left, self.score_right))
34 | sys.stdout.flush()
35 | self.mjcf = self.cpp_world.load_mjcf(os.path.join(os.path.dirname(__file__), "roboschool_hockey.xml"))
36 | dump = 0
37 | for r in self.mjcf:
38 | if dump: print("ROBOT '%s'" % r.root_part.name)
39 | for part in r.parts:
40 | if dump: print("\tPART '%s'" % part.name)
41 | # if part.name==self.robot_name:
42 | for j in r.joints:
43 | if j.name == "p0x": self.p0x = j
44 | if j.name == "p0y": self.p0y = j
45 | if j.name == "p1x": self.p1x = j
46 | if j.name == "p1y": self.p1y = j
47 | if j.name == "ballx": self.ballx = j
48 | if j.name == "bally": self.bally = j
49 | self.ballx.set_motor_torque(0.0)
50 | self.bally.set_motor_torque(0.0)
51 | for r in self.mjcf:
52 | r.query_position()
53 | fpose = cpp_household.Pose()
54 | fpose.set_xyz(0, 0, -0.04)
55 | self.field = self.cpp_world.load_thingy(
56 | os.path.join(os.path.dirname(roboschool.__file__), "models_outdoor/stadium/pong1.obj"),
57 | fpose, 1.0, 0, 0xFFFFFF, True)
58 | self.camera = self.cpp_world.new_camera_free_float(self.VIDEO_W, self.VIDEO_H, "video_camera")
59 | self.camera_itertia = 0
60 | self.frame = 0
61 | self.jstate_for_frame = -1
62 | self.score_left = 0
63 | self.score_right = 0
64 | self.bounce_n = 0
65 | self.restart_from_center(self.np_random.randint(2) == 0)
66 |
67 | def restart_from_center(self, leftwards):
68 | self.ballx.reset_current_position(0, 0)
69 | self.bally.reset_current_position(0, 0)
70 | self.timeout = self.TIMEOUT
71 | self.timeout_dir = (-1 if leftwards else +1)
72 | # self.bounce_n = 0
73 | self.ball_x, ball_vx = self.ballx.current_position()
74 | self.ball_y, ball_vy = self.bally.current_position()
75 |
76 | def global_step(self):
77 | self.frame += 1
78 |
79 | # if not self.multiplayer:
80 | # # Trainer
81 | # self.p1x.set_servo_target( self.trainer_x, 0.02, 0.02, 4 )
82 | # self.p1y.set_servo_target( self.trainer_y, 0.02, 0.02, 4 )
83 |
84 | Scene.global_step(self)
85 |
86 | self.ball_x, ball_vx = self.ballx.current_position()
87 | self.ball_y, ball_vy = self.bally.current_position()
88 |
89 | if np.abs(self.ball_y) > 1.0 and self.ball_y * ball_vy > 0:
90 | self.bally.reset_current_position(self.ball_y, -ball_vy)
91 |
92 | if ball_vx * self.timeout_dir < 0:
93 | # if self.timeout_dir < 0:
94 | # self.score_left += 0.00*np.abs(ball_vx) # hint for early learning: hit the ball!
95 | # else:
96 | # self.score_right += 0.00*np.abs(ball_vx)
97 | self.timeout_dir *= -1
98 | self.timeout = self.TIMEOUT
99 | self.bounce_n += 1
100 |
101 | def global_state(self):
102 | if self.frame == self.jstate_for_frame:
103 | return self.jstate
104 | self.jstate_for_frame = self.frame
105 | j = np.array(
106 | [j.current_relative_position() for j in [self.p0x, self.p0y, self.p1x, self.p1y, self.ballx, self.bally]]
107 | ).flatten()
108 | self.jstate = np.concatenate([j, [(self.timeout - self.TIMEOUT) / self.TIMEOUT]])
109 | return self.jstate
110 |
111 | def HUD(self, a, s):
112 | self.cpp_world.test_window_history_advance()
113 | self.cpp_world.test_window_observations(s.tolist())
114 | self.cpp_world.test_window_actions(a[:2].tolist())
115 | s = "%04i TIMEOUT%3i %0.2f:%0.2f" % (
116 | self.frame, self.timeout, self.score_left, self.score_right
117 | )
118 |
119 | def camera_adjust(self):
120 | "Looks like first 3 coordinates specify position of the camera and the last three the orientation."
121 | self.camera.move_and_look_at(0.1, -0.1, 1.9, 0.1, 0.2, 0)
122 |
123 |
124 | class RoboschoolHockeyJoint(gym.Env):
125 | VIDEO_W = 84
126 | VIDEO_H = 84
127 |
128 | def __init__(self):
129 | self.player_n = 0
130 | self.scene = None
131 | action_dim = 4
132 | # obs_dim = 13
133 | high = np.ones([action_dim])
134 | self.action_space = gym.spaces.Box(-high, high)
135 | self.observation_space = gym.spaces.Box(low=0, high=255,
136 | shape=(self.VIDEO_W, self.VIDEO_H, 3), dtype=np.uint8)
137 | self._seed()
138 |
139 | def create_single_player_scene(self):
140 | self.player_n = 0
141 | s = HockeyScene()
142 | s.np_random = self.np_random
143 | return s
144 |
145 | def _seed(self, seed=None):
146 | self.np_random, seed = gym.utils.seeding.np_random(seed)
147 | return [seed]
148 |
149 | def reset(self):
150 | if self.scene is None:
151 | self.scene = self.create_single_player_scene()
152 | self.scene.episode_restart()
153 | s = self.calc_state()
154 | self.score_reported = 0
155 | obs = self.render("rgb_array")
156 | return obs
157 |
158 | def calc_state(self):
159 | j = self.scene.global_state()
160 | if self.player_n == 1:
161 | # [ 0,1, 2,3, 4, 5, 6,7, 8,9,10,11,12]
162 | # [p0x,v,p0y,v, p1x,v,p1y,v, bx,v,by,v, T]
163 | signflip = np.array([-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1])
164 | reorder = np.array([4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12])
165 | j = (j * signflip)[reorder]
166 | return j
167 |
168 | def apply_action(self, a0, a1):
169 | assert (np.isfinite(a0).all())
170 | assert (np.isfinite(a1).all())
171 | a0 = np.clip(a0, -1, +1)
172 | a1 = np.clip(a1, -1, +1)
173 | self.scene.p0x.set_target_speed(3 * float(a0[0]), 0.05, 7)
174 | self.scene.p0y.set_target_speed(3 * float(a0[1]), 0.05, 7)
175 | self.scene.p1x.set_target_speed(-3 * float(a1[0]), 0.05, 7)
176 | self.scene.p1y.set_target_speed(3 * float(a1[1]), 0.05, 7)
177 |
178 | def step(self, a):
179 | a0 = a[:2]
180 | a1 = a[2:]
181 | self.apply_action(a0, a1)
182 | self.scene.global_step()
183 | state = self.calc_state()
184 | self.scene.HUD(a, state)
185 | new_score = self.scene.bounce_n
186 | # new_score = int(new_score)
187 | self.rewards = new_score - self.score_reported
188 | self.score_reported = new_score
189 | if (self.scene.score_left > 10) or (self.scene.score_right > 10):
190 | done = True
191 | else:
192 | done = False
193 | obs = self.render("rgb_array")
194 | return obs, self.rewards, done, {}
195 |
196 | def render(self, mode):
197 | if mode == "human":
198 | return self.scene.cpp_world.test_window()
199 | elif mode == "rgb_array":
200 | self.scene.camera_adjust()
201 | rgb, _, _, _, _ = self.scene.camera.render(False, False,
202 | False) # render_depth, render_labeling, print_timing)
203 | rendered_rgb = np.fromstring(rgb, dtype=np.uint8).reshape((self.VIDEO_H, self.VIDEO_W, 3))
204 | return rendered_rgb
205 | else:
206 | assert (0)
207 |
208 |
209 | class MultiDiscreteToUsual(gym.ActionWrapper):
210 | def __init__(self, env):
211 | gym.ActionWrapper.__init__(self, env)
212 |
213 | self._inp_act_size = self.env.action_space.nvec
214 | self.action_space = gym.spaces.Discrete(np.prod(self._inp_act_size))
215 |
216 | def action(self, a):
217 | vec = np.zeros(dtype=np.int8, shape=self._inp_act_size.shape)
218 | for i, n in enumerate(self._inp_act_size):
219 | vec[i] = a % n
220 |             a //= n
221 | return vec
222 |
223 |
224 | class DiscretizeActionWrapper(gym.ActionWrapper):
225 | def __init__(self, env=None, nsamples=11):
226 | super().__init__(env)
227 | assert isinstance(env.action_space, gym.spaces.Box)
228 | self._dist_to_cont = []
229 | for low, high in zip(env.action_space.low, env.action_space.high):
230 | self._dist_to_cont.append(np.linspace(low, high, nsamples))
231 | temp = [nsamples for _ in self._dist_to_cont]
232 | self.action_space = gym.spaces.MultiDiscrete(temp)
233 |
234 | def action(self, action):
235 | assert len(action) == len(self._dist_to_cont)
236 | return np.array([m[a] for a, m in zip(action, self._dist_to_cont)], dtype=np.float32)
237 |
--------------------------------------------------------------------------------
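In the spirit of the header comment, a minimal random-agent loop over RoboschoolHockeyJoint. This is a sketch only: it assumes roboschool is installed and this package is importable; the step budget is arbitrary.

    from roboenvs.joint_hockey import RoboschoolHockeyJoint

    env = RoboschoolHockeyJoint()
    obs = env.reset()                       # 84x84x3 uint8 render of the table
    for _ in range(200):
        a = env.action_space.sample()       # 4 continuous values in [-1, 1]
        obs, rew, done, info = env.step(a)  # rew is the change in bounce count
        if done:
            obs = env.reset()
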
/roboenvs/joint_pong.py:
--------------------------------------------------------------------------------
1 | # Make a basic version of pong, run it with a random agent.
2 |
3 | import os
4 | import sys
5 |
6 | import gym
7 | import gym.spaces
8 | import gym.utils
9 | import gym.utils.seeding
10 | import numpy as np
11 | import roboschool
12 | from roboschool.scene_abstract import Scene, cpp_household
13 |
14 |
15 | class PongScene(Scene):
16 | VIDEO_W = 84
17 | VIDEO_H = 84
18 | TIMEOUT = 300
19 |
20 | def __init__(self):
21 | Scene.__init__(self, gravity=9.8, timestep=0.0165 / 4, frame_skip=8)
22 | self.score_left = 0
23 | self.score_right = 0
24 |
25 | def actor_introduce(self, robot):
26 | i = robot.player_n - 1
27 |
28 | def episode_restart(self):
29 | Scene.episode_restart(self)
30 | if self.score_right + self.score_left > 0:
31 | sys.stdout.write("%i:%i " % (self.score_left, self.score_right))
32 | sys.stdout.flush()
33 | self.mjcf = self.cpp_world.load_mjcf(os.path.join(os.path.dirname(__file__), "roboschool_pong.xml"))
34 | dump = 0
35 | for r in self.mjcf:
36 | if dump: print("ROBOT '%s'" % r.root_part.name)
37 | for part in r.parts:
38 | if dump: print("\tPART '%s'" % part.name)
39 | # if part.name==self.robot_name:
40 | for j in r.joints:
41 | if j.name == "p0x": self.p0x = j
42 | if j.name == "p0y": self.p0y = j
43 | if j.name == "p1x": self.p1x = j
44 | if j.name == "p1y": self.p1y = j
45 | if j.name == "ballx": self.ballx = j
46 | if j.name == "bally": self.bally = j
47 | self.ballx.set_motor_torque(0.0)
48 | self.bally.set_motor_torque(0.0)
49 | for r in self.mjcf:
50 | r.query_position()
51 | fpose = cpp_household.Pose()
52 | fpose.set_xyz(0, 0, -0.04)
53 | self.field = self.cpp_world.load_thingy(
54 | os.path.join(os.path.dirname(roboschool.__file__), "models_outdoor/stadium/pong1.obj"),
55 | fpose, 1.0, 0, 0xFFFFFF, True)
56 | self.camera = self.cpp_world.new_camera_free_float(self.VIDEO_W, self.VIDEO_H, "video_camera")
57 | self.camera_itertia = 0
58 | self.frame = 0
59 | self.jstate_for_frame = -1
60 | self.score_left = 0
61 | self.score_right = 0
62 | self.bounce_n = 0
63 | self.restart_from_center(self.np_random.randint(2) == 0)
64 |
65 | def restart_from_center(self, leftwards):
66 | self.ballx.reset_current_position(0, self.np_random.uniform(low=2.0, high=2.5) * (-1 if leftwards else +1))
67 | self.bally.reset_current_position(self.np_random.uniform(low=-0.9, high=+0.9),
68 | self.np_random.uniform(low=-2, high=+2))
69 | self.timeout = self.TIMEOUT
70 | self.timeout_dir = (-1 if leftwards else +1)
71 | # self.bounce_n = 0
72 | self.trainer_x = self.np_random.uniform(low=-0.9, high=+0.9)
73 | self.trainer_y = self.np_random.uniform(low=-0.9, high=+0.9)
74 |
75 | self.ball_x, ball_vx = self.ballx.current_position()
76 | self.ball_y, ball_vy = self.bally.current_position()
77 |
78 | def global_step(self):
79 | self.frame += 1
80 |
81 | Scene.global_step(self)
82 |
83 | self.ball_x, ball_vx = self.ballx.current_position()
84 | self.ball_y, ball_vy = self.bally.current_position()
85 |
86 | if np.abs(self.ball_y) > 1.0 and self.ball_y * ball_vy > 0:
87 | self.bally.reset_current_position(self.ball_y, -ball_vy)
88 |
89 | if ball_vx * self.timeout_dir < 0:
90 | # if self.timeout_dir < 0:
91 | # self.score_left += 0.00*np.abs(ball_vx) # hint for early learning: hit the ball!
92 | # else:
93 | # self.score_right += 0.00*np.abs(ball_vx)
94 | self.timeout_dir *= -1
95 | # self.timeout = self.TIMEOUT
96 | self.bounce_n += 1
97 | # print("bounce", self.bounce_n)
98 | # else:
99 | # self.timeout -= 1
100 |
101 | if np.abs(self.ball_x) > 1.65 or self.timeout == 0:
102 | if self.timeout == 0:
103 | self.restart_from_center(ball_vx < 0) # send ball in same dir on timeout
104 | # if self.score_right + self.score_left >
105 | elif ball_vx > 0:
106 | self.score_left += 1
107 | self.restart_from_center(ball_vx > 0) # winning streak, let it hit more
108 | else:
109 | self.score_right += 1.0
110 | self.restart_from_center(ball_vx > 0)
111 | self.timeout = self.TIMEOUT
112 | else:
113 | self.timeout -= 1
114 |
115 | def global_state(self):
116 | if self.frame == self.jstate_for_frame:
117 | return self.jstate
118 | self.jstate_for_frame = self.frame
119 | j = np.array(
120 | [j.current_relative_position() for j in [self.p0x, self.p0y, self.p1x, self.p1y, self.ballx, self.bally]]
121 | ).flatten()
122 | self.jstate = np.concatenate([j, [(self.timeout - self.TIMEOUT) / self.TIMEOUT]])
123 | return self.jstate
124 |
125 | def HUD(self, a, s):
126 | self.cpp_world.test_window_history_advance()
127 | self.cpp_world.test_window_observations(s.tolist())
128 | self.cpp_world.test_window_actions(a[:2].tolist())
129 | s = "%04i TIMEOUT%3i %0.2f:%0.2f" % (
130 | self.frame, self.timeout, self.score_left, self.score_right
131 | )
132 |
133 | def camera_adjust(self):
134 | "Looks like first 3 coordinates specify position of the camera and the last three the orientation."
135 | # self.camera.move_and_look_at(0, -1.0, 1.5, self.camera_itertia, -0.1, 0)
136 | self.camera.move_and_look_at(0.1, -0.1, 1.9, 0.1, 0.2, 0)
137 |
138 |
139 | class RoboschoolPongJoint(gym.Env):
140 | VIDEO_W = 84
141 | VIDEO_H = 84
142 |
143 | def __init__(self):
144 | self.player_n = 0
145 | self.scene = None
146 | action_dim = 4
147 | # obs_dim = 13
148 | high = np.ones([action_dim])
149 | self.action_space = gym.spaces.Box(-high, high)
150 | self.observation_space = gym.spaces.Box(low=0, high=255,
151 | shape=(self.VIDEO_W, self.VIDEO_H, 3), dtype=np.uint8)
152 | self._seed()
153 |
154 | def create_single_player_scene(self):
155 | self.player_n = 0
156 | s = PongScene()
157 | s.np_random = self.np_random
158 | return s
159 |
160 | def _seed(self, seed=None):
161 | self.np_random, seed = gym.utils.seeding.np_random(seed)
162 | return [seed]
163 |
164 | def reset(self):
165 | if self.scene is None:
166 | self.scene = self.create_single_player_scene()
167 | self.scene.episode_restart()
168 | s = self.calc_state()
169 | self.score_reported = 0
170 | obs = self.render("rgb_array")
171 | return obs
172 |
173 | def calc_state(self):
174 | j = self.scene.global_state()
175 | if self.player_n == 1:
178 | # [ 0,1, 2,3, 4, 5, 6,7, 8,9,10,11,12]
179 | # [p0x,v,p0y,v, p1x,v,p1y,v, bx,v,by,v, T]
180 | signflip = np.array([-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1])
181 | reorder = np.array([4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12])
182 | j = (j * signflip)[reorder]
183 | return j
184 |
185 | def apply_action(self, a0, a1):
186 | assert (np.isfinite(a0).all())
187 | assert (np.isfinite(a1).all())
188 | a0 = np.clip(a0, -1, +1)
189 | a1 = np.clip(a1, -1, +1)
190 | self.scene.p0x.set_target_speed(3 * float(a0[0]), 0.05, 7)
191 | self.scene.p0y.set_target_speed(3 * float(a0[1]), 0.05, 7)
192 | self.scene.p1x.set_target_speed(-3 * float(a1[0]), 0.05, 7)
193 | self.scene.p1y.set_target_speed(3 * float(a1[1]), 0.05, 7)
194 |
195 | def step(self, a):
196 | a0 = a[:2]
197 | a1 = a[2:]
198 | self.apply_action(a0, a1)
199 | self.scene.global_step()
200 | state = self.calc_state()
201 | self.scene.HUD(a, state)
202 | new_score = self.scene.bounce_n
203 | # new_score = int(new_score)
204 | self.rewards = new_score - self.score_reported
205 | self.score_reported = new_score
206 | if (self.scene.score_left > 10) or (self.scene.score_right > 10):
207 | done = True
208 | else:
209 | done = False
210 | obs = self.render("rgb_array")
211 | # rew = sum(self.rewards)
212 | # if self.rewards > 0:
213 | # print("got reward", self.rewards)
214 | return obs, self.rewards, done, {}
215 |
216 | def render(self, mode):
217 | if mode == "human":
218 | return self.scene.cpp_world.test_window()
219 | elif mode == "rgb_array":
220 | self.scene.camera_adjust()
221 | rgb, _, _, _, _ = self.scene.camera.render(False, False,
222 | False) # render_depth, render_labeling, print_timing)
223 | rendered_rgb = np.fromstring(rgb, dtype=np.uint8).reshape((self.VIDEO_H, self.VIDEO_W, 3))
224 | return rendered_rgb
225 | else:
226 | assert (0)
227 |
228 |
229 | class MultiDiscreteToUsual(gym.ActionWrapper):
230 | def __init__(self, env):
231 | gym.ActionWrapper.__init__(self, env)
232 |
233 | self._inp_act_size = self.env.action_space.nvec
234 | self.action_space = gym.spaces.Discrete(np.prod(self._inp_act_size))
235 |
236 | def action(self, a):
237 | vec = np.zeros(dtype=np.int8, shape=self._inp_act_size.shape)
238 | for i, n in enumerate(self._inp_act_size):
239 | vec[i] = a % n
240 |             a //= n
241 | return vec
242 |
243 |
244 | class DiscretizeActionWrapper(gym.ActionWrapper):
245 | def __init__(self, env=None, nsamples=11):
246 | super().__init__(env)
247 | assert isinstance(env.action_space, gym.spaces.Box)
248 | self._dist_to_cont = []
249 | for low, high in zip(env.action_space.low, env.action_space.high):
250 | self._dist_to_cont.append(np.linspace(low, high, nsamples))
251 | temp = [nsamples for _ in self._dist_to_cont]
252 | self.action_space = gym.spaces.MultiDiscrete(temp)
253 |
254 | def action(self, action):
255 | assert len(action) == len(self._dist_to_cont)
256 | return np.array([m[a] for a, m in zip(action, self._dist_to_cont)], dtype=np.float32)
257 |
--------------------------------------------------------------------------------
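DiscretizeActionWrapper and MultiDiscreteToUsual (duplicated in both joint_*.py files) turn the 4-dimensional continuous Box action into a single Discrete index: each dimension is quantised onto nsamples grid points, and the flat index is decoded digit by digit in base nsamples. A standalone sketch of that round trip, with illustrative values and no roboschool dependency:

    import numpy as np

    nsamples = 2                                   # the value wrappers.py passes in
    low, high = -np.ones(4), np.ones(4)            # the Box bounds used above
    bins = [np.linspace(l, h, nsamples) for l, h in zip(low, high)]

    def decode(a, sizes):
        # mirrors MultiDiscreteToUsual.action: peel off one base-n digit per dim
        vec = np.zeros(len(sizes), dtype=np.int8)
        for i, n in enumerate(sizes):
            vec[i] = a % n
            a //= n
        return vec

    flat = 5                                       # an element of Discrete(2 ** 4)
    multi = decode(flat, [nsamples] * 4)           # -> [1, 0, 1, 0]
    cont = np.array([b[i] for i, b in zip(multi, bins)], dtype=np.float32)
    print(multi, cont)                             # cont -> [ 1. -1.  1. -1.]
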
/cppo_agent.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 | from baselines.common import explained_variance
6 | from baselines.common.mpi_moments import mpi_moments
7 | from baselines.common.running_mean_std import RunningMeanStd
8 | from mpi4py import MPI
9 |
10 | from mpi_utils import MpiAdamOptimizer
11 | from rollouts import Rollout
12 | from utils import bcast_tf_vars_from_root, get_mean_and_std
13 | from vec_env import ShmemVecEnv as VecEnv
14 |
15 | getsess = tf.get_default_session
16 |
17 |
18 | class PpoOptimizer(object):
19 | envs = None
20 |
21 | def __init__(self, *, scope, ob_space, ac_space, stochpol,
22 | ent_coef, gamma, lam, nepochs, lr, cliprange,
23 | nminibatches,
24 | normrew, normadv, use_news, ext_coeff, int_coeff,
25 | nsteps_per_seg, nsegs_per_env, dynamics):
26 | self.dynamics = dynamics
27 | with tf.variable_scope(scope):
28 | self.use_recorder = True
29 | self.n_updates = 0
30 | self.scope = scope
31 | self.ob_space = ob_space
32 | self.ac_space = ac_space
33 | self.stochpol = stochpol
34 | self.nepochs = nepochs
35 | self.lr = lr
36 | self.cliprange = cliprange
37 | self.nsteps_per_seg = nsteps_per_seg
38 | self.nsegs_per_env = nsegs_per_env
39 | self.nminibatches = nminibatches
40 | self.gamma = gamma
41 | self.lam = lam
42 | self.normrew = normrew
43 | self.normadv = normadv
44 | self.use_news = use_news
45 | self.ext_coeff = ext_coeff
46 | self.int_coeff = int_coeff
47 | self.ph_adv = tf.placeholder(tf.float32, [None, None])
48 | self.ph_ret = tf.placeholder(tf.float32, [None, None])
49 | self.ph_rews = tf.placeholder(tf.float32, [None, None])
50 | self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
51 | self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
52 | self.ph_lr = tf.placeholder(tf.float32, [])
53 | self.ph_cliprange = tf.placeholder(tf.float32, [])
54 | neglogpac = self.stochpol.pd.neglogp(self.stochpol.ph_ac)
55 | entropy = tf.reduce_mean(self.stochpol.pd.entropy())
56 | vpred = self.stochpol.vpred
57 |
58 | vf_loss = 0.5 * tf.reduce_mean((vpred - self.ph_ret) ** 2)
59 | ratio = tf.exp(self.ph_oldnlp - neglogpac) # p_new / p_old
60 | negadv = - self.ph_adv
61 | pg_losses1 = negadv * ratio
62 | pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
63 | pg_loss_surr = tf.maximum(pg_losses1, pg_losses2)
64 | pg_loss = tf.reduce_mean(pg_loss_surr)
65 | ent_loss = (- ent_coef) * entropy
66 | approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
67 | clipfrac = tf.reduce_mean(tf.to_float(tf.abs(pg_losses2 - pg_loss_surr) > 1e-6))
68 |
69 | self.total_loss = pg_loss + ent_loss + vf_loss
70 | self.to_report = {'tot': self.total_loss, 'pg': pg_loss, 'vf': vf_loss, 'ent': entropy,
71 | 'approxkl': approxkl, 'clipfrac': clipfrac}
72 |
73 | def start_interaction(self, env_fns, dynamics, nlump=2):
74 | self.loss_names, self._losses = zip(*list(self.to_report.items()))
75 |
76 | params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
77 | if MPI.COMM_WORLD.Get_size() > 1:
78 | trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
79 | else:
80 | trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
81 | gradsandvars = trainer.compute_gradients(self.total_loss, params)
82 | self._train = trainer.apply_gradients(gradsandvars)
83 |
84 | if MPI.COMM_WORLD.Get_rank() == 0:
85 | getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
86 | bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
87 |
88 | self.all_visited_rooms = []
89 | self.all_scores = []
90 | self.nenvs = nenvs = len(env_fns)
91 | self.nlump = nlump
92 | self.lump_stride = nenvs // self.nlump
93 | self.envs = [
94 | VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
95 | l in range(self.nlump)]
96 |
97 | self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
98 | nsteps_per_seg=self.nsteps_per_seg,
99 | nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
100 | envs=self.envs,
101 | policy=self.stochpol,
102 | int_rew_coeff=self.int_coeff,
103 | ext_rew_coeff=self.ext_coeff,
104 | record_rollouts=self.use_recorder,
105 | dynamics=dynamics)
106 |
107 | self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
108 | self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
109 |
110 | if self.normrew:
111 | self.rff = RewardForwardFilter(self.gamma)
112 | self.rff_rms = RunningMeanStd()
113 |
114 | self.step_count = 0
115 | self.t_last_update = time.time()
116 | self.t_start = time.time()
117 |
118 | def stop_interaction(self):
119 | for env in self.envs:
120 | env.close()
121 |
122 | def calculate_advantages(self, rews, use_news, gamma, lam):
123 | nsteps = self.rollout.nsteps
124 | lastgaelam = 0
125 |         for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
126 | nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
127 | if not use_news:
128 | nextnew = 0
129 | nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
130 | nextnotnew = 1 - nextnew
131 | delta = rews[:, t] + gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
132 | self.buf_advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
133 | self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds
134 |
135 | def update(self):
136 | if self.normrew:
137 | rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
138 | rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
139 | self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
140 | rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
141 | else:
142 | rews = np.copy(self.rollout.buf_rews)
143 | self.calculate_advantages(rews=rews, use_news=self.use_news, gamma=self.gamma, lam=self.lam)
144 |
145 | info = dict(
146 | advmean=self.buf_advs.mean(),
147 | advstd=self.buf_advs.std(),
148 | retmean=self.buf_rets.mean(),
149 | retstd=self.buf_rets.std(),
150 | vpredmean=self.rollout.buf_vpreds.mean(),
151 | vpredstd=self.rollout.buf_vpreds.std(),
152 | ev=explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel()),
153 | rew_mean=np.mean(self.rollout.buf_rews),
154 | recent_best_ext_ret=self.rollout.current_max
155 | )
156 | if self.rollout.best_ext_ret is not None:
157 | info['best_ext_ret'] = self.rollout.best_ext_ret
158 |
159 | # normalize advantages
160 | if self.normadv:
161 | m, s = get_mean_and_std(self.buf_advs)
162 | self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
163 | envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
164 | envsperbatch = max(1, envsperbatch)
165 | envinds = np.arange(self.nenvs * self.nsegs_per_env)
166 |
167 | def resh(x):
168 | if self.nsegs_per_env == 1:
169 | return x
170 | sh = x.shape
171 | return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])
172 |
173 | ph_buf = [
174 | (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
175 | (self.ph_rews, resh(self.rollout.buf_rews)),
176 | (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
177 | (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
178 | (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
179 | (self.ph_ret, resh(self.buf_rets)),
180 | (self.ph_adv, resh(self.buf_advs)),
181 | ]
182 | ph_buf.extend([
183 | (self.dynamics.last_ob,
184 | self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
185 | ])
186 | mblossvals = []
187 |
188 | for _ in range(self.nepochs):
189 | np.random.shuffle(envinds)
190 | for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
191 | end = start + envsperbatch
192 | mbenvinds = envinds[start:end]
193 | fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
194 | fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
195 | mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])
196 |
197 | mblossvals = [mblossvals[0]]
198 | info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
199 | info["rank"] = MPI.COMM_WORLD.Get_rank()
200 | self.n_updates += 1
201 | info["n_updates"] = self.n_updates
202 | info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0) for (dn, dvs) in self.rollout.statlists.items()})
203 | info.update(self.rollout.stats)
204 | if "states_visited" in info:
205 | info.pop("states_visited")
206 | tnow = time.time()
207 | info["ups"] = 1. / (tnow - self.t_last_update)
208 | info["total_secs"] = tnow - self.t_start
209 | info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
210 | self.t_last_update = tnow
211 |
212 | return info
213 |
214 | def step(self):
215 | self.rollout.collect_rollout()
216 | update_info = self.update()
217 | return {'update': update_info}
218 |
219 | def get_var_values(self):
220 | return self.stochpol.get_var_values()
221 |
222 | def set_var_values(self, vv):
223 | self.stochpol.set_var_values(vv)
224 |
225 |
226 | class RewardForwardFilter(object):
227 | def __init__(self, gamma):
228 | self.rewems = None
229 | self.gamma = gamma
230 |
231 | def update(self, rews):
232 | if self.rewems is None:
233 | self.rewems = rews
234 | else:
235 | self.rewems = self.rewems * self.gamma + rews
236 | return self.rewems
237 |
--------------------------------------------------------------------------------
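A standalone sketch of the advantage computation in calculate_advantages above: generalized advantage estimation run backwards over the rollout, with the "news" (episode-boundary) flags ignored as in the use_news=0 case. Shapes and values are made up for illustration; no TensorFlow or MPI is needed to run it.

    import numpy as np

    nenvs, nsteps, gamma, lam = 4, 8, 0.99, 0.95
    rews = np.random.randn(nenvs, nsteps).astype(np.float32)
    vpreds = np.random.randn(nenvs, nsteps).astype(np.float32)
    vpred_last = np.random.randn(nenvs).astype(np.float32)

    advs = np.zeros((nenvs, nsteps), np.float32)
    lastgaelam = 0
    for t in range(nsteps - 1, -1, -1):
        nextvals = vpreds[:, t + 1] if t + 1 < nsteps else vpred_last
        nextnotnew = 1.0                 # use_news=0: treat the rollout as unbroken
        delta = rews[:, t] + gamma * nextvals * nextnotnew - vpreds[:, t]
        advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
    rets = advs + vpreds                 # regression targets for the value head
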
/wrappers.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from collections import deque
3 | from copy import copy
4 |
5 | import gym
6 | import numpy as np
7 | from PIL import Image
8 |
9 |
10 | def unwrap(env):
11 | if hasattr(env, "unwrapped"):
12 | return env.unwrapped
13 | elif hasattr(env, "env"):
14 | return unwrap(env.env)
15 | elif hasattr(env, "leg_env"):
16 | return unwrap(env.leg_env)
17 | else:
18 | return env
19 |
20 |
21 | class MaxAndSkipEnv(gym.Wrapper):
22 | def __init__(self, env, skip=4):
23 | """Return only every `skip`-th frame"""
24 | gym.Wrapper.__init__(self, env)
25 | # most recent raw observations (for max pooling across time steps)
26 | self._obs_buffer = deque(maxlen=2)
27 | self._skip = skip
28 |
29 | def step(self, action):
30 | """Repeat action, sum reward, and max over last observations."""
31 | total_reward = 0.0
32 | done = None
33 | acc_info = {}
34 | for _ in range(self._skip):
35 | obs, reward, done, info = self.env.step(action)
36 | acc_info.update(info)
37 | self._obs_buffer.append(obs)
38 | total_reward += reward
39 | if done:
40 | break
41 | max_frame = np.max(np.stack(self._obs_buffer), axis=0)
42 |
43 | return max_frame, total_reward, done, acc_info
44 |
45 | def reset(self):
46 | """Clear past frame buffer and init. to first obs. from inner env."""
47 | self._obs_buffer.clear()
48 | obs = self.env.reset()
49 | self._obs_buffer.append(obs)
50 | return obs
51 |
52 |
53 | class ProcessFrame84(gym.ObservationWrapper):
54 | def __init__(self, env, crop=True):
55 | self.crop = crop
56 | super(ProcessFrame84, self).__init__(env)
57 | self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
58 |
59 | def observation(self, obs):
60 | return ProcessFrame84.process(obs, crop=self.crop)
61 |
62 | @staticmethod
63 | def process(frame, crop=True):
64 | if frame.size == 210 * 160 * 3:
65 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
66 | elif frame.size == 250 * 160 * 3:
67 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
68 | elif frame.size == 224 * 240 * 3: # mario resolution
69 | img = np.reshape(frame, [224, 240, 3]).astype(np.float32)
70 | else:
71 | assert False, "Unknown resolution." + str(frame.size)
72 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
73 | size = (84, 110 if crop else 84)
74 | resized_screen = np.array(Image.fromarray(img).resize(size,
75 | resample=Image.BILINEAR), dtype=np.uint8)
76 | x_t = resized_screen[18:102, :] if crop else resized_screen
77 | x_t = np.reshape(x_t, [84, 84, 1])
78 | return x_t.astype(np.uint8)
79 |
80 |
81 | class ExtraTimeLimit(gym.Wrapper):
82 | def __init__(self, env, max_episode_steps=None):
83 | gym.Wrapper.__init__(self, env)
84 | self._max_episode_steps = max_episode_steps
85 | self._elapsed_steps = 0
86 |
87 | def step(self, action):
88 | observation, reward, done, info = self.env.step(action)
89 | self._elapsed_steps += 1
90 |         if self._max_episode_steps is not None and self._elapsed_steps > self._max_episode_steps:
91 | done = True
92 | return observation, reward, done, info
93 |
94 | def reset(self):
95 | self._elapsed_steps = 0
96 | return self.env.reset()
97 |
98 |
99 | class AddRandomStateToInfo(gym.Wrapper):
100 | def __init__(self, env):
101 | """Adds the random state to the info field on the first step after reset
102 | """
103 | gym.Wrapper.__init__(self, env)
104 |
105 | def step(self, action):
106 | ob, r, d, info = self.env.step(action)
107 | if self.random_state_copy is not None:
108 | info['random_state'] = self.random_state_copy
109 | self.random_state_copy = None
110 | return ob, r, d, info
111 |
112 | def reset(self, **kwargs):
113 | """ Do no-op action for a number of steps in [1, noop_max]."""
114 | self.random_state_copy = copy(self.unwrapped.np_random)
115 | return self.env.reset(**kwargs)
116 |
117 |
118 | class MontezumaInfoWrapper(gym.Wrapper):
119 | ram_map = {
120 | "room": dict(
121 | index=3,
122 | ),
123 | "x": dict(
124 | index=42,
125 | ),
126 | "y": dict(
127 | index=43,
128 | ),
129 | }
130 |
131 | def __init__(self, env):
132 | super(MontezumaInfoWrapper, self).__init__(env)
133 | self.visited = set()
134 | self.visited_rooms = set()
135 |
136 | def step(self, action):
137 | obs, rew, done, info = self.env.step(action)
138 | ram_state = unwrap(self.env).ale.getRAM()
139 | for name, properties in MontezumaInfoWrapper.ram_map.items():
140 | info[name] = ram_state[properties['index']]
141 | pos = (info['x'], info['y'], info['room'])
142 | self.visited.add(pos)
143 | self.visited_rooms.add(info["room"])
144 | if done:
145 | info['mz_episode'] = dict(pos_count=len(self.visited),
146 | visited_rooms=copy(self.visited_rooms))
147 | self.visited.clear()
148 | self.visited_rooms.clear()
149 | return obs, rew, done, info
150 |
151 | def reset(self):
152 | return self.env.reset()
153 |
154 |
155 | class MarioXReward(gym.Wrapper):
156 | def __init__(self, env):
157 | gym.Wrapper.__init__(self, env)
158 | self.current_level = [0, 0]
159 | self.visited_levels = set()
160 | self.visited_levels.add(tuple(self.current_level))
161 | self.current_max_x = 0.
162 |
163 | def reset(self):
164 | ob = self.env.reset()
165 | self.current_level = [0, 0]
166 | self.visited_levels = set()
167 | self.visited_levels.add(tuple(self.current_level))
168 | self.current_max_x = 0.
169 | return ob
170 |
171 | def step(self, action):
172 | ob, reward, done, info = self.env.step(action)
173 | levellow, levelhigh, xscrollHi, xscrollLo = \
174 | info["levelLo"], info["levelHi"], info["xscrollHi"], info["xscrollLo"]
175 | currentx = xscrollHi * 256 + xscrollLo
176 | new_level = [levellow, levelhigh]
177 | if new_level != self.current_level:
178 | self.current_level = new_level
179 | self.current_max_x = 0.
180 | reward = 0.
181 | self.visited_levels.add(tuple(self.current_level))
182 | else:
183 | if currentx > self.current_max_x:
184 | delta = currentx - self.current_max_x
185 | self.current_max_x = currentx
186 | reward = delta
187 | else:
188 | reward = 0.
189 | if done:
190 | info["levels"] = copy(self.visited_levels)
191 | info["retro_episode"] = dict(levels=copy(self.visited_levels))
192 | return ob, reward, done, info
193 |
194 |
195 | class LimitedDiscreteActions(gym.ActionWrapper):
196 | KNOWN_BUTTONS = {"A", "B"}
197 | KNOWN_SHOULDERS = {"L", "R"}
198 |
199 | '''
200 | Reproduces the action space from curiosity paper.
201 | '''
202 |
203 | def __init__(self, env, all_buttons, whitelist=KNOWN_BUTTONS | KNOWN_SHOULDERS):
204 | gym.ActionWrapper.__init__(self, env)
205 |
206 | self._num_buttons = len(all_buttons)
207 | button_keys = {i for i in range(len(all_buttons)) if all_buttons[i] in whitelist & self.KNOWN_BUTTONS}
208 | buttons = [(), *zip(button_keys), *itertools.combinations(button_keys, 2)]
209 | shoulder_keys = {i for i in range(len(all_buttons)) if all_buttons[i] in whitelist & self.KNOWN_SHOULDERS}
210 | shoulders = [(), *zip(shoulder_keys), *itertools.permutations(shoulder_keys, 2)]
211 | arrows = [(), (4,), (5,), (6,), (7,)] # (), up, down, left, right
212 | acts = []
213 | acts += arrows
214 | acts += buttons[1:]
215 | acts += [a + b for a in arrows[-2:] for b in buttons[1:]]
216 | self._actions = acts
217 | self.action_space = gym.spaces.Discrete(len(self._actions))
218 |
219 | def action(self, a):
220 | mask = np.zeros(self._num_buttons)
221 | for i in self._actions[a]:
222 | mask[i] = 1
223 | return mask
224 |
225 |
226 | class FrameSkip(gym.Wrapper):
227 | def __init__(self, env, n):
228 | gym.Wrapper.__init__(self, env)
229 | self.n = n
230 |
231 | def step(self, action):
232 | done = False
233 | totrew = 0
234 | for _ in range(self.n):
235 | ob, rew, done, info = self.env.step(action)
236 | totrew += rew
237 | if done: break
238 | return ob, totrew, done, info
239 |
240 |
241 | def make_mario_env(crop=True, frame_stack=True, clip_rewards=False):
242 | assert clip_rewards is False
243 | import gym
244 | import retro
245 | from baselines.common.atari_wrappers import FrameStack
246 |
247 | gym.undo_logger_setup()
248 | env = retro.make('SuperMarioBros-Nes', 'Level1-1')
249 | buttons = env.BUTTONS
250 | env = MarioXReward(env)
251 | env = FrameSkip(env, 4)
252 | env = ProcessFrame84(env, crop=crop)
253 | if frame_stack:
254 | env = FrameStack(env, 4)
255 | env = LimitedDiscreteActions(env, buttons)
256 | return env
257 |
258 |
259 | class OneChannel(gym.ObservationWrapper):
260 | def __init__(self, env, crop=True):
261 | self.crop = crop
262 | super(OneChannel, self).__init__(env)
263 | assert env.observation_space.dtype == np.uint8
264 | self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
265 |
266 | def observation(self, obs):
267 | return obs[:, :, 2:3]
268 |
269 |
270 | class RetroALEActions(gym.ActionWrapper):
271 | def __init__(self, env, all_buttons, n_players=1):
272 | gym.ActionWrapper.__init__(self, env)
273 | self.n_players = n_players
274 | self._num_buttons = len(all_buttons)
275 | bs = [-1, 0, 4, 5, 6, 7]
276 | actions = []
277 |
278 | def update_actions(old_actions, offset=0):
279 | actions = []
280 | for b in old_actions:
281 | for button in bs:
282 | action = []
283 | action.extend(b)
284 | if button != -1:
285 | action.append(button + offset)
286 | actions.append(action)
287 | return actions
288 |
289 | current_actions = [[]]
290 | for i in range(self.n_players):
291 | current_actions = update_actions(current_actions, i * self._num_buttons)
292 | self._actions = current_actions
293 | self.action_space = gym.spaces.Discrete(len(self._actions))
294 |
295 | def action(self, a):
296 | mask = np.zeros(self._num_buttons * self.n_players)
297 | for i in self._actions[a]:
298 | mask[i] = 1
299 | return mask
300 |
301 |
302 | class NoReward(gym.Wrapper):
303 | def __init__(self, env):
304 | gym.Wrapper.__init__(self, env)
305 |
306 | def step(self, action):
307 | ob, rew, done, info = self.env.step(action)
308 | return ob, 0.0, done, info
309 |
310 |
311 | def make_multi_pong(frame_stack=True):
312 | import gym
313 | import retro
314 | from baselines.common.atari_wrappers import FrameStack
315 | gym.undo_logger_setup()
316 | game_env = env = retro.make('Pong-Atari2600', players=2)
317 | env = RetroALEActions(env, game_env.BUTTONS, n_players=2)
318 | env = NoReward(env)
319 | env = FrameSkip(env, 4)
320 | env = ProcessFrame84(env, crop=False)
321 | if frame_stack:
322 | env = FrameStack(env, 4)
323 |
324 | return env
325 |
326 |
327 | def make_robo_pong(frame_stack=True):
328 | from baselines.common.atari_wrappers import FrameStack
329 | import roboenvs as robo
330 |
331 | env = robo.make_robopong()
332 | env = robo.DiscretizeActionWrapper(env, 2)
333 | env = robo.MultiDiscreteToUsual(env)
334 | env = OneChannel(env)
335 | if frame_stack:
336 | env = FrameStack(env, 4)
337 |
338 | env = AddRandomStateToInfo(env)
339 | return env
340 |
341 |
342 | def make_robo_hockey(frame_stack=True):
343 | from baselines.common.atari_wrappers import FrameStack
344 | import roboenvs as robo
345 |
346 | env = robo.make_robohockey()
347 | env = robo.DiscretizeActionWrapper(env, 2)
348 | env = robo.MultiDiscreteToUsual(env)
349 | env = OneChannel(env)
350 | if frame_stack:
351 | env = FrameStack(env, 4)
352 | env = AddRandomStateToInfo(env)
353 | return env
354 |
--------------------------------------------------------------------------------
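For reference, a hedged sketch of composing the Atari-side wrappers above into a Montezuma pipeline. The canonical composition lives in run.py and may differ in ordering and extra wrappers (e.g. noop resets); treat this as illustrative only.

    import gym
    from baselines.common.atari_wrappers import FrameStack

    # hypothetical import, assuming this file is importable as `wrappers`
    from wrappers import (MaxAndSkipEnv, ProcessFrame84, ExtraTimeLimit,
                          MontezumaInfoWrapper, AddRandomStateToInfo)

    env = gym.make('MontezumaRevengeNoFrameskip-v4')
    env = MaxAndSkipEnv(env, skip=4)         # frame-skip with max-pooling
    env = ProcessFrame84(env, crop=False)    # grayscale, 84x84
    env = FrameStack(env, 4)                 # stack the last 4 frames
    env = ExtraTimeLimit(env, 4500)          # hard cap on episode length
    env = MontezumaInfoWrapper(env)          # room / (x, y) coverage stats
    env = AddRandomStateToInfo(env)          # record RNG state in the first step's info
    obs = env.reset()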