├── .gitignore ├── LICENSE ├── README.md ├── baselines ├── __init__.py ├── common │ ├── __init__.py │ ├── atari_wrappers.py │ ├── cg.py │ ├── cmd_util.py │ ├── console_util.py │ ├── dataset.py │ ├── distributions.py │ ├── filters.py │ ├── math_util.py │ ├── misc_util.py │ ├── mpi_adam.py │ ├── mpi_fork.py │ ├── mpi_moments.py │ ├── mpi_running_mean_std.py │ ├── runners.py │ ├── running_mean_std.py │ ├── running_stat.py │ ├── schedules.py │ ├── segment_tree.py │ ├── tests │ │ ├── test_schedules.py │ │ ├── test_segment_tree.py │ │ └── test_tf_util.py │ ├── tf_util.py │ └── vec_env │ │ ├── __init__.py │ │ ├── dummy_vec_env.py │ │ ├── subproc_vec_env.py │ │ ├── vec_frame_stack.py │ │ └── vec_normalize.py ├── her │ ├── __init__.py │ ├── actor_critic.py │ ├── ddpg.py │ ├── experiment │ │ ├── __init__.py │ │ ├── config.py │ │ ├── play.py │ │ └── train.py │ ├── her.py │ ├── normalizer.py │ ├── replay_buffer.py │ ├── rollout.py │ └── util.py ├── logger.py └── results_plotter.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | *.txt 9 | 10 | # Setuptools distribution and build folders. 11 | /dist/ 12 | /build 13 | keys/ 14 | 15 | # Virtualenv 16 | /env 17 | 18 | 19 | *.sublime-project 20 | *.sublime-workspace 21 | 22 | *log/ 23 | *logs/ 24 | 25 | .idea 26 | 27 | logs/ 28 | logs_backup/ 29 | 30 | .ipynb_checkpoints 31 | ghostdriver.log 32 | 33 | htmlcov 34 | 35 | junk 36 | src 37 | 38 | *.egg-info 39 | .cache 40 | 41 | MUJOCO_LOG.TXT 42 | 43 | openai.sublime-project 44 | openai.sublime-workspace 45 | 46 | train/ 47 | td-error.txt 48 | 49 | model/ 50 | Baseline/__pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | __author__ = "Rui Zhao" 2 | __copyright__ = "Siemens AG, 2018" 3 | __licencse__ = "MIT" 4 | __version__ = "0.1" 5 | 6 | MIT License 7 | Copyright (c) 2018 Siemens AG 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Energy-Based Hindsight Experience Prioritization 2 | 3 | Here is the code for our paper "Energy-Based Hindsight Experience Prioritization". 
4 | 5 | The paper was published at the 2018 Conference on Robot Learning (CoRL 2018) as an oral presentation (7%). 6 | 7 | The paper is available in the Proceedings of Machine Learning Research: http://proceedings.mlr.press/v87/zhao18a.html 8 | 9 | The code was developed by Rui Zhao (Siemens AG & Ludwig Maximilian University of Munich). 10 | 11 | For details on Energy-Based Hindsight Experience Prioritization (EBP), please read the published paper. 12 | 13 | The code is based on OpenAI Baselines (link: https://github.com/openai/baselines). 14 | 15 | ## Prerequisites 16 | 17 | The code requires python3 (>=3.5) with the development headers. You'll also need the system packages CMake, OpenMPI and zlib. These can be installed as follows: 18 | 19 | ### Usage 20 | 21 | ```bash 22 | sudo apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev 23 | ``` 24 | 25 | To run the code, you need to install OpenAI Gym (link: https://github.com/openai/gym). 26 | We use the robotics environments in OpenAI Gym, which require the MuJoCo physics engine (link: http://www.mujoco.org/). 27 | 28 | The experiments were carried out on a 20-CPU server. 29 | We use 19 CPUs for training. 30 | If you are running the experiments on a laptop, please configure a smaller number of CPUs. 31 | Note that with fewer CPUs, performance will be affected. 32 | 33 | After installing the dependencies, you can reproduce the experimental results by running the following commands: 34 | ``` 35 | python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization none --n_epochs 50 --num_cpu 19 36 | python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization tderror --n_epochs 50 --num_cpu 19 37 | python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization energy --clip_energy 0.5 --n_epochs 50 --num_cpu 19 38 | ``` 39 | For FetchPickAndPlace-v0, we use a clip_energy parameter of 0.5. 40 | For the other three hand environments, we use clip_energy 2.5.
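If you run on a machine with fewer cores, the same commands apply with a smaller --num_cpu value; a minimal sketch (the CPU count here is only an example, adjust it to your hardware):
```
python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization energy --clip_energy 0.5 --n_epochs 50 --num_cpu 2
```
The hand-environment experiments below follow the same pattern.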
41 | 42 | ``` 43 | python baselines/her/experiment/train.py --env_name HandManipulateEggFull-v0 --prioritization none --n_epochs 200 --num_cpu 19 44 | python baselines/her/experiment/train.py --env_name HandManipulateEggFull-v0 --prioritization tderror --n_epochs 200 --num_cpu 19 45 | python baselines/her/experiment/train.py --env_name HandManipulateEggFull-v0 --prioritization energy --clip_energy 2.5 --n_epochs 200 --num_cpu 19 46 | ``` 47 | 48 | To test the learned policies, you can run the command: 49 | ``` 50 | python baselines/her/experiment/play.py /path/to/an/experiment/policy_latest.pkl 51 | ``` 52 | 53 | ## Citation: 54 | 55 | Citation of the arXiv version: 56 | 57 | ``` 58 | @article{zhao2018energy, 59 | title={Energy-Based Hindsight Experience Prioritization}, 60 | author={Zhao, Rui and Tresp, Volker}, 61 | journal={arXiv preprint arXiv:1810.01363}, 62 | year={2018} 63 | } 64 | ``` 65 | 66 | ## Licence: 67 | 68 | MIT 69 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruizhaogit/EnergyBasedPrioritization/2fd2f5bab0547848f4f76b837d16238435518dcc/baselines/__init__.py -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /baselines/common/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | from gym import spaces 5 | import cv2 6 | cv2.ocl.setUseOpenCL(False) 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 
12 | """ 13 | gym.Wrapper.__init__(self, env) 14 | self.noop_max = noop_max 15 | self.override_num_noops = None 16 | self.noop_action = 0 17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 18 | 19 | def reset(self, **kwargs): 20 | """ Do no-op action for a number of steps in [1, noop_max].""" 21 | self.env.reset(**kwargs) 22 | if self.override_num_noops is not None: 23 | noops = self.override_num_noops 24 | else: 25 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 26 | assert noops > 0 27 | obs = None 28 | for _ in range(noops): 29 | obs, _, done, _ = self.env.step(self.noop_action) 30 | if done: 31 | obs = self.env.reset(**kwargs) 32 | return obs 33 | 34 | def step(self, ac): 35 | return self.env.step(ac) 36 | 37 | class FireResetEnv(gym.Wrapper): 38 | def __init__(self, env): 39 | """Take action on reset for environments that are fixed until firing.""" 40 | gym.Wrapper.__init__(self, env) 41 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 42 | assert len(env.unwrapped.get_action_meanings()) >= 3 43 | 44 | def reset(self, **kwargs): 45 | self.env.reset(**kwargs) 46 | obs, _, done, _ = self.env.step(1) 47 | if done: 48 | self.env.reset(**kwargs) 49 | obs, _, done, _ = self.env.step(2) 50 | if done: 51 | self.env.reset(**kwargs) 52 | return obs 53 | 54 | def step(self, ac): 55 | return self.env.step(ac) 56 | 57 | class EpisodicLifeEnv(gym.Wrapper): 58 | def __init__(self, env): 59 | """Make end-of-life == end-of-episode, but only reset on true game over. 60 | Done by DeepMind for the DQN and co. since it helps value estimation. 61 | """ 62 | gym.Wrapper.__init__(self, env) 63 | self.lives = 0 64 | self.was_real_done = True 65 | 66 | def step(self, action): 67 | obs, reward, done, info = self.env.step(action) 68 | self.was_real_done = done 69 | # check current lives, make loss of life terminal, 70 | # then update lives to handle bonus lives 71 | lives = self.env.unwrapped.ale.lives() 72 | if lives < self.lives and lives > 0: 73 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 74 | # so its important to keep lives > 0, so that we only reset once 75 | # the environment advertises done. 76 | done = True 77 | self.lives = lives 78 | return obs, reward, done, info 79 | 80 | def reset(self, **kwargs): 81 | """Reset only when lives are exhausted. 82 | This way all states are still reachable even though lives are episodic, 83 | and the learner need not know about any of this behind-the-scenes. 
84 | """ 85 | if self.was_real_done: 86 | obs = self.env.reset(**kwargs) 87 | else: 88 | # no-op step to advance from terminal/lost life state 89 | obs, _, _, _ = self.env.step(0) 90 | self.lives = self.env.unwrapped.ale.lives() 91 | return obs 92 | 93 | class MaxAndSkipEnv(gym.Wrapper): 94 | def __init__(self, env, skip=4): 95 | """Return only every `skip`-th frame""" 96 | gym.Wrapper.__init__(self, env) 97 | # most recent raw observations (for max pooling across time steps) 98 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 99 | self._skip = skip 100 | 101 | def step(self, action): 102 | """Repeat action, sum reward, and max over last observations.""" 103 | total_reward = 0.0 104 | done = None 105 | for i in range(self._skip): 106 | obs, reward, done, info = self.env.step(action) 107 | if i == self._skip - 2: self._obs_buffer[0] = obs 108 | if i == self._skip - 1: self._obs_buffer[1] = obs 109 | total_reward += reward 110 | if done: 111 | break 112 | # Note that the observation on the done=True frame 113 | # doesn't matter 114 | max_frame = self._obs_buffer.max(axis=0) 115 | 116 | return max_frame, total_reward, done, info 117 | 118 | def reset(self, **kwargs): 119 | return self.env.reset(**kwargs) 120 | 121 | class ClipRewardEnv(gym.RewardWrapper): 122 | def __init__(self, env): 123 | gym.RewardWrapper.__init__(self, env) 124 | 125 | def reward(self, reward): 126 | """Bin reward to {+1, 0, -1} by its sign.""" 127 | return np.sign(reward) 128 | 129 | class WarpFrame(gym.ObservationWrapper): 130 | def __init__(self, env): 131 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 132 | gym.ObservationWrapper.__init__(self, env) 133 | self.width = 84 134 | self.height = 84 135 | self.observation_space = spaces.Box(low=0, high=255, 136 | shape=(self.height, self.width, 1), dtype=np.uint8) 137 | 138 | def observation(self, frame): 139 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 140 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 141 | return frame[:, :, None] 142 | 143 | class FrameStack(gym.Wrapper): 144 | def __init__(self, env, k): 145 | """Stack k last frames. 146 | 147 | Returns lazy array, which is much more memory efficient. 148 | 149 | See Also 150 | -------- 151 | baselines.common.atari_wrappers.LazyFrames 152 | """ 153 | gym.Wrapper.__init__(self, env) 154 | self.k = k 155 | self.frames = deque([], maxlen=k) 156 | shp = env.observation_space.shape 157 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 158 | 159 | def reset(self): 160 | ob = self.env.reset() 161 | for _ in range(self.k): 162 | self.frames.append(ob) 163 | return self._get_ob() 164 | 165 | def step(self, action): 166 | ob, reward, done, info = self.env.step(action) 167 | self.frames.append(ob) 168 | return self._get_ob(), reward, done, info 169 | 170 | def _get_ob(self): 171 | assert len(self.frames) == self.k 172 | return LazyFrames(list(self.frames)) 173 | 174 | class ScaledFloatFrame(gym.ObservationWrapper): 175 | def __init__(self, env): 176 | gym.ObservationWrapper.__init__(self, env) 177 | 178 | def observation(self, observation): 179 | # careful! This undoes the memory optimization, use 180 | # with smaller replay buffers only. 181 | return np.array(observation).astype(np.float32) / 255.0 182 | 183 | class LazyFrames(object): 184 | def __init__(self, frames): 185 | """This object ensures that common frames between the observations are only stored once. 
186 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 187 | buffers. 188 | 189 | This object should only be converted to numpy array before being passed to the model. 190 | 191 | You'd not believe how complex the previous solution was.""" 192 | self._frames = frames 193 | self._out = None 194 | 195 | def _force(self): 196 | if self._out is None: 197 | self._out = np.concatenate(self._frames, axis=2) 198 | self._frames = None 199 | return self._out 200 | 201 | def __array__(self, dtype=None): 202 | out = self._force() 203 | if dtype is not None: 204 | out = out.astype(dtype) 205 | return out 206 | 207 | def __len__(self): 208 | return len(self._force()) 209 | 210 | def __getitem__(self, i): 211 | return self._force()[i] 212 | 213 | def make_atari(env_id): 214 | env = gym.make(env_id) 215 | assert 'NoFrameskip' in env.spec.id 216 | env = NoopResetEnv(env, noop_max=30) 217 | env = MaxAndSkipEnv(env, skip=4) 218 | return env 219 | 220 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 221 | """Configure environment for DeepMind-style Atari. 222 | """ 223 | if episode_life: 224 | env = EpisodicLifeEnv(env) 225 | if 'FIRE' in env.unwrapped.get_action_meanings(): 226 | env = FireResetEnv(env) 227 | env = WarpFrame(env) 228 | if scale: 229 | env = ScaledFloatFrame(env) 230 | if clip_rewards: 231 | env = ClipRewardEnv(env) 232 | if frame_stack: 233 | env = FrameStack(env, 4) 234 | return env 235 | 236 | -------------------------------------------------------------------------------- /baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /baselines/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for scripts like run_atari.py. 3 | """ 4 | 5 | import os 6 | import gym 7 | from gym.wrappers import FlattenDictWrapper 8 | from baselines import logger 9 | from baselines.bench import Monitor 10 | from baselines.common import set_global_seeds 11 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 12 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | 14 | def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): 15 | """ 16 | Create a wrapped, monitored SubprocVecEnv for Atari. 
17 | """ 18 | if wrapper_kwargs is None: wrapper_kwargs = {} 19 | def make_env(rank): # pylint: disable=C0111 20 | def _thunk(): 21 | env = make_atari(env_id) 22 | env.seed(seed + rank) 23 | env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 24 | return wrap_deepmind(env, **wrapper_kwargs) 25 | return _thunk 26 | set_global_seeds(seed) 27 | return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) 28 | 29 | def make_mujoco_env(env_id, seed): 30 | """ 31 | Create a wrapped, monitored gym.Env for MuJoCo. 32 | """ 33 | set_global_seeds(seed) 34 | env = gym.make(env_id) 35 | env = Monitor(env, logger.get_dir()) 36 | env.seed(seed) 37 | return env 38 | 39 | def make_robotics_env(env_id, seed, rank=0): 40 | """ 41 | Create a wrapped, monitored gym.Env for MuJoCo. 42 | """ 43 | set_global_seeds(seed) 44 | env = gym.make(env_id) 45 | env = FlattenDictWrapper(env, ['observation', 'desired_goal']) 46 | env = Monitor( 47 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 48 | info_keywords=('is_success',)) 49 | env.seed(seed) 50 | return env 51 | 52 | def arg_parser(): 53 | """ 54 | Create an empty argparse.ArgumentParser. 55 | """ 56 | import argparse 57 | return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 58 | 59 | def atari_arg_parser(): 60 | """ 61 | Create an argparse.ArgumentParser for run_atari.py. 62 | """ 63 | parser = arg_parser() 64 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 65 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 66 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 67 | return parser 68 | 69 | def mujoco_arg_parser(): 70 | """ 71 | Create an argparse.ArgumentParser for run_mujoco.py. 72 | """ 73 | parser = arg_parser() 74 | parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') 75 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 76 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 77 | return parser 78 | 79 | def robotics_arg_parser(): 80 | """ 81 | Create an argparse.ArgumentParser for run_mujoco.py. 
82 | """ 83 | parser = arg_parser() 84 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') 85 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 86 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 87 | return parser 88 | -------------------------------------------------------------------------------- /baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, (float, np.float32, np.float64)): 20 | v = abs(x) 21 | if (v < 1e-4 or v > 1e+4) and v > 0: 22 | rep = "%7.2e" % x 23 | else: 24 | rep = "%7.5f" % x 25 | else: rep = str(x) 26 | return " "*(l - len(rep)) + rep 27 | 28 | color2num = dict( 29 | gray=30, 30 | red=31, 31 | green=32, 32 | yellow=33, 33 | blue=34, 34 | magenta=35, 35 | cyan=36, 36 | white=37, 37 | crimson=38 38 | ) 39 | 40 | def colorize(string, color, bold=False, highlight=False): 41 | attr = [] 42 | num = color2num[color] 43 | if highlight: num += 10 44 | attr.append(str(num)) 45 | if bold: attr.append('1') 46 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 47 | 48 | 49 | MESSAGE_DEPTH = 0 50 | 51 | @contextmanager 52 | def timed(msg): 53 | global MESSAGE_DEPTH #pylint: disable=W0603 54 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 55 | tstart = time.time() 56 | MESSAGE_DEPTH += 1 57 | yield 58 | MESSAGE_DEPTH -= 1 59 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 60 | -------------------------------------------------------------------------------- /baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | 
for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /baselines/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import baselines.common.tf_util as U 4 | from baselines.a2c.utils import fc 5 | from tensorflow.python.ops import math_ops 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def neglogp(self, x): 16 | # Usually it's easier to define the negative logprob 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | def logp(self, x): 25 | return - self.neglogp(x) 26 | 27 | class PdType(object): 28 | """ 29 | Parametrized family of probability distributions 30 | """ 31 | def pdclass(self): 32 | raise NotImplementedError 33 | def pdfromflat(self, flat): 34 | return self.pdclass()(flat) 35 | def pdfromlatent(self, latent_vector): 36 | raise NotImplementedError 37 | def param_shape(self): 38 | raise NotImplementedError 39 | def sample_shape(self): 40 | raise NotImplementedError 41 | def sample_dtype(self): 42 | raise NotImplementedError 43 | 44 | def param_placeholder(self, prepend_shape, name=None): 45 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 46 | def sample_placeholder(self, prepend_shape, name=None): 47 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 48 | 49 | class CategoricalPdType(PdType): 50 | def __init__(self, ncat): 51 | self.ncat = ncat 52 | def pdclass(self): 53 | return CategoricalPd 54 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 55 | pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) 56 | return self.pdfromflat(pdparam), pdparam 57 | 58 | def param_shape(self): 59 | return [self.ncat] 60 | def sample_shape(self): 61 | return [] 62 | def sample_dtype(self): 63 | return tf.int32 64 | 65 | 66 | class MultiCategoricalPdType(PdType): 67 | def __init__(self, nvec): 68 | self.ncats = nvec 69 | def pdclass(self): 70 | return MultiCategoricalPd 71 | def pdfromflat(self, flat): 72 | return MultiCategoricalPd(self.ncats, flat) 73 | def param_shape(self): 74 | return [sum(self.ncats)] 75 | def sample_shape(self): 76 | return [len(self.ncats)] 77 | def sample_dtype(self): 78 | return tf.int32 79 | 80 | class DiagGaussianPdType(PdType): 81 | def __init__(self, size): 82 | self.size = size 83 
| def pdclass(self): 84 | return DiagGaussianPd 85 | 86 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 87 | mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 88 | logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 89 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 90 | return self.pdfromflat(pdparam), mean 91 | 92 | def param_shape(self): 93 | return [2*self.size] 94 | def sample_shape(self): 95 | return [self.size] 96 | def sample_dtype(self): 97 | return tf.float32 98 | 99 | class BernoulliPdType(PdType): 100 | def __init__(self, size): 101 | self.size = size 102 | def pdclass(self): 103 | return BernoulliPd 104 | def param_shape(self): 105 | return [self.size] 106 | def sample_shape(self): 107 | return [self.size] 108 | def sample_dtype(self): 109 | return tf.int32 110 | 111 | # WRONG SECOND DERIVATIVES 112 | # class CategoricalPd(Pd): 113 | # def __init__(self, logits): 114 | # self.logits = logits 115 | # self.ps = tf.nn.softmax(logits) 116 | # @classmethod 117 | # def fromflat(cls, flat): 118 | # return cls(flat) 119 | # def flatparam(self): 120 | # return self.logits 121 | # def mode(self): 122 | # return U.argmax(self.logits, axis=-1) 123 | # def logp(self, x): 124 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 125 | # def kl(self, other): 126 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 127 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 128 | # def entropy(self): 129 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 130 | # def sample(self): 131 | # u = tf.random_uniform(tf.shape(self.logits)) 132 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 133 | 134 | class CategoricalPd(Pd): 135 | def __init__(self, logits): 136 | self.logits = logits 137 | def flatparam(self): 138 | return self.logits 139 | def mode(self): 140 | return tf.argmax(self.logits, axis=-1) 141 | def neglogp(self, x): 142 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 143 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 144 | # the implementation does not allow second-order derivatives... 
145 | one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 146 | return tf.nn.softmax_cross_entropy_with_logits( 147 | logits=self.logits, 148 | labels=one_hot_actions) 149 | def kl(self, other): 150 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) 151 | a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) 152 | ea0 = tf.exp(a0) 153 | ea1 = tf.exp(a1) 154 | z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) 155 | z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) 156 | p0 = ea0 / z0 157 | return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 158 | def entropy(self): 159 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) 160 | ea0 = tf.exp(a0) 161 | z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) 162 | p0 = ea0 / z0 163 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) 164 | def sample(self): 165 | u = tf.random_uniform(tf.shape(self.logits)) 166 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 167 | @classmethod 168 | def fromflat(cls, flat): 169 | return cls(flat) 170 | 171 | class MultiCategoricalPd(Pd): 172 | def __init__(self, nvec, flat): 173 | self.flat = flat 174 | self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1))) 175 | def flatparam(self): 176 | return self.flat 177 | def mode(self): 178 | return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 179 | def neglogp(self, x): 180 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) 181 | def kl(self, other): 182 | return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) 183 | def entropy(self): 184 | return tf.add_n([p.entropy() for p in self.categoricals]) 185 | def sample(self): 186 | return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 187 | @classmethod 188 | def fromflat(cls, flat): 189 | raise NotImplementedError 190 | 191 | class DiagGaussianPd(Pd): 192 | def __init__(self, flat): 193 | self.flat = flat 194 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 195 | self.mean = mean 196 | self.logstd = logstd 197 | self.std = tf.exp(logstd) 198 | def flatparam(self): 199 | return self.flat 200 | def mode(self): 201 | return self.mean 202 | def neglogp(self, x): 203 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 204 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 205 | + tf.reduce_sum(self.logstd, axis=-1) 206 | def kl(self, other): 207 | assert isinstance(other, DiagGaussianPd) 208 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 209 | def entropy(self): 210 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 211 | def sample(self): 212 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 213 | @classmethod 214 | def fromflat(cls, flat): 215 | return cls(flat) 216 | 217 | class BernoulliPd(Pd): 218 | def __init__(self, logits): 219 | self.logits = logits 220 | self.ps = tf.sigmoid(logits) 221 | def flatparam(self): 222 | return self.logits 223 | def mode(self): 224 | return tf.round(self.ps) 225 | def neglogp(self, x): 226 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 227 | def kl(self, other): 228 | return 
tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 229 | def entropy(self): 230 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 231 | def sample(self): 232 | u = tf.random_uniform(tf.shape(self.ps)) 233 | return tf.to_float(math_ops.less(u, self.ps)) 234 | @classmethod 235 | def fromflat(cls, flat): 236 | return cls(flat) 237 | 238 | def make_pdtype(ac_space): 239 | from gym import spaces 240 | if isinstance(ac_space, spaces.Box): 241 | assert len(ac_space.shape) == 1 242 | return DiagGaussianPdType(ac_space.shape[0]) 243 | elif isinstance(ac_space, spaces.Discrete): 244 | return CategoricalPdType(ac_space.n) 245 | elif isinstance(ac_space, spaces.MultiDiscrete): 246 | return MultiCategoricalPdType(ac_space.nvec) 247 | elif isinstance(ac_space, spaces.MultiBinary): 248 | return BernoulliPdType(ac_space.n) 249 | else: 250 | raise NotImplementedError 251 | 252 | def shape_el(v, i): 253 | maybe = v.get_shape()[i] 254 | if maybe is not None: 255 | return maybe 256 | else: 257 | return tf.shape(v)[i] 258 | 259 | @U.in_session 260 | def test_probtypes(): 261 | np.random.seed(0) 262 | 263 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 264 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 265 | validate_probtype(diag_gauss, pdparam_diag_gauss) 266 | 267 | pdparam_categorical = np.array([-.2, .3, .5]) 268 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 269 | validate_probtype(categorical, pdparam_categorical) 270 | 271 | nvec = [1,2,3] 272 | pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) 273 | multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 274 | validate_probtype(multicategorical, pdparam_multicategorical) 275 | 276 | pdparam_bernoulli = np.array([-.2, .3, .5]) 277 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 278 | validate_probtype(bernoulli, pdparam_bernoulli) 279 | 280 | 281 | def validate_probtype(probtype, pdparam): 282 | N = 100000 283 | # Check to see if mean negative log likelihood == differential entropy 284 | Mval = np.repeat(pdparam[None, :], N, axis=0) 285 | M = probtype.param_placeholder([N]) 286 | X = probtype.sample_placeholder([N]) 287 | pd = probtype.pdfromflat(M) 288 | calcloglik = U.function([X, M], pd.logp(X)) 289 | calcent = U.function([M], pd.entropy()) 290 | Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) 291 | logliks = calcloglik(Xval, Mval) 292 | entval_ll = - logliks.mean() #pylint: disable=E1101 293 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 294 | entval = calcent(Mval).mean() #pylint: disable=E1101 295 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 296 | 297 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 298 | M2 = probtype.param_placeholder([N]) 299 | pd2 = probtype.pdfromflat(M2) 300 | q = pdparam + np.random.randn(pdparam.size) * 0.1 301 | Mval2 = np.repeat(q[None, :], N, axis=0) 302 | calckl = U.function([M, M2], pd.kl(pd2)) 303 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 304 | logliks = calcloglik(Xval, Mval2) 305 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 306 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 307 | assert np.abs(klval - klval_ll) < 3 * 
klval_ll_stderr # within 3 sigmas 308 | print('ok on', probtype, pdparam) 309 | 310 | -------------------------------------------------------------------------------- /baselines/common/filters.py: -------------------------------------------------------------------------------- 1 | from .running_stat import RunningStat 2 | from collections import deque 3 | import numpy as np 4 | 5 | class Filter(object): 6 | def __call__(self, x, update=True): 7 | raise NotImplementedError 8 | def reset(self): 9 | pass 10 | 11 | class IdentityFilter(Filter): 12 | def __call__(self, x, update=True): 13 | return x 14 | 15 | class CompositionFilter(Filter): 16 | def __init__(self, fs): 17 | self.fs = fs 18 | def __call__(self, x, update=True): 19 | for f in self.fs: 20 | x = f(x) 21 | return x 22 | def output_shape(self, input_space): 23 | out = input_space.shape 24 | for f in self.fs: 25 | out = f.output_shape(out) 26 | return out 27 | 28 | class ZFilter(Filter): 29 | """ 30 | y = (x-mean)/std 31 | using running estimates of mean,std 32 | """ 33 | 34 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 35 | self.demean = demean 36 | self.destd = destd 37 | self.clip = clip 38 | 39 | self.rs = RunningStat(shape) 40 | 41 | def __call__(self, x, update=True): 42 | if update: self.rs.push(x) 43 | if self.demean: 44 | x = x - self.rs.mean 45 | if self.destd: 46 | x = x / (self.rs.std+1e-8) 47 | if self.clip: 48 | x = np.clip(x, -self.clip, self.clip) 49 | return x 50 | def output_shape(self, input_space): 51 | return input_space.shape 52 | 53 | class AddClock(Filter): 54 | def __init__(self): 55 | self.count = 0 56 | def reset(self): 57 | self.count = 0 58 | def __call__(self, x, update=True): 59 | return np.append(x, self.count/100.0) 60 | def output_shape(self, input_space): 61 | return (input_space.shape[0]+1,) 62 | 63 | class FlattenFilter(Filter): 64 | def __call__(self, x, update=True): 65 | return x.ravel() 66 | def output_shape(self, input_space): 67 | return (int(np.prod(input_space.shape)),) 68 | 69 | class Ind2OneHotFilter(Filter): 70 | def __init__(self, n): 71 | self.n = n 72 | def __call__(self, x, update=True): 73 | out = np.zeros(self.n) 74 | out[x] = 1 75 | return out 76 | def output_shape(self, input_space): 77 | return (input_space.n,) 78 | 79 | class DivFilter(Filter): 80 | def __init__(self, divisor): 81 | self.divisor = divisor 82 | def __call__(self, x, update=True): 83 | return x / self.divisor 84 | def output_shape(self, input_space): 85 | return input_space.shape 86 | 87 | class StackFilter(Filter): 88 | def __init__(self, length): 89 | self.stack = deque(maxlen=length) 90 | def reset(self): 91 | self.stack.clear() 92 | def __call__(self, x, update=True): 93 | self.stack.append(x) 94 | while len(self.stack) < self.stack.maxlen: 95 | self.stack.append(x) 96 | return np.concatenate(self.stack, axis=-1) 97 | def output_shape(self, input_space): 98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 99 | -------------------------------------------------------------------------------- /baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /baselines/common/misc_util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import os 4 | import pickle 5 | import random 6 | import tempfile 7 | import zipfile 8 | 9 | 10 | def zipsame(*seqs): 11 | L = len(seqs[0]) 12 | assert all(len(seq) == L for seq in seqs[1:]) 13 | return zip(*seqs) 14 | 15 | 16 | def unpack(seq, sizes): 17 | """ 18 | Unpack 'seq' into a sequence of lists, with lengths specified by 'sizes'. 19 | None = just one bare element, not a list 20 | 21 | Example: 22 | unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) 23 | """ 24 | seq = list(seq) 25 | it = iter(seq) 26 | assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) 27 | for size in sizes: 28 | if size is None: 29 | yield it.__next__() 30 | else: 31 | li = [] 32 | for _ in range(size): 33 | li.append(it.__next__()) 34 | yield li 35 | 36 | 37 | class EzPickle(object): 38 | """Objects that are pickled and unpickled via their constructor 39 | arguments. 40 | 41 | Example usage: 42 | 43 | class Dog(Animal, EzPickle): 44 | def __init__(self, furcolor, tailkind="bushy"): 45 | Animal.__init__() 46 | EzPickle.__init__(furcolor, tailkind) 47 | ... 48 | 49 | When this object is unpickled, a new Dog will be constructed by passing the provided 50 | furcolor and tailkind into the constructor. However, philosophers are still not sure 51 | whether it is still the same dog. 52 | 53 | This is generally needed only for environments which wrap C/C++ code, such as MuJoCo 54 | and Atari. 
55 | """ 56 | 57 | def __init__(self, *args, **kwargs): 58 | self._ezpickle_args = args 59 | self._ezpickle_kwargs = kwargs 60 | 61 | def __getstate__(self): 62 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 63 | 64 | def __setstate__(self, d): 65 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 66 | self.__dict__.update(out.__dict__) 67 | 68 | 69 | def set_global_seeds(i): 70 | try: 71 | import tensorflow as tf 72 | except ImportError: 73 | pass 74 | else: 75 | tf.set_random_seed(i) 76 | np.random.seed(i) 77 | random.seed(i) 78 | 79 | 80 | def pretty_eta(seconds_left): 81 | """Print the number of seconds in human readable format. 82 | 83 | Examples: 84 | 2 days 85 | 2 hours and 37 minutes 86 | less than a minute 87 | 88 | Paramters 89 | --------- 90 | seconds_left: int 91 | Number of seconds to be converted to the ETA 92 | Returns 93 | ------- 94 | eta: str 95 | String representing the pretty ETA. 96 | """ 97 | minutes_left = seconds_left // 60 98 | seconds_left %= 60 99 | hours_left = minutes_left // 60 100 | minutes_left %= 60 101 | days_left = hours_left // 24 102 | hours_left %= 24 103 | 104 | def helper(cnt, name): 105 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 106 | 107 | if days_left > 0: 108 | msg = helper(days_left, 'day') 109 | if hours_left > 0: 110 | msg += ' and ' + helper(hours_left, 'hour') 111 | return msg 112 | if hours_left > 0: 113 | msg = helper(hours_left, 'hour') 114 | if minutes_left > 0: 115 | msg += ' and ' + helper(minutes_left, 'minute') 116 | return msg 117 | if minutes_left > 0: 118 | return helper(minutes_left, 'minute') 119 | return 'less than a minute' 120 | 121 | 122 | class RunningAvg(object): 123 | def __init__(self, gamma, init_value=None): 124 | """Keep a running estimate of a quantity. This is a bit like mean 125 | but more sensitive to recent changes. 126 | 127 | Parameters 128 | ---------- 129 | gamma: float 130 | Must be between 0 and 1, where 0 is the most sensitive to recent 131 | changes. 132 | init_value: float or None 133 | Initial value of the estimate. If None, it will be set on the first update. 134 | """ 135 | self._value = init_value 136 | self._gamma = gamma 137 | 138 | def update(self, new_val): 139 | """Update the estimate. 140 | 141 | Parameters 142 | ---------- 143 | new_val: float 144 | new observated value of estimated quantity. 145 | """ 146 | if self._value is None: 147 | self._value = new_val 148 | else: 149 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 150 | 151 | def __float__(self): 152 | """Get the current estimate""" 153 | return self._value 154 | 155 | def boolean_flag(parser, name, default=False, help=None): 156 | """Add a boolean flag to argparse parser. 
157 | 158 | Parameters 159 | ---------- 160 | parser: argparse.Parser 161 | parser to add the flag to 162 | name: str 163 | -- will enable the flag, while --no- will disable it 164 | default: bool or None 165 | default value of the flag 166 | help: str 167 | help string for the flag 168 | """ 169 | dest = name.replace('-', '_') 170 | parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help) 171 | parser.add_argument("--no-" + name, action="store_false", dest=dest) 172 | 173 | 174 | def get_wrapper_by_name(env, classname): 175 | """Given an a gym environment possibly wrapped multiple times, returns a wrapper 176 | of class named classname or raises ValueError if no such wrapper was applied 177 | 178 | Parameters 179 | ---------- 180 | env: gym.Env of gym.Wrapper 181 | gym environment 182 | classname: str 183 | name of the wrapper 184 | 185 | Returns 186 | ------- 187 | wrapper: gym.Wrapper 188 | wrapper named classname 189 | """ 190 | currentenv = env 191 | while True: 192 | if classname == currentenv.class_name(): 193 | return currentenv 194 | elif isinstance(currentenv, gym.Wrapper): 195 | currentenv = currentenv.env 196 | else: 197 | raise ValueError("Couldn't find wrapper named %s" % classname) 198 | 199 | 200 | def relatively_safe_pickle_dump(obj, path, compression=False): 201 | """This is just like regular pickle dump, except from the fact that failure cases are 202 | different: 203 | 204 | - It's never possible that we end up with a pickle in corrupted state. 205 | - If a there was a different file at the path, that file will remain unchanged in the 206 | even of failure (provided that filesystem rename is atomic). 207 | - it is sometimes possible that we end up with useless temp file which needs to be 208 | deleted manually (it will be removed automatically on the next function call) 209 | 210 | The indended use case is periodic checkpoints of experiment state, such that we never 211 | corrupt previous checkpoints if the current one fails. 212 | 213 | Parameters 214 | ---------- 215 | obj: object 216 | object to pickle 217 | path: str 218 | path to the output file 219 | compression: bool 220 | if true pickle will be compressed 221 | """ 222 | temp_storage = path + ".relatively_safe" 223 | if compression: 224 | # Using gzip here would be simpler, but the size is limited to 2GB 225 | with tempfile.NamedTemporaryFile() as uncompressed_file: 226 | pickle.dump(obj, uncompressed_file) 227 | uncompressed_file.file.flush() 228 | with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: 229 | myzip.write(uncompressed_file.name, "data") 230 | else: 231 | with open(temp_storage, "wb") as f: 232 | pickle.dump(obj, f) 233 | os.rename(temp_storage, path) 234 | 235 | 236 | def pickle_load(path, compression=False): 237 | """Unpickle a possible compressed pickle. 238 | 239 | Parameters 240 | ---------- 241 | path: str 242 | path to the output file 243 | compression: bool 244 | if true assumes that pickle was compressed when created and attempts decompression. 
245 | 246 | Returns 247 | ------- 248 | obj: object 249 | the unpickled object 250 | """ 251 | 252 | if compression: 253 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 254 | with myzip.open("data") as f: 255 | return pickle.load(f) 256 | else: 257 | with open(path, "rb") as f: 258 | return pickle.load(f) 259 | -------------------------------------------------------------------------------- /baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches 
the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 3 | 4 | class RunningMeanStd(object): 5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 6 | def __init__(self, epsilon=1e-2, shape=()): 7 | 8 | self._sum = tf.get_variable( 9 | dtype=tf.float64, 10 | shape=shape, 11 | initializer=tf.constant_initializer(0.0), 12 | name="runningsum", trainable=False) 13 | self._sumsq = tf.get_variable( 14 | dtype=tf.float64, 15 | shape=shape, 16 | initializer=tf.constant_initializer(epsilon), 17 | name="runningsumsq", trainable=False) 18 | 
self._count = tf.get_variable( 19 | dtype=tf.float64, 20 | shape=(), 21 | initializer=tf.constant_initializer(epsilon), 22 | name="count", trainable=False) 23 | self.shape = shape 24 | 25 | self.mean = tf.to_float(self._sum / self._count) 26 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 27 | 28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 32 | updates=[tf.assign_add(self._sum, newsum), 33 | tf.assign_add(self._sumsq, newsumsq), 34 | tf.assign_add(self._count, newcount)]) 35 | 36 | 37 | def update(self, x): 38 | x = x.astype('float64') 39 | n = int(np.prod(self.shape)) 40 | totalvec = np.zeros(n*2+1, 'float64') 41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 44 | 45 | @U.in_session 46 | def test_runningmeanstd(): 47 | for (x1, x2, x3) in [ 48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 50 | ]: 51 | 52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 53 | U.initialize() 54 | 55 | x = np.concatenate([x1, x2, x3], axis=0) 56 | ms1 = [x.mean(axis=0), x.std(axis=0)] 57 | rms.update(x1) 58 | rms.update(x2) 59 | rms.update(x3) 60 | ms2 = [rms.mean.eval(), rms.std.eval()] 61 | 62 | assert np.allclose(ms1, ms2) 63 | 64 | @U.in_session 65 | def test_dist(): 66 | np.random.seed(0) 67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 69 | 70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 72 | 73 | comm = MPI.COMM_WORLD 74 | assert comm.Get_size()==2 75 | if comm.Get_rank()==0: 76 | x1,x2,x3 = p1,p2,p3 77 | elif comm.Get_rank()==1: 78 | x1,x2,x3 = q1,q2,q3 79 | else: 80 | assert False 81 | 82 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 83 | U.initialize() 84 | 85 | rms.update(x1) 86 | rms.update(x2) 87 | rms.update(x3) 88 | 89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 90 | 91 | def checkallclose(x,y): 92 | print(x,y) 93 | return np.allclose(x,y) 94 | 95 | assert checkallclose( 96 | bigvec.mean(axis=0), 97 | rms.mean.eval(), 98 | ) 99 | assert checkallclose( 100 | bigvec.std(axis=0), 101 | rms.std.eval(), 102 | ) 103 | 104 | 105 | if __name__ == "__main__": 106 | # Run with mpirun -np 2 python 107 | test_dist() 108 | -------------------------------------------------------------------------------- /baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | nenv = env.num_envs 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 
13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /baselines/common/running_mean_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | class RunningMeanStd(object): 3 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 4 | def __init__(self, epsilon=1e-4, shape=()): 5 | self.mean = np.zeros(shape, 'float64') 6 | self.var = np.ones(shape, 'float64') 7 | self.count = epsilon 8 | 9 | def update(self, x): 10 | batch_mean = np.mean(x, axis=0) 11 | batch_var = np.var(x, axis=0) 12 | batch_count = x.shape[0] 13 | self.update_from_moments(batch_mean, batch_var, batch_count) 14 | 15 | def update_from_moments(self, batch_mean, batch_var, batch_count): 16 | delta = batch_mean - self.mean 17 | tot_count = self.count + batch_count 18 | 19 | new_mean = self.mean + delta * batch_count / tot_count 20 | m_a = self.var * (self.count) 21 | m_b = batch_var * (batch_count) 22 | M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 23 | new_var = M2 / (self.count + batch_count) 24 | 25 | new_count = batch_count + self.count 26 | 27 | self.mean = new_mean 28 | self.var = new_var 29 | self.count = new_count 30 | 31 | def test_runningmeanstd(): 32 | for (x1, x2, x3) in [ 33 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 34 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 35 | ]: 36 | 37 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 38 | 39 | x = np.concatenate([x1, x2, x3], axis=0) 40 | ms1 = [x.mean(axis=0), x.var(axis=0)] 41 | rms.update(x1) 42 | rms.update(x2) 43 | rms.update(x3) 44 | ms2 = [rms.mean, rms.var] 45 | 46 | assert np.allclose(ms1, ms2) 47 | -------------------------------------------------------------------------------- /baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] 
= self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon-greedy exploration strategy 5 | - beta parameter for prioritized experience replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meaning that the schedule should output 44 | `value` when `t == time`. All the times must be sorted in 45 | increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)` such that `time_a <= t < time_b`, value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is the fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes the values to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for an example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals specified in 55 | `endpoints` this value is returned. If None then an AssertionError is 56 | raised when an outside value is requested.
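        Illustrative example (added, not part of the original docstring): with the
        default linear interpolation, PiecewiseSchedule([(0, 1.0), (100, 0.1)]).value(50)
        returns 0.55, and value(200) returns `outside_value` (or raises an AssertionError
        if `outside_value` is None).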
57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps have passed, final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as a regular array, but with two 11 | important differences: 12 | 13 | a) setting an item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient (O(log segment size)) 16 | `reduce` operation which reduces `operation` over 17 | a contiguous subsequence of items in the array. 18 | 19 | Parameters 20 | ---------- 21 | capacity: int 22 | Total size of the array - must be a power of two. 23 | operation: lambda obj, obj -> obj 24 | an operation for combining elements (e.g. sum, max); 25 | it must be associative over the set of 26 | possible values for array elements, with `neutral_element` as its identity 27 | neutral_element: obj 28 | neutral element for the operation above, e.g. float('-inf') 29 | for max and 0 for sum. 30 | """ 31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
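        # Layout note (added for clarity): the tree is stored as a flat list of length
        # 2 * capacity; index 1 is the root, node i has children 2*i and 2*i + 1, and
        # leaf j lives at index capacity + j. For example, with capacity=4 and a sum
        # operation, setting tree[2] = 1.0 and tree[3] = 3.0 yields the internal list
        # [0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 1.0, 3.0] (index 0 is unused).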
32 | self._capacity = capacity 33 | self._value = [neutral_element for _ in range(2 * capacity)] 34 | self._operation = operation 35 | 36 | def _reduce_helper(self, start, end, node, node_start, node_end): 37 | if start == node_start and end == node_end: 38 | return self._value[node] 39 | mid = (node_start + node_end) // 2 40 | if end <= mid: 41 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 42 | else: 43 | if mid + 1 <= start: 44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 45 | else: 46 | return self._operation( 47 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 49 | ) 50 | 51 | def reduce(self, start=0, end=None): 52 | """Returns result of applying `self.operation` 53 | to a contiguous subsequence of the array. 54 | 55 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1]))) 56 | 57 | Parameters 58 | ---------- 59 | start: int 60 | beginning of the subsequence 61 | end: int 62 | end of the subsequence (exclusive); None means the end of the array, and negative values count from the end, as in Python slicing 63 | 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end - 1] (i.e. `end` is exclusive)""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum 108 | 109 | If array values are probabilities, this function 110 | allows sampling indexes according to the discrete 111 | probability distribution efficiently.
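        Illustrative example (added): if the array is [0.5, 1.0, 1.0, 3.0], the prefix
        sums are [0.5, 1.5, 2.5, 5.5], so find_prefixsum_idx(0.55) returns 1 and
        find_prefixsum_idx(1.51) returns 2.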
112 | 113 | Parameters 114 | ---------- 115 | perfixsum: float 116 | upperbound on the sum of array prefix 117 | 118 | Returns 119 | ------- 120 | idx: int 121 | highest index satisfying the prefixsum constraint 122 | """ 123 | assert 0 <= prefixsum <= self.sum() + 1e-5 124 | idx = 1 125 | while idx < self._capacity: # while non-leaf 126 | if self._value[2 * idx] > prefixsum: 127 | idx = 2 * idx 128 | else: 129 | prefixsum -= self._value[2 * idx] 130 | idx = 2 * idx + 1 131 | return idx - self._capacity 132 | 133 | 134 | class MinSegmentTree(SegmentTree): 135 | def __init__(self, capacity): 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, 138 | operation=min, 139 | neutral_element=float('inf') 140 | ) 141 | 142 | def min(self, start=0, end=None): 143 | """Returns min(arr[start], ..., arr[end])""" 144 | 145 | return super(MinSegmentTree, self).reduce(start, end) 146 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 
54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /baselines/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | import copy 4 | import os 5 | import functools 6 | import collections 7 | import multiprocessing 8 | 9 | def switch(condition, then_expression, else_expression): 10 | """Switches between two operations depending on a scalar value (int or bool). 11 | Note that both `then_expression` and `else_expression` 12 | should be symbolic tensors of the *same shape*. 13 | 14 | # Arguments 15 | condition: scalar tensor. 16 | then_expression: TensorFlow operation. 17 | else_expression: TensorFlow operation. 
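    Illustrative example (added; the tensor names are made up):
        use_target = tf.placeholder(tf.bool, ())
        q = switch(use_target, target_q_tensor, main_q_tensor)  # both branches must have the same shape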
18 | """ 19 | x_shape = copy.copy(then_expression.get_shape()) 20 | x = tf.cond(tf.cast(condition, 'bool'), 21 | lambda: then_expression, 22 | lambda: else_expression) 23 | x.set_shape(x_shape) 24 | return x 25 | 26 | # ================================================================ 27 | # Extras 28 | # ================================================================ 29 | 30 | def lrelu(x, leak=0.2): 31 | f1 = 0.5 * (1 + leak) 32 | f2 = 0.5 * (1 - leak) 33 | return f1 * x + f2 * abs(x) 34 | 35 | # ================================================================ 36 | # Mathematical utils 37 | # ================================================================ 38 | 39 | def huber_loss(x, delta=1.0): 40 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 41 | return tf.where( 42 | tf.abs(x) < delta, 43 | tf.square(x) * 0.5, 44 | delta * (tf.abs(x) - 0.5 * delta) 45 | ) 46 | 47 | # ================================================================ 48 | # Global session 49 | # ================================================================ 50 | 51 | def make_session(num_cpu=None, make_default=False, graph=None): 52 | """Returns a session that will use CPU's only""" 53 | if num_cpu is None: 54 | num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) 55 | tf_config = tf.ConfigProto( 56 | inter_op_parallelism_threads=num_cpu, 57 | intra_op_parallelism_threads=num_cpu) 58 | tf_config.gpu_options.allocator_type = 'BFC' 59 | if make_default: 60 | return tf.InteractiveSession(config=tf_config, graph=graph) 61 | else: 62 | return tf.Session(config=tf_config, graph=graph) 63 | 64 | def single_threaded_session(): 65 | """Returns a session which will only use a single CPU""" 66 | return make_session(num_cpu=1) 67 | 68 | def in_session(f): 69 | @functools.wraps(f) 70 | def newfunc(*args, **kwargs): 71 | with tf.Session(): 72 | f(*args, **kwargs) 73 | return newfunc 74 | 75 | ALREADY_INITIALIZED = set() 76 | 77 | def initialize(): 78 | """Initialize all the uninitialized variables in the global scope.""" 79 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 80 | tf.get_default_session().run(tf.variables_initializer(new_variables)) 81 | ALREADY_INITIALIZED.update(new_variables) 82 | 83 | # ================================================================ 84 | # Model components 85 | # ================================================================ 86 | 87 | def normc_initializer(std=1.0, axis=0): 88 | def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 89 | out = np.random.randn(*shape).astype(np.float32) 90 | out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) 91 | return tf.constant(out) 92 | return _initializer 93 | 94 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 95 | summary_tag=None): 96 | with tf.variable_scope(name): 97 | stride_shape = [1, stride[0], stride[1], 1] 98 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 99 | 100 | # there are "num input feature maps * filter height * filter width" 101 | # inputs to each hidden unit 102 | fan_in = intprod(filter_shape[:3]) 103 | # each unit in the lower layer receives a gradient from: 104 | # "num output feature maps * filter height * filter width" / 105 | # pooling size 106 | fan_out = intprod(filter_shape[:2]) * num_filters 107 | # initialize weights with random weights 108 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 109 | 110 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 111 | collections=collections) 112 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), 113 | collections=collections) 114 | 115 | if summary_tag is not None: 116 | tf.summary.image(summary_tag, 117 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 118 | [2, 0, 1, 3]), 119 | max_images=10) 120 | 121 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 122 | 123 | # ================================================================ 124 | # Theano-like Function 125 | # ================================================================ 126 | 127 | def function(inputs, outputs, updates=None, givens=None): 128 | """Just like Theano function. Take a bunch of tensorflow placeholders and expressions 129 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 130 | values to be fed to the input's placeholders and produces the values of the expressions 131 | in outputs. 132 | 133 | Input values can be passed in the same order as inputs or can be provided as kwargs based 134 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 135 | 136 | Example: 137 | x = tf.placeholder(tf.int32, (), name="x") 138 | y = tf.placeholder(tf.int32, (), name="y") 139 | z = 3 * x + 2 * y 140 | lin = function([x, y], z, givens={y: 0}) 141 | 142 | with single_threaded_session(): 143 | initialize() 144 | 145 | assert lin(2) == 6 146 | assert lin(x=3) == 9 147 | assert lin(2, 2) == 10 148 | assert lin(x=2, y=3) == 12 149 | 150 | Parameters 151 | ---------- 152 | inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method] 153 | list of input arguments 154 | outputs: [tf.Variable] or tf.Variable 155 | list of outputs or a single output to be returned from function. Returned 156 | value will also have the same shape. 157 | """ 158 | if isinstance(outputs, list): 159 | return _Function(inputs, outputs, updates, givens=givens) 160 | elif isinstance(outputs, (dict, collections.OrderedDict)): 161 | f = _Function(inputs, outputs.values(), updates, givens=givens) 162 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 163 | else: 164 | f = _Function(inputs, [outputs], updates, givens=givens) 165 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 166 | 167 | 168 | class _Function(object): 169 | def __init__(self, inputs, outputs, updates, givens): 170 | for inpt in inputs: 171 | if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0): 172 | assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" 173 | self.inputs = inputs 174 | updates = updates or [] 175 | self.update_group = tf.group(*updates) 176 | self.outputs_update = list(outputs) + [self.update_group] 177 | self.givens = {} if givens is None else givens 178 | 179 | def _feed_input(self, feed_dict, inpt, value): 180 | if hasattr(inpt, 'make_feed_dict'): 181 | feed_dict.update(inpt.make_feed_dict(value)) 182 | else: 183 | feed_dict[inpt] = value 184 | 185 | def __call__(self, *args): 186 | assert len(args) <= len(self.inputs), "Too many arguments provided" 187 | feed_dict = {} 188 | # Update the args 189 | for inpt, value in zip(self.inputs, args): 190 | self._feed_input(feed_dict, inpt, value) 191 | # Update feed dict with givens. 
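        # Added note: a given acts as a default value -- it is only used when the caller
        # did not already feed that placeholder, which is what feed_dict.get(...) below implements.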
192 | for inpt in self.givens: 193 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 194 | results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 195 | return results 196 | 197 | # ================================================================ 198 | # Flat vectors 199 | # ================================================================ 200 | 201 | def var_shape(x): 202 | out = x.get_shape().as_list() 203 | assert all(isinstance(a, int) for a in out), \ 204 | "shape function assumes that shape is fully known" 205 | return out 206 | 207 | def numel(x): 208 | return intprod(var_shape(x)) 209 | 210 | def intprod(x): 211 | return int(np.prod(x)) 212 | 213 | def flatgrad(loss, var_list, clip_norm=None): 214 | grads = tf.gradients(loss, var_list) 215 | if clip_norm is not None: 216 | grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] 217 | return tf.concat(axis=0, values=[ 218 | tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) 219 | for (v, grad) in zip(var_list, grads) 220 | ]) 221 | 222 | class SetFromFlat(object): 223 | def __init__(self, var_list, dtype=tf.float32): 224 | assigns = [] 225 | shapes = list(map(var_shape, var_list)) 226 | total_size = np.sum([intprod(shape) for shape in shapes]) 227 | 228 | self.theta = theta = tf.placeholder(dtype, [total_size]) 229 | start = 0 230 | assigns = [] 231 | for (shape, v) in zip(shapes, var_list): 232 | size = intprod(shape) 233 | assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) 234 | start += size 235 | self.op = tf.group(*assigns) 236 | 237 | def __call__(self, theta): 238 | tf.get_default_session().run(self.op, feed_dict={self.theta: theta}) 239 | 240 | class GetFlat(object): 241 | def __init__(self, var_list): 242 | self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) 243 | 244 | def __call__(self): 245 | return tf.get_default_session().run(self.op) 246 | 247 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 248 | 249 | def get_placeholder(name, dtype, shape): 250 | if name in _PLACEHOLDER_CACHE: 251 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 252 | assert dtype1 == dtype and shape1 == shape 253 | return out 254 | else: 255 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 256 | _PLACEHOLDER_CACHE[name] = (out, dtype, shape) 257 | return out 258 | 259 | def get_placeholder_cached(name): 260 | return _PLACEHOLDER_CACHE[name][0] 261 | 262 | def flattenallbut0(x): 263 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 264 | 265 | 266 | # ================================================================ 267 | # Diagnostics 268 | # ================================================================ 269 | 270 | def display_var_info(vars): 271 | from baselines import logger 272 | count_params = 0 273 | for v in vars: 274 | name = v.name 275 | if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue 276 | v_params = np.prod(v.shape.as_list()) 277 | count_params += v_params 278 | if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print 279 | logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape))) 280 | 281 | logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) 282 | -------------------------------------------------------------------------------- /baselines/common/vec_env/__init__.py: 
-------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from baselines import logger 3 | 4 | class AlreadySteppingError(Exception): 5 | """ 6 | Raised when an asynchronous step is running while 7 | step_async() is called again. 8 | """ 9 | def __init__(self): 10 | msg = 'already running an async step' 11 | Exception.__init__(self, msg) 12 | 13 | class NotSteppingError(Exception): 14 | """ 15 | Raised when an asynchronous step is not running but 16 | step_wait() is called. 17 | """ 18 | def __init__(self): 19 | msg = 'not running an async step' 20 | Exception.__init__(self, msg) 21 | 22 | class VecEnv(ABC): 23 | """ 24 | An abstract asynchronous, vectorized environment. 25 | """ 26 | def __init__(self, num_envs, observation_space, action_space): 27 | self.num_envs = num_envs 28 | self.observation_space = observation_space 29 | self.action_space = action_space 30 | 31 | @abstractmethod 32 | def reset(self): 33 | """ 34 | Reset all the environments and return an array of 35 | observations, or a tuple of observation arrays. 36 | 37 | If step_async is still doing work, that work will 38 | be cancelled and step_wait() should not be called 39 | until step_async() is invoked again. 40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | def step_async(self, actions): 45 | """ 46 | Tell all the environments to start taking a step 47 | with the given actions. 48 | Call step_wait() to get the results of the step. 49 | 50 | You should not call this if a step_async run is 51 | already pending. 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def step_wait(self): 57 | """ 58 | Wait for the step taken with step_async(). 59 | 60 | Returns (obs, rews, dones, infos): 61 | - obs: an array of observations, or a tuple of 62 | arrays of observations. 63 | - rews: an array of rewards 64 | - dones: an array of "episode done" booleans 65 | - infos: a sequence of info objects 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | def close(self): 71 | """ 72 | Clean up the environments' resources. 
73 | """ 74 | pass 75 | 76 | def step(self, actions): 77 | self.step_async(actions) 78 | return self.step_wait() 79 | 80 | def render(self): 81 | logger.warn('Render not defined for %s'%self) 82 | 83 | @property 84 | def unwrapped(self): 85 | if isinstance(self, VecEnvWrapper): 86 | return self.venv.unwrapped 87 | else: 88 | return self 89 | 90 | class VecEnvWrapper(VecEnv): 91 | def __init__(self, venv, observation_space=None, action_space=None): 92 | self.venv = venv 93 | VecEnv.__init__(self, 94 | num_envs=venv.num_envs, 95 | observation_space=observation_space or venv.observation_space, 96 | action_space=action_space or venv.action_space) 97 | 98 | def step_async(self, actions): 99 | self.venv.step_async(actions) 100 | 101 | @abstractmethod 102 | def reset(self): 103 | pass 104 | 105 | @abstractmethod 106 | def step_wait(self): 107 | pass 108 | 109 | def close(self): 110 | return self.venv.close() 111 | 112 | def render(self): 113 | self.venv.render() 114 | 115 | class CloudpickleWrapper(object): 116 | """ 117 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 118 | """ 119 | def __init__(self, x): 120 | self.x = x 121 | def __getstate__(self): 122 | import cloudpickle 123 | return cloudpickle.dumps(self.x) 124 | def __setstate__(self, ob): 125 | import pickle 126 | self.x = pickle.loads(ob) 127 | -------------------------------------------------------------------------------- /baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from collections import OrderedDict 4 | from . import VecEnv 5 | 6 | class DummyVecEnv(VecEnv): 7 | def __init__(self, env_fns): 8 | self.envs = [fn() for fn in env_fns] 9 | env = self.envs[0] 10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 11 | shapes, dtypes = {}, {} 12 | self.keys = [] 13 | obs_space = env.observation_space 14 | if isinstance(obs_space, spaces.Dict): 15 | assert isinstance(obs_space.spaces, OrderedDict) 16 | for key, box in obs_space.spaces.items(): 17 | assert isinstance(box, spaces.Box) 18 | shapes[key] = box.shape 19 | dtypes[key] = box.dtype 20 | self.keys.append(key) 21 | else: 22 | box = obs_space 23 | assert isinstance(box, spaces.Box) 24 | self.keys = [None] 25 | shapes, dtypes = { None: box.shape }, { None: box.dtype } 26 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 27 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 28 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 29 | self.buf_infos = [{} for _ in range(self.num_envs)] 30 | self.actions = None 31 | 32 | def step_async(self, actions): 33 | self.actions = actions 34 | 35 | def step_wait(self): 36 | for e in range(self.num_envs): 37 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e]) 38 | if self.buf_dones[e]: 39 | obs = self.envs[e].reset() 40 | self._save_obs(e, obs) 41 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 42 | self.buf_infos.copy()) 43 | 44 | def reset(self): 45 | for e in range(self.num_envs): 46 | obs = self.envs[e].reset() 47 | self._save_obs(e, obs) 48 | return self._obs_from_buf() 49 | 50 | def close(self): 51 | return 52 | 53 | def _save_obs(self, e, obs): 54 | for k in self.keys: 55 | if k is None: 56 | self.buf_obs[k][e] = obs 57 | else: 58 | self.buf_obs[k][e] = obs[k] 59 | 60 | def 
_obs_from_buf(self): 61 | if self.keys==[None]: 62 | return self.buf_obs[None] 63 | else: 64 | return self.buf_obs 65 | -------------------------------------------------------------------------------- /baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 4 | 5 | 6 | def worker(remote, parent_remote, env_fn_wrapper): 7 | parent_remote.close() 8 | env = env_fn_wrapper.x() 9 | while True: 10 | cmd, data = remote.recv() 11 | if cmd == 'step': 12 | ob, reward, done, info = env.step(data) 13 | if done: 14 | ob = env.reset() 15 | remote.send((ob, reward, done, info)) 16 | elif cmd == 'reset': 17 | ob = env.reset() 18 | remote.send(ob) 19 | elif cmd == 'reset_task': 20 | ob = env.reset_task() 21 | remote.send(ob) 22 | elif cmd == 'close': 23 | remote.close() 24 | break 25 | elif cmd == 'get_spaces': 26 | remote.send((env.observation_space, env.action_space)) 27 | else: 28 | raise NotImplementedError 29 | 30 | 31 | class SubprocVecEnv(VecEnv): 32 | def __init__(self, env_fns, spaces=None): 33 | """ 34 | envs: list of gym environments to run in subprocesses 35 | """ 36 | self.waiting = False 37 | self.closed = False 38 | nenvs = len(env_fns) 39 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 40 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 41 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 42 | for p in self.ps: 43 | p.daemon = True # if the main process crashes, we should not cause things to hang 44 | p.start() 45 | for remote in self.work_remotes: 46 | remote.close() 47 | 48 | self.remotes[0].send(('get_spaces', None)) 49 | observation_space, action_space = self.remotes[0].recv() 50 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 51 | 52 | def step_async(self, actions): 53 | for remote, action in zip(self.remotes, actions): 54 | remote.send(('step', action)) 55 | self.waiting = True 56 | 57 | def step_wait(self): 58 | results = [remote.recv() for remote in self.remotes] 59 | self.waiting = False 60 | obs, rews, dones, infos = zip(*results) 61 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 62 | 63 | def reset(self): 64 | for remote in self.remotes: 65 | remote.send(('reset', None)) 66 | return np.stack([remote.recv() for remote in self.remotes]) 67 | 68 | def reset_task(self): 69 | for remote in self.remotes: 70 | remote.send(('reset_task', None)) 71 | return np.stack([remote.recv() for remote in self.remotes]) 72 | 73 | def close(self): 74 | if self.closed: 75 | return 76 | if self.waiting: 77 | for remote in self.remotes: 78 | remote.recv() 79 | for remote in self.remotes: 80 | remote.send(('close', None)) 81 | for p in self.ps: 82 | p.join() 83 | self.closed = True 84 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | class VecFrameStack(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, nstack): 10 | self.venv = venv 11 | self.nstack = nstack 12 | wos = venv.observation_space # wrapped ob space 13 | low = np.repeat(wos.low, 
self.nstack, axis=-1) 14 | high = np.repeat(wos.high, self.nstack, axis=-1) 15 | self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype) 16 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 17 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 18 | 19 | def step_wait(self): 20 | obs, rews, news, infos = self.venv.step_wait() 21 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 22 | for (i, new) in enumerate(news): 23 | if new: 24 | self.stackedobs[i] = 0 25 | self.stackedobs[..., -obs.shape[-1]:] = obs 26 | return self.stackedobs, rews, news, infos 27 | 28 | def reset(self): 29 | """ 30 | Reset all environments 31 | """ 32 | obs = self.venv.reset() 33 | self.stackedobs[...] = 0 34 | self.stackedobs[..., -obs.shape[-1]:] = obs 35 | return self.stackedobs 36 | 37 | def close(self): 38 | self.venv.close() 39 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | class VecNormalize(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 10 | VecEnvWrapper.__init__(self, venv) 11 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 12 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 13 | self.clipob = clipob 14 | self.cliprew = cliprew 15 | self.ret = np.zeros(self.num_envs) 16 | self.gamma = gamma 17 | self.epsilon = epsilon 18 | 19 | def step_wait(self): 20 | """ 21 | Apply sequence of actions to sequence of environments 22 | actions -> (observations, rewards, news) 23 | 24 | where 'news' is a boolean vector indicating whether each element is new. 25 | """ 26 | obs, rews, news, infos = self.venv.step_wait() 27 | self.ret = self.ret * self.gamma + rews 28 | obs = self._obfilt(obs) 29 | if self.ret_rms: 30 | self.ret_rms.update(self.ret) 31 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 32 | return obs, rews, news, infos 33 | 34 | def _obfilt(self, obs): 35 | if self.ob_rms: 36 | self.ob_rms.update(obs) 37 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 38 | return obs 39 | else: 40 | return obs 41 | 42 | def reset(self): 43 | """ 44 | Reset all environments 45 | """ 46 | obs = self.venv.reset() 47 | return self._obfilt(obs) 48 | -------------------------------------------------------------------------------- /baselines/her/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruizhaogit/EnergyBasedPrioritization/2fd2f5bab0547848f4f76b837d16238435518dcc/baselines/her/__init__.py -------------------------------------------------------------------------------- /baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from baselines.her.util import store_args, nn 3 | 4 | 5 | class ActorCritic: 6 | @store_args 7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, 8 | **kwargs): 9 | """The actor-critic network and related training code. 
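    Summary added for clarity: the constructor builds three tensors consumed by DDPG --
    `pi_tf` (the deterministic policy, squashed by tanh and scaled by max_u), `Q_pi_tf`
    (the critic evaluated at the policy's own action, used for the actor loss) and
    `Q_tf` (the critic evaluated at the replayed action, used for the critic loss).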
10 | 11 | Args: 12 | inputs_tf (dict of tensors): all necessary inputs for the network: the 13 | observation (o), the goal (g), and the action (u) 14 | dimo (int): the dimension of the observations 15 | dimg (int): the dimension of the goals 16 | dimu (int): the dimension of the actions 17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled 18 | accordingly 19 | o_stats (baselines.her.Normalizer): normalizer for observations 20 | g_stats (baselines.her.Normalizer): normalizer for goals 21 | hidden (int): number of hidden units that should be used in hidden layers 22 | layers (int): number of hidden layers 23 | """ 24 | self.o_tf = inputs_tf['o'] 25 | self.g_tf = inputs_tf['g'] 26 | self.u_tf = inputs_tf['u'] 27 | 28 | # Prepare inputs for actor and critic. 29 | o = self.o_stats.normalize(self.o_tf) 30 | g = self.g_stats.normalize(self.g_tf) 31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor 32 | 33 | # Networks. 34 | with tf.variable_scope('pi'): 35 | self.pi_tf = self.max_u * tf.tanh(nn( 36 | input_pi, [self.hidden] * self.layers + [self.dimu])) 37 | with tf.variable_scope('Q'): 38 | # for policy training 39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) 40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) 41 | # for critic training 42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) 43 | self._input_Q = input_Q # exposed for tests 44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) 45 | -------------------------------------------------------------------------------- /baselines/her/ddpg.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.contrib.staging import StagingArea 5 | from baselines import logger 6 | from baselines.her.util import ( 7 | import_function, store_args, flatten_grads, transitions_in_episode_batch) 8 | from baselines.her.normalizer import Normalizer 9 | from baselines.her.replay_buffer import ReplayBuffer, ReplayBufferEnergy, PrioritizedReplayBuffer 10 | from baselines.common.mpi_adam import MpiAdam 11 | import baselines.common.tf_util as U 12 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 13 | import json 14 | 15 | 16 | 17 | def dims_to_shapes(input_dims): 18 | return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()} 19 | 20 | 21 | class DDPG(object): 22 | @store_args 23 | def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, 24 | Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, 25 | rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, 26 | sample_transitions, gamma, temperature, prioritization, env_name, 27 | alpha, beta0, beta_iters, eps, max_timesteps, rank_method, reuse=False, **kwargs): 28 | """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). 29 | 30 | Args: 31 | input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the 32 | actions (u) 33 | buffer_size (int): number of transitions that are stored in the replay buffer 34 | hidden (int): number of units in the hidden layers 35 | layers (int): number of hidden layers 36 | network_class (str): the network class that should be used (e.g. 
'baselines.her.ActorCritic') 37 | polyak (float): coefficient for Polyak-averaging of the target network 38 | batch_size (int): batch size for training 39 | Q_lr (float): learning rate for the Q (critic) network 40 | pi_lr (float): learning rate for the pi (actor) network 41 | norm_eps (float): a small value used in the normalizer to avoid numerical instabilities 42 | norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] 43 | max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] 44 | action_l2 (float): coefficient for L2 penalty on the actions 45 | clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] 46 | scope (str): the scope used for the TensorFlow graph 47 | T (int): the time horizon for rollouts 48 | rollout_batch_size (int): number of parallel rollouts per DDPG agent 49 | subtract_goals (function): function that subtracts goals from each other 50 | relative_goals (boolean): whether or not relative goals should be fed into the network 51 | clip_pos_returns (boolean): whether or not positive returns should be clipped 52 | clip_return (float): clip returns to be in [-clip_return, clip_return] 53 | sample_transitions (function) function that samples from the replay buffer 54 | gamma (float): gamma used for Q learning updates 55 | reuse (boolean): whether or not the networks should be reused 56 | """ 57 | if self.clip_return is None: 58 | self.clip_return = np.inf 59 | 60 | self.create_actor_critic = import_function(self.network_class) 61 | 62 | input_shapes = dims_to_shapes(self.input_dims) 63 | self.dimo = self.input_dims['o'] 64 | self.dimg = self.input_dims['g'] 65 | self.dimu = self.input_dims['u'] 66 | 67 | self.prioritization = prioritization 68 | self.env_name = env_name 69 | self.temperature = temperature 70 | self.rank_method = rank_method 71 | 72 | # Prepare staging area for feeding data to the model. 73 | stage_shapes = OrderedDict() 74 | for key in sorted(self.input_dims.keys()): 75 | if key.startswith('info_'): 76 | continue 77 | stage_shapes[key] = (None, *input_shapes[key]) 78 | for key in ['o', 'g']: 79 | stage_shapes[key + '_2'] = stage_shapes[key] 80 | stage_shapes['r'] = (None,) 81 | stage_shapes['w'] = (None,) 82 | self.stage_shapes = stage_shapes 83 | 84 | # Create network. 85 | with tf.variable_scope(self.scope): 86 | self.staging_tf = StagingArea( 87 | dtypes=[tf.float32 for _ in self.stage_shapes.keys()], 88 | shapes=list(self.stage_shapes.values())) 89 | self.buffer_ph_tf = [ 90 | tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] 91 | self.stage_op = self.staging_tf.put(self.buffer_ph_tf) 92 | 93 | self._create_network(reuse=reuse) 94 | 95 | # Configure the replay buffer. 
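        # Added note: buffer_shapes maps each key to (time, dim); 'o' and 'ag' keep T+1
        # steps per episode while the other keys keep T, and buffer_size is rounded down
        # to a whole multiple of rollout_batch_size so that complete rollouts always fit.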
96 | buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) 97 | for key, val in input_shapes.items()} 98 | buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) 99 | buffer_shapes['ag'] = (self.T+1, self.dimg) 100 | buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size 101 | 102 | if self.prioritization == 'energy': 103 | self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T, self.sample_transitions, 104 | self.prioritization, self.env_name) 105 | elif self.prioritization == 'tderror': 106 | self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha, self.env_name) 107 | if beta_iters is None: 108 | beta_iters = max_timesteps 109 | self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0) 110 | else: 111 | self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) 112 | 113 | def _random_action(self, n): 114 | return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) 115 | 116 | def _preprocess_og(self, o, ag, g): 117 | if self.relative_goals: 118 | g_shape = g.shape 119 | g = g.reshape(-1, self.dimg) 120 | ag = ag.reshape(-1, self.dimg) 121 | g = self.subtract_goals(g, ag) 122 | g = g.reshape(*g_shape) 123 | o = np.clip(o, -self.clip_obs, self.clip_obs) 124 | g = np.clip(g, -self.clip_obs, self.clip_obs) 125 | return o, g 126 | 127 | def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, 128 | compute_Q=False): 129 | o, g = self._preprocess_og(o, ag, g) 130 | policy = self.target if use_target_net else self.main 131 | # values to compute 132 | vals = [policy.pi_tf] 133 | if compute_Q: 134 | vals += [policy.Q_pi_tf] 135 | # feed 136 | feed = { 137 | policy.o_tf: o.reshape(-1, self.dimo), 138 | policy.g_tf: g.reshape(-1, self.dimg), 139 | policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) 140 | } 141 | 142 | ret = self.sess.run(vals, feed_dict=feed) 143 | 144 | # action postprocessing 145 | u = ret[0] 146 | noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise 147 | u += noise 148 | u = np.clip(u, -self.max_u, self.max_u) 149 | u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy 150 | if u.shape[0] == 1: 151 | u = u[0] 152 | u = u.copy() 153 | ret[0] = u 154 | 155 | if len(ret) == 1: 156 | return ret[0] 157 | else: 158 | return ret 159 | 160 | def get_td_errors(self, o, g, u): 161 | o, g = self._preprocess_og(o, g, g) 162 | vals = [self.td_error_tf] 163 | r = np.ones((o.reshape(-1, self.dimo).shape[0],1)) 164 | 165 | feed = { 166 | self.target.o_tf: o.reshape(-1, self.dimo), 167 | self.target.g_tf: g.reshape(-1, self.dimg), 168 | self.bath_tf_r: r, 169 | self.main.o_tf: o.reshape(-1, self.dimo), 170 | self.main.g_tf: g.reshape(-1, self.dimg), 171 | self.main.u_tf: u.reshape(-1, self.dimu) 172 | } 173 | td_errors = self.sess.run(vals, feed_dict=feed) 174 | td_errors = td_errors.copy() 175 | 176 | return td_errors 177 | 178 | def store_episode(self, episode_batch, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy, update_stats=True): 179 | """ 180 | episode_batch: array of batch_size x (T or T+1) x dim_key 181 | 'o' is of size T+1, others are of size T 182 | """ 183 | if self.prioritization == 'tderror': 184 | self.buffer.store_episode(episode_batch, dump_buffer) 185 | elif self.prioritization == 'energy': 186 | 
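        # Added note (summarizing the EBP paper): the energy-based buffer derives a trajectory
        # energy from the achieved-goal sequence, combining potential, linear-kinetic and
        # rotational-kinetic terms weighted by w_potential / w_linear / w_rotational, clips it
        # at clip_energy, and later samples episodes with probability proportional to that
        # energy (or to its rank, depending on rank_method).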
self.buffer.store_episode(episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy) 187 | else: 188 | self.buffer.store_episode(episode_batch) 189 | 190 | if update_stats: 191 | # add transitions to normalizer 192 | episode_batch['o_2'] = episode_batch['o'][:, 1:, :] 193 | episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] 194 | num_normalizing_transitions = transitions_in_episode_batch(episode_batch) 195 | 196 | if self.prioritization == 'energy': 197 | if not self.buffer.current_size==0 and not len(episode_batch['ag'])==0: 198 | transitions = self.sample_transitions(episode_batch, num_normalizing_transitions, 'none', 1.0, True) 199 | elif self.prioritization == 'tderror': 200 | transitions, weights, episode_idxs = \ 201 | self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0) 202 | else: 203 | transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) 204 | 205 | 206 | o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] 207 | transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) 208 | 209 | self.o_stats.update(transitions['o']) 210 | self.g_stats.update(transitions['g']) 211 | 212 | self.o_stats.recompute_stats() 213 | self.g_stats.recompute_stats() 214 | 215 | def get_current_buffer_size(self): 216 | return self.buffer.get_current_size() 217 | 218 | def dump_buffer(self, epoch): 219 | self.buffer.dump_buffer(epoch) 220 | 221 | def _sync_optimizers(self): 222 | self.Q_adam.sync() 223 | self.pi_adam.sync() 224 | 225 | def _grads(self): 226 | # Avoid feed_dict here for performance! 227 | critic_loss, actor_loss, Q_grad, pi_grad, td_error = self.sess.run([ 228 | self.Q_loss_tf, 229 | self.main.Q_pi_tf, 230 | self.Q_grad_tf, 231 | self.pi_grad_tf, 232 | self.td_error_tf 233 | ]) 234 | return critic_loss, actor_loss, Q_grad, pi_grad, td_error 235 | 236 | def _update(self, Q_grad, pi_grad): 237 | self.Q_adam.update(Q_grad, self.Q_lr) 238 | self.pi_adam.update(pi_grad, self.pi_lr) 239 | 240 | def sample_batch(self, t): 241 | 242 | if self.prioritization == 'energy': 243 | transitions = self.buffer.sample(self.batch_size, self.rank_method, temperature=self.temperature) 244 | weights = np.ones_like(transitions['r']).copy() 245 | elif self.prioritization == 'tderror': 246 | transitions, weights, idxs = self.buffer.sample(self.batch_size, beta=self.beta_schedule.value(t)) 247 | else: 248 | transitions = self.buffer.sample(self.batch_size) 249 | weights = np.ones_like(transitions['r']).copy() 250 | 251 | o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] 252 | ag, ag_2 = transitions['ag'], transitions['ag_2'] 253 | transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) 254 | transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) 255 | 256 | transitions['w'] = weights.flatten().copy() # note: ordered dict 257 | transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] 258 | 259 | if self.prioritization == 'tderror': 260 | return (transitions_batch, idxs) 261 | else: 262 | return transitions_batch 263 | 264 | def stage_batch(self, t, batch=None): # 265 | if batch is None: 266 | if self.prioritization == 'tderror': 267 | batch, idxs = self.sample_batch(t) 268 | else: 269 | batch = self.sample_batch(t) 270 | assert len(self.buffer_ph_tf) == len(batch) 271 | self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) 272 | 273 | if self.prioritization == 'tderror': 274 | return idxs 275 | 276 | 
def train(self, t, dump_buffer, stage=True): 277 | if not self.buffer.current_size==0: 278 | if stage: 279 | if self.prioritization == 'tderror': 280 | idxs = self.stage_batch(t) 281 | else: 282 | self.stage_batch(t) 283 | critic_loss, actor_loss, Q_grad, pi_grad, td_error = self._grads() 284 | if self.prioritization == 'tderror': 285 | new_priorities = np.abs(td_error) + self.eps # td_error 286 | 287 | if dump_buffer: 288 | T = self.buffer.buffers['u'].shape[1] 289 | episode_idxs = idxs // T 290 | t_samples = idxs % T 291 | batch_size = td_error.shape[0] 292 | with self.buffer.lock: 293 | for i in range(batch_size): 294 | self.buffer.buffers['td'][episode_idxs[i]][t_samples[i]] = td_error[i] 295 | 296 | self.buffer.update_priorities(idxs, new_priorities) 297 | self._update(Q_grad, pi_grad) 298 | return critic_loss, actor_loss 299 | 300 | def _init_target_net(self): 301 | self.sess.run(self.init_target_net_op) 302 | 303 | def update_target_net(self): 304 | self.sess.run(self.update_target_net_op) 305 | 306 | def clear_buffer(self): 307 | self.buffer.clear_buffer() 308 | 309 | def _vars(self, scope): 310 | res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) 311 | assert len(res) > 0 312 | return res 313 | 314 | def _global_vars(self, scope): 315 | res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) 316 | return res 317 | 318 | def _create_network(self, reuse=False): 319 | logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) 320 | 321 | self.sess = tf.get_default_session() 322 | if self.sess is None: 323 | self.sess = tf.InteractiveSession() 324 | 325 | # running averages 326 | with tf.variable_scope('o_stats') as vs: 327 | if reuse: 328 | vs.reuse_variables() 329 | self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) 330 | with tf.variable_scope('g_stats') as vs: 331 | if reuse: 332 | vs.reuse_variables() 333 | self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) 334 | 335 | # mini-batch sampling. 336 | batch = self.staging_tf.get() 337 | batch_tf = OrderedDict([(key, batch[i]) 338 | for i, key in enumerate(self.stage_shapes.keys())]) 339 | batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) 340 | batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) 341 | 342 | # networks 343 | with tf.variable_scope('main') as vs: 344 | if reuse: 345 | vs.reuse_variables() 346 | self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) 347 | vs.reuse_variables() 348 | with tf.variable_scope('target') as vs: 349 | if reuse: 350 | vs.reuse_variables() 351 | target_batch_tf = batch_tf.copy() 352 | target_batch_tf['o'] = batch_tf['o_2'] 353 | target_batch_tf['g'] = batch_tf['g_2'] 354 | self.target = self.create_actor_critic( 355 | target_batch_tf, net_type='target', **self.__dict__) 356 | vs.reuse_variables() 357 | assert len(self._vars("main")) == len(self._vars("target")) 358 | 359 | # loss functions 360 | target_Q_pi_tf = self.target.Q_pi_tf 361 | clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) 362 | target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) 363 | 364 | self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf 365 | self.errors_tf = tf.square(self.td_error_tf) 366 | self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf) 367 | self.Q_loss_tf = tf.reduce_mean(self.errors_tf) 368 | 369 | self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) 370 | self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) 371 | Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) 372 | pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) 373 | assert len(self._vars('main/Q')) == len(Q_grads_tf) 374 | assert len(self._vars('main/pi')) == len(pi_grads_tf) 375 | self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) 376 | self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) 377 | self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) 378 | self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) 379 | 380 | # optimizers 381 | self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) 382 | self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) 383 | 384 | # polyak averaging 385 | self.main_vars = self._vars('main/Q') + self._vars('main/pi') 386 | self.target_vars = self._vars('target/Q') + self._vars('target/pi') 387 | self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') 388 | self.init_target_net_op = list( 389 | map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) 390 | self.update_target_net_op = list( 391 | map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) 392 | 393 | # initialize all variables 394 | tf.variables_initializer(self._global_vars('')).run() 395 | self._sync_optimizers() 396 | self._init_target_net() 397 | 398 | def logs(self, prefix=''): 399 | logs = [] 400 | logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] 401 | logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] 402 | logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] 403 | logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] 404 | 405 | if prefix is not '' and not prefix.endswith('/'): 406 | return [(prefix + '/' + key, val) for key, val in logs] 407 | else: 408 | return logs 409 | 410 | def __getstate__(self): 411 | """Our policies can be loaded from pkl, but after unpickling you cannot continue training. 412 | """ 413 | excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 414 | 'main', 'target', 'lock', 'env', 'sample_transitions', 415 | 'stage_shapes', 'create_actor_critic'] 416 | 417 | state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} 418 | state['buffer_size'] = self.buffer_size 419 | state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) 420 | return state 421 | 422 | def __setstate__(self, state): 423 | if 'sample_transitions' not in state: 424 | # We don't need this for playing the policy. 
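# (sample_transitions and env_name are only needed for training; a policy restored
# from a pickle can still act through get_actions, as done in experiment/play.py.)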
425 | state['sample_transitions'] = None 426 | state['env_name'] = None # No need for playing the policy 427 | 428 | self.__init__(**state) 429 | # set up stats (they are overwritten in __init__) 430 | for k, v in state.items(): 431 | if k[-6:] == '_stats': 432 | self.__dict__[k] = v 433 | # load TF variables 434 | vars = [x for x in self._global_vars('') if 'buffer' not in x.name] 435 | assert(len(vars) == len(state["tf"])) 436 | node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] 437 | self.sess.run(node) 438 | -------------------------------------------------------------------------------- /baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruizhaogit/EnergyBasedPrioritization/2fd2f5bab0547848f4f76b837d16238435518dcc/baselines/her/experiment/__init__.py -------------------------------------------------------------------------------- /baselines/her/experiment/config.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import json 4 | import os 5 | import gym 6 | 7 | from baselines import logger 8 | from baselines.her.ddpg import DDPG 9 | from baselines.her.her import make_sample_her_transitions, \ 10 | make_sample_her_transitions_energy, \ 11 | make_sample_her_transitions_prioritized_replay 12 | 13 | 14 | DEFAULT_ENV_PARAMS = { 15 | 'FetchReach-v0': { 16 | 'n_cycles': 10, 17 | }, 18 | } 19 | 20 | 21 | DEFAULT_PARAMS = { 22 | # env 23 | 'max_u': 1., # max absolute value of actions on different coordinates 24 | # ddpg 25 | 'layers': 3, # number of layers in the critic/actor networks 26 | 'hidden': 256, # number of neurons in each hidden layers 27 | 'network_class': 'baselines.her.actor_critic:ActorCritic', 28 | 'Q_lr': 0.001, # critic learning rate 29 | 'pi_lr': 0.001, # actor learning rate 30 | 'buffer_size': int(1E6), # int(1E6) int(1E6) bug for experience replay 31 | 'polyak': 0.95, # polyak averaging coefficient 32 | 'action_l2': 1.0, # quadratic penalty on actions (before rescaling by max_u) 33 | 'clip_obs': 200., 34 | 'scope': 'ddpg', # can be tweaked for testing 35 | 'relative_goals': False, 36 | # training 37 | 'n_cycles': 50, # per epoch 38 | 'rollout_batch_size': 2, # per mpi thread 39 | 'n_batches': 40, # training batches per cycle 40 | 'batch_size': 256, # per mpi thread, measured in transitions and reduced to even multiple of chunk_length. 
41 | 'n_test_rollouts': 10, # number of test rollouts per epoch, each consists of rollout_batch_size rollouts 42 | 'test_with_polyak': False, # run test episodes with the target network 43 | # exploration 44 | 'random_eps': 0.3, # percentage of time a random action is taken 45 | 'noise_eps': 0.2, # std of gaussian noise added to not-completely-random actions as a percentage of max_u 46 | # HER 47 | 'replay_strategy': 'future', # supported modes: future, none 48 | 'replay_k': 4, # number of additional goals used for replay, only used if replay_strategy=future 49 | # normalization 50 | 'norm_eps': 0.01, # epsilon used for observation normalization 51 | 'norm_clip': 5, # normalized observations are clipped to this value 52 | 53 | # prioritized_replay (tderror) 54 | 'alpha': 0.6, # 0.6 55 | 'beta0': 0.4, # 0.4 56 | 'beta_iters': None, # None 57 | 'eps': 1e-6, 58 | 59 | # energy-based prioritization 60 | 'w_potential': 1.0, 61 | 'w_linear': 1.0, 62 | 'w_rotational': 1.0, 63 | } 64 | 65 | 66 | CACHED_ENVS = {} 67 | def cached_make_env(make_env): 68 | """ 69 | Only creates a new environment from the provided function if one has not yet already been 70 | created. This is useful here because we need to infer certain properties of the env, e.g. 71 | its observation and action spaces, without any intent of actually using it. 72 | """ 73 | if make_env not in CACHED_ENVS: 74 | env = make_env() 75 | CACHED_ENVS[make_env] = env 76 | return CACHED_ENVS[make_env] 77 | 78 | 79 | def prepare_params(kwargs): 80 | # DDPG params 81 | ddpg_params = dict() 82 | 83 | env_name = kwargs['env_name'] 84 | def make_env(): 85 | return gym.make(env_name) 86 | kwargs['make_env'] = make_env 87 | tmp_env = cached_make_env(kwargs['make_env']) 88 | assert hasattr(tmp_env, '_max_episode_steps') 89 | kwargs['T'] = tmp_env._max_episode_steps 90 | tmp_env.reset() 91 | kwargs['max_u'] = np.array(kwargs['max_u']) if type(kwargs['max_u']) == list else kwargs['max_u'] 92 | kwargs['gamma'] = 1. - 1. / kwargs['T'] 93 | if 'lr' in kwargs: 94 | kwargs['pi_lr'] = kwargs['lr'] 95 | kwargs['Q_lr'] = kwargs['lr'] 96 | del kwargs['lr'] 97 | for name in ['buffer_size', 'hidden', 'layers', 98 | 'network_class', 99 | 'polyak', 100 | 'batch_size', 'Q_lr', 'pi_lr', 101 | 'norm_eps', 'norm_clip', 'max_u', 102 | 'action_l2', 'clip_obs', 'scope', 'relative_goals', 103 | 'alpha', 'beta0', 'beta_iters', 'eps']: 104 | ddpg_params[name] = kwargs[name] 105 | kwargs['_' + name] = kwargs[name] 106 | del kwargs[name] 107 | kwargs['ddpg_params'] = ddpg_params 108 | 109 | return kwargs 110 | 111 | 112 | def log_params(params, logger=logger): 113 | for key in sorted(params.keys()): 114 | logger.info('{}: {}'.format(key, params[key])) 115 | 116 | 117 | def configure_her(params): 118 | env = cached_make_env(params['make_env']) 119 | env.reset() 120 | def reward_fun(ag_2, g, info): # vectorized 121 | return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info) 122 | 123 | # Prepare configuration for HER.
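# replay_k sets the HER ratio: with the default replay_k = 4 the samplers built below
# use future_p = 1 - 1/(1 + 4) = 0.8, i.e. roughly 80% of replayed transitions have
# their goal replaced by a future achieved goal from the same episode.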
124 | her_params = { 125 | 'reward_fun': reward_fun, 126 | } 127 | for name in ['replay_strategy', 'replay_k']: 128 | her_params[name] = params[name] 129 | params['_' + name] = her_params[name] 130 | del params[name] 131 | 132 | if params['prioritization'] == 'energy': 133 | sample_her_transitions = make_sample_her_transitions_energy(**her_params) 134 | elif params['prioritization'] == 'tderror': 135 | sample_her_transitions = make_sample_her_transitions_prioritized_replay(**her_params) 136 | else: 137 | sample_her_transitions = make_sample_her_transitions(**her_params) 138 | 139 | return sample_her_transitions 140 | 141 | 142 | def simple_goal_subtract(a, b): 143 | assert a.shape == b.shape 144 | return a - b 145 | 146 | 147 | def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): 148 | sample_her_transitions = configure_her(params) 149 | # Extract relevant parameters. 150 | gamma = params['gamma'] 151 | rollout_batch_size = params['rollout_batch_size'] 152 | ddpg_params = params['ddpg_params'] 153 | temperature = params['temperature'] 154 | prioritization = params['prioritization'] 155 | env_name = params['env_name'] 156 | max_timesteps = params['max_timesteps'] 157 | rank_method = params['rank_method'] 158 | 159 | input_dims = dims.copy() 160 | 161 | # DDPG agent 162 | env = cached_make_env(params['make_env']) 163 | env.reset() 164 | ddpg_params.update({'input_dims': input_dims, # agent takes an input observations 165 | 'T': params['T'], 166 | 'clip_pos_returns': True, # clip positive returns 167 | 'clip_return': (1. / (1. - gamma)) if clip_return else np.inf, # max abs of return 168 | 'rollout_batch_size': rollout_batch_size, 169 | 'subtract_goals': simple_goal_subtract, 170 | 'sample_transitions': sample_her_transitions, 171 | 'gamma': gamma, 172 | 'temperature': temperature, 173 | 'prioritization': prioritization, 174 | 'env_name': env_name, 175 | 'max_timesteps': max_timesteps, 176 | 'rank_method': rank_method, 177 | }) 178 | ddpg_params['info'] = { 179 | 'env_name': params['env_name'], 180 | } 181 | policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi) 182 | return policy 183 | 184 | 185 | def configure_dims(params): 186 | env = cached_make_env(params['make_env']) 187 | env.reset() 188 | obs, _, _, info = env.step(env.action_space.sample()) 189 | 190 | dims = { 191 | 'o': obs['observation'].shape[0], 192 | 'u': env.action_space.shape[0], 193 | 'g': obs['desired_goal'].shape[0], 194 | } 195 | for key, value in info.items(): 196 | value = np.array(value) 197 | if value.ndim == 0: 198 | value = value.reshape(1) 199 | dims['info_{}'.format(key)] = value.shape[0] 200 | return dims 201 | -------------------------------------------------------------------------------- /baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | import click 2 | import numpy as np 3 | import pickle 4 | 5 | from baselines import logger 6 | from baselines.common import set_global_seeds 7 | import baselines.her.experiment.config as config 8 | from baselines.her.rollout import RolloutWorker 9 | 10 | 11 | @click.command() 12 | @click.argument('policy_file', type=str) 13 | @click.option('--seed', type=int, default=0) 14 | @click.option('--n_test_rollouts', type=int, default=20) 15 | @click.option('--render', type=int, default=1) 16 | def main(policy_file, seed, n_test_rollouts, render): 17 | set_global_seeds(seed) 18 | 19 | # Load policy. 
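# The pickle is written by RolloutWorker.save_policy; DDPG.__setstate__ rebuilds the
# TF graph and restores the saved variables, so the loaded policy can act but cannot
# resume training (see ddpg.py).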
20 | with open(policy_file, 'rb') as f: 21 | policy = pickle.load(f) 22 | env_name = policy.info['env_name'] 23 | 24 | # Prepare params. 25 | params = config.DEFAULT_PARAMS 26 | if env_name in config.DEFAULT_ENV_PARAMS: 27 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 28 | params['env_name'] = env_name 29 | params = config.prepare_params(params) 30 | config.log_params(params, logger=logger) 31 | 32 | dims = config.configure_dims(params) 33 | 34 | eval_params = { 35 | 'exploit': True, 36 | 'use_target_net': params['test_with_polyak'], 37 | 'compute_Q': True, 38 | 'rollout_batch_size': 1, 39 | 'render': bool(render), 40 | } 41 | 42 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']: 43 | eval_params[name] = params[name] 44 | 45 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 46 | evaluator.seed(seed) 47 | 48 | # Run evaluation. 49 | evaluator.clear_history() 50 | for _ in range(n_test_rollouts): 51 | evaluator.generate_rollouts() 52 | 53 | # record logs 54 | for key, val in evaluator.logs('test'): 55 | logger.record_tabular(key, np.mean(val)) 56 | logger.dump_tabular() 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /baselines/her/experiment/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import click 5 | import numpy as np 6 | import json 7 | from mpi4py import MPI 8 | 9 | from baselines import logger 10 | from baselines.common import set_global_seeds 11 | from baselines.common.mpi_moments import mpi_moments 12 | import baselines.her.experiment.config as config 13 | from baselines.her.rollout import RolloutWorker 14 | from baselines.her.util import mpi_fork 15 | 16 | import os.path as osp 17 | import tempfile 18 | import datetime 19 | 20 | 21 | def mpi_average(value): 22 | if value == []: 23 | value = [0.] 
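# mpi_moments averages across all MPI workers; indexing [0] below keeps only the mean.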
24 | if not isinstance(value, list): 25 | value = [value] 26 | return mpi_moments(np.array(value))[0] 27 | 28 | 29 | def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, 30 | policy_save_interval, save_policies, num_cpu, dump_buffer, w_potential, w_linear, 31 | w_rotational, rank_method, clip_energy, **kwargs): 32 | rank = MPI.COMM_WORLD.Get_rank() 33 | 34 | latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') 35 | best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl') 36 | periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl') 37 | 38 | logger.info("Training...") 39 | best_success_rate = -1 40 | t = 1 41 | for epoch in range(n_epochs): 42 | # train 43 | rollout_worker.clear_history() 44 | for cycle in range(n_cycles): 45 | episode = rollout_worker.generate_rollouts() 46 | policy.store_episode(episode, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy) 47 | for batch in range(n_batches): 48 | t = ((epoch*n_cycles*n_batches)+(cycle*n_batches)+batch)*num_cpu 49 | policy.train(t, dump_buffer) 50 | 51 | policy.update_target_net() 52 | 53 | # test 54 | evaluator.clear_history() 55 | for _ in range(n_test_rollouts): 56 | evaluator.generate_rollouts() 57 | 58 | # record logs 59 | logger.record_tabular('epoch', epoch) 60 | for key, val in evaluator.logs('test'): 61 | logger.record_tabular(key, mpi_average(val)) 62 | for key, val in rollout_worker.logs('train'): 63 | logger.record_tabular(key, mpi_average(val)) 64 | for key, val in policy.logs(): 65 | logger.record_tabular(key, mpi_average(val)) 66 | 67 | if rank == 0: 68 | logger.dump_tabular() 69 | 70 | if dump_buffer: 71 | policy.dump_buffer(epoch) 72 | 73 | # save the policy if it's better than the previous ones 74 | success_rate = mpi_average(evaluator.current_success_rate()) 75 | if rank == 0 and success_rate >= best_success_rate and save_policies: 76 | best_success_rate = success_rate 77 | logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) 78 | evaluator.save_policy(best_policy_path) 79 | evaluator.save_policy(latest_policy_path) 80 | if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: 81 | policy_path = periodic_policy_path.format(epoch) 82 | logger.info('Saving periodic policy to {} ...'.format(policy_path)) 83 | evaluator.save_policy(policy_path) 84 | 85 | # make sure that different threads have different seeds 86 | local_uniform = np.random.uniform(size=(1,)) 87 | root_uniform = local_uniform.copy() 88 | MPI.COMM_WORLD.Bcast(root_uniform, root=0) 89 | if rank != 0: 90 | assert local_uniform[0] != root_uniform[0] 91 | 92 | 93 | def launch( 94 | env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, 95 | temperature, prioritization, binding, logging, version, dump_buffer, n_cycles, rank_method, 96 | w_potential, w_linear, w_rotational, clip_energy, override_params={}, save_policies=True): 97 | 98 | # Fork for multi-CPU MPI implementation. 
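# mpi_fork (baselines/her/util.py) re-launches this script with num_cpu MPI workers;
# the parent process exits immediately, and each worker continues below with its own
# MPI rank and a rank-dependent random seed (set further down).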
99 | if num_cpu > 1: 100 | whoami = mpi_fork(num_cpu, binding) 101 | if whoami == 'parent': 102 | sys.exit(0) 103 | import baselines.common.tf_util as U 104 | U.single_threaded_session().__enter__() 105 | rank = MPI.COMM_WORLD.Get_rank() 106 | 107 | # Configure logging 108 | 109 | if logging: 110 | logdir = 'logs/'+str(env_name)+'-temperature'+str(temperature)+\ 111 | '-prioritization'+str(prioritization)+'-replay_strategy'+str(replay_strategy)+\ 112 | '-n_epochs'+str(n_epochs)+'-num_cpu'+str(num_cpu)+'-seed'+str(seed)+\ 113 | '-n_cycles'+str(n_cycles)+'-rank_method'+str(rank_method)+\ 114 | '-w_potential'+str(w_potential)+'-w_linear'+str(w_linear)+'-w_rotational'+str(w_rotational)+\ 115 | '-clip_energy'+str(clip_energy)+\ 116 | '-version'+str(version) 117 | else: 118 | logdir = osp.join(tempfile.gettempdir(), 119 | datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) 120 | 121 | if rank == 0: 122 | if logdir or logger.get_dir() is None: 123 | logger.configure(dir=logdir) 124 | else: 125 | logger.configure() 126 | logdir = logger.get_dir() 127 | assert logdir is not None 128 | os.makedirs(logdir, exist_ok=True) 129 | 130 | # Seed everything. 131 | rank_seed = seed + 1000000 * rank 132 | set_global_seeds(rank_seed) 133 | 134 | # Prepare params. 135 | params = config.DEFAULT_PARAMS 136 | params['env_name'] = env_name 137 | params['replay_strategy'] = replay_strategy 138 | params['temperature'] = temperature 139 | params['prioritization'] = prioritization 140 | params['binding'] = binding 141 | params['max_timesteps'] = n_epochs * params['n_cycles'] * params['n_batches'] * num_cpu 142 | params['version'] = version 143 | params['dump_buffer'] = dump_buffer 144 | params['n_cycles'] = n_cycles 145 | params['rank_method'] = rank_method 146 | params['w_potential'] = w_potential 147 | params['w_linear'] = w_linear 148 | params['w_rotational'] = w_rotational 149 | params['clip_energy'] = clip_energy 150 | params['n_epochs'] = n_epochs 151 | params['num_cpu'] = num_cpu 152 | 153 | if params['dump_buffer']: 154 | params['alpha'] =0 155 | 156 | if env_name in config.DEFAULT_ENV_PARAMS: 157 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 158 | params.update(**override_params) # makes it possible to override any parameter 159 | with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: 160 | json.dump(params, f) 161 | params = config.prepare_params(params) 162 | config.log_params(params, logger=logger) 163 | 164 | dims = config.configure_dims(params) 165 | policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) 166 | 167 | rollout_params = { 168 | 'exploit': False, 169 | 'use_target_net': False, 170 | 'use_demo_states': True, 171 | 'compute_Q': False, 172 | 'T': params['T'], 173 | } 174 | 175 | eval_params = { 176 | 'exploit': True, 177 | 'use_target_net': params['test_with_polyak'], 178 | 'use_demo_states': False, 179 | 'compute_Q': True, 180 | 'T': params['T'], 181 | } 182 | 183 | for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: 184 | rollout_params[name] = params[name] 185 | eval_params[name] = params[name] 186 | 187 | rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) 188 | rollout_worker.seed(rank_seed) 189 | 190 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 191 | evaluator.seed(rank_seed) 192 | 193 | train( 194 | logdir=logdir, policy=policy, rollout_worker=rollout_worker, 195 | evaluator=evaluator, 
n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], 196 | n_cycles=params['n_cycles'], n_batches=params['n_batches'], 197 | policy_save_interval=policy_save_interval, save_policies=save_policies, 198 | num_cpu=num_cpu, dump_buffer=dump_buffer, w_potential=params['w_potential'], 199 | w_linear=params['w_linear'], w_rotational=params['w_rotational'], rank_method=rank_method, 200 | clip_energy=clip_energy) 201 | 202 | 203 | @click.command() 204 | @click.option('--env_name', type=click.Choice(['FetchPickAndPlace-v0', 'HandManipulateBlockFull-v0', \ 205 | 'HandManipulateEggFull-v0', 'HandManipulatePenRotate-v0']), default='FetchPickAndPlace-v0', help='the name of the OpenAI Gym \ 206 | environment that you want to train on. We tested EBP on four challenging robotic manipulation tasks, including: \ 207 | FetchPickAndPlace-v0, HandManipulateBlockFull-v0, HandManipulateEggFull-v0, HandManipulatePenRotate-v0') 208 | @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run') 209 | @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)') 210 | @click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code') 211 | @click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.') 212 | @click.option('--replay_strategy', type=click.Choice(['future', 'final', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') 213 | @click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped') 214 | @click.option('--temperature', type=float, default=1.0, help='temperature value for Energy-Based Prioritization (EBP)') 215 | @click.option('--prioritization', type=click.Choice(['none', 'energy', 'tderror']), default='energy', help='the prioritization strategy to be used. 
"energy" uses EBP;\ 216 | "none" is vanilla HER; tderror is Prioritized Experience Replay.') 217 | @click.option('--binding', type=click.Choice(['none', 'core']), default='core', help='configure mpi using bind-to none or core.') 218 | @click.option('--logging', type=bool, default=False, help='whether or not logging') 219 | @click.option('--version', type=int, default=0, help='version') 220 | @click.option('--dump_buffer', type=bool, default=False, help='dump buffer contains achieved goals, energy, tderrors for analysis') 221 | @click.option('--n_cycles', type=int, default=50, help='n_cycles') 222 | @click.option('--rank_method', type=click.Choice(['none', 'min', 'dense', 'average']), default='none', help='energy ranking method') 223 | @click.option('--w_potential', type=float, default=1.0, help='w_potential') 224 | @click.option('--w_linear', type=float, default=1.0, help='w_linear') 225 | @click.option('--w_rotational', type=float, default=1.0, help='w_rotational') 226 | @click.option('--clip_energy', type=float, default=999, help='clip_energy') 227 | 228 | def main(**kwargs): 229 | launch(**kwargs) 230 | 231 | if __name__ == '__main__': 232 | main() 233 | -------------------------------------------------------------------------------- /baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from scipy.stats import rankdata 4 | 5 | import random 6 | 7 | 8 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 9 | """Creates a sample function that can be used for HER experience replay. 10 | 11 | Args: 12 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', 13 | regular DDPG experience replay is used 14 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 15 | as many HER replays as regular replays are used) 16 | reward_fun (function): function to re-compute the reward with substituted goals 17 | """ 18 | if (replay_strategy == 'future') or (replay_strategy == 'final'): 19 | future_p = 1 - (1. / (1 + replay_k)) 20 | else: # 'replay_strategy' == 'none' 21 | future_p = 0 22 | 23 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 24 | """episode_batch is {key: array(buffer_size x T x dim_key)} 25 | """ 26 | T = episode_batch['u'].shape[1] 27 | rollout_batch_size = episode_batch['u'].shape[0] 28 | batch_size = batch_size_in_transitions 29 | 30 | # Select which episodes and time steps to use. 31 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 32 | t_samples = np.random.randint(T, size=batch_size) 33 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 34 | for key in episode_batch.keys()} 35 | 36 | # Select future time indexes proportional with probability future_p. These 37 | # will be used for HER replay by substituting in future goals. 38 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 39 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 40 | future_offset = future_offset.astype(int) 41 | future_t = (t_samples + 1 + future_offset)[her_indexes] 42 | 43 | if replay_strategy == 'final': 44 | future_t[:] = T 45 | 46 | # Replace goal with achieved goal but only for the previously-selected 47 | # HER transitions (as defined by her_indexes). For the other transitions, 48 | # keep the original goal. 
49 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 50 | transitions['g'][her_indexes] = future_ag 51 | 52 | # Reconstruct info dictionary for reward computation. 53 | info = {} 54 | for key, value in transitions.items(): 55 | if key.startswith('info_'): 56 | info[key.replace('info_', '')] = value 57 | 58 | # Re-compute reward since we may have substituted the goal. 59 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 60 | reward_params['info'] = info 61 | transitions['r'] = reward_fun(**reward_params) 62 | 63 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 64 | for k in transitions.keys()} 65 | 66 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 67 | 68 | return transitions 69 | 70 | return _sample_her_transitions 71 | 72 | 73 | def make_sample_her_transitions_energy(replay_strategy, replay_k, reward_fun): 74 | 75 | if (replay_strategy == 'future') or (replay_strategy == 'final'): 76 | future_p = 1 - (1. / (1 + replay_k)) 77 | else: 78 | future_p = 0 79 | 80 | 81 | def _sample_her_transitions(episode_batch, batch_size_in_transitions, rank_method, temperature, update_stats=False): 82 | 83 | T = episode_batch['u'].shape[1] 84 | rollout_batch_size = episode_batch['u'].shape[0] 85 | batch_size = batch_size_in_transitions 86 | 87 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 88 | t_samples = np.random.randint(T, size=batch_size) 89 | 90 | if not update_stats: 91 | if rank_method == 'none': 92 | energy_trajectory = episode_batch['e'] 93 | else: 94 | energy_trajectory = episode_batch['p'] 95 | p_trajectory = np.power(energy_trajectory, 1/(temperature+1e-2)) 96 | p_trajectory = p_trajectory / p_trajectory.sum() 97 | episode_idxs_energy = np.random.choice(rollout_batch_size, size=batch_size, replace=True, p=p_trajectory.flatten()) 98 | episode_idxs = episode_idxs_energy 99 | 100 | 101 | transitions = {} 102 | for key in episode_batch.keys(): 103 | if not key =='p' and not key == 's' and not key == 'e': 104 | transitions[key] = episode_batch[key][episode_idxs, t_samples].copy() 105 | 106 | # Select future time indexes proportional with probability future_p. These 107 | # will be used for HER replay by substituting in future goals. 108 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 109 | 110 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 111 | future_offset = future_offset.astype(int) 112 | future_t = (t_samples + 1 + future_offset)[her_indexes] 113 | 114 | if replay_strategy == 'final': 115 | future_t[:] = T 116 | 117 | # Replace goal with achieved goal but only for the previously-selected 118 | # HER transitions (as defined by her_indexes). For the other transitions, 119 | # keep the original goal. 120 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 121 | 122 | transitions['g'][her_indexes] = future_ag 123 | 124 | # Reconstruct info dictionary for reward computation. 125 | info = {} 126 | for key, value in transitions.items(): 127 | if key.startswith('info_'): 128 | info[key.replace('info_', '')] = value 129 | 130 | # Re-compute reward since we may have substituted the goal. 
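# reward_fun is the environment's compute_reward (wired up in experiment/config.py); for
# the Gym robotics tasks used here it is sparse by default (0 when the goal is reached,
# -1 otherwise), which is why it has to be recomputed for the substituted goals.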
131 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 132 | reward_params['info'] = info 133 | 134 | transitions['r'] = reward_fun(**reward_params) 135 | 136 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 137 | for k in transitions.keys()} 138 | 139 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 140 | 141 | return transitions 142 | 143 | return _sample_her_transitions 144 | 145 | 146 | def make_sample_her_transitions_prioritized_replay(replay_strategy, replay_k, reward_fun): 147 | 148 | if (replay_strategy == 'future') or (replay_strategy == 'final'): 149 | future_p = 1 - (1. / (1 + replay_k)) 150 | else: 151 | future_p = 0 152 | 153 | def _sample_proportional(self, rollout_batch_size, batch_size, T): 154 | episode_idxs = [] 155 | t_samples = [] 156 | for _ in range(batch_size): 157 | self.n_transitions_stored = min(self.n_transitions_stored, self.size_in_transitions) 158 | mass = random.random() * self._it_sum.sum(0, self.n_transitions_stored - 1) 159 | idx = self._it_sum.find_prefixsum_idx(mass) 160 | assert idx < self.n_transitions_stored 161 | episode_idx = idx//T 162 | assert episode_idx < rollout_batch_size 163 | t_sample = idx%T 164 | episode_idxs.append(episode_idx) 165 | t_samples.append(t_sample) 166 | 167 | return (episode_idxs, t_samples) 168 | 169 | def _sample_her_transitions(self, episode_batch, batch_size_in_transitions, beta): 170 | """episode_batch is {key: array(buffer_size x T x dim_key)} 171 | """ 172 | 173 | T = episode_batch['u'].shape[1] 174 | rollout_batch_size = episode_batch['u'].shape[0] 175 | batch_size = batch_size_in_transitions 176 | 177 | if rollout_batch_size < self.current_size: 178 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 179 | t_samples = np.random.randint(T, size=batch_size) 180 | else: 181 | assert beta >= 0 182 | episode_idxs, t_samples = _sample_proportional(self, rollout_batch_size, batch_size, T) 183 | episode_idxs = np.array(episode_idxs) 184 | t_samples = np.array(t_samples) 185 | 186 | weights = [] 187 | p_min = self._it_min.min() / self._it_sum.sum() 188 | max_weight = (p_min * self.n_transitions_stored) ** (-beta) 189 | 190 | for episode_idx, t_sample in zip(episode_idxs, t_samples): 191 | p_sample = self._it_sum[episode_idx*T+t_sample] / self._it_sum.sum() 192 | weight = (p_sample * self.n_transitions_stored) ** (-beta) 193 | weights.append(weight / max_weight) 194 | 195 | weights = np.array(weights) 196 | 197 | transitions = {} 198 | for key in episode_batch.keys(): 199 | if not key == "td" and not key == "e": 200 | episode_batch_key = episode_batch[key].copy() 201 | transitions[key] = episode_batch_key[episode_idxs, t_samples].copy() 202 | 203 | # Select future time indexes proportional with probability future_p. These 204 | # will be used for HER replay by substituting in future goals. 205 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 206 | 207 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 208 | future_offset = future_offset.astype(int) 209 | future_t = (t_samples + 1 + future_offset)[her_indexes] 210 | 211 | if replay_strategy == 'final': 212 | future_t[:] = T 213 | 214 | # Replace goal with achieved goal but only for the previously-selected 215 | # HER transitions (as defined by her_indexes). For the other transitions, 216 | # keep the original goal. 217 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 218 | 219 | # Reconstruct info dictionary for reward computation. 
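# (The importance weights computed above follow standard prioritized experience replay:
# w_i = (N * P(i))**(-beta), normalized by the largest weight, so beta = 0 disables the
# correction and beta = 1 fully compensates for the non-uniform sampling.)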
220 | info = {} 221 | for key, value in transitions.items(): 222 | if key.startswith('info_'): 223 | info[key.replace('info_', '')] = value 224 | 225 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 226 | reward_params['info'] = info 227 | 228 | transitions['g'][her_indexes] = future_ag 229 | 230 | # Re-compute reward since we may have substituted the goal. 231 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 232 | reward_params['info'] = info 233 | 234 | transitions['r'] = reward_fun(**reward_params) 235 | 236 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 237 | for k in transitions.keys()} 238 | 239 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 240 | 241 | idxs = episode_idxs * T + t_samples 242 | 243 | return (transitions, weights, idxs) 244 | 245 | return _sample_her_transitions 246 | -------------------------------------------------------------------------------- /baselines/her/normalizer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | from mpi4py import MPI 5 | import tensorflow as tf 6 | 7 | from baselines.her.util import reshape_for_broadcasting 8 | 9 | 10 | class Normalizer: 11 | def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): 12 | """A normalizer that ensures that observations are approximately distributed according to 13 | a standard Normal distribution (i.e. have mean zero and variance one). 14 | 15 | Args: 16 | size (int): the size of the observation to be normalized 17 | eps (float): a small constant that avoids underflows 18 | default_clip_range (float): normalized observations are clipped to be in 19 | [-default_clip_range, default_clip_range] 20 | sess (object): the TensorFlow session to be used 21 | """ 22 | self.size = size 23 | self.eps = eps 24 | self.default_clip_range = default_clip_range 25 | self.sess = sess if sess is not None else tf.get_default_session() 26 | 27 | self.local_sum = np.zeros(self.size, np.float32) 28 | self.local_sumsq = np.zeros(self.size, np.float32) 29 | self.local_count = np.zeros(1, np.float32) 30 | 31 | self.sum_tf = tf.get_variable( 32 | initializer=tf.zeros_initializer(), shape=self.local_sum.shape, name='sum', 33 | trainable=False, dtype=tf.float32) 34 | self.sumsq_tf = tf.get_variable( 35 | initializer=tf.zeros_initializer(), shape=self.local_sumsq.shape, name='sumsq', 36 | trainable=False, dtype=tf.float32) 37 | self.count_tf = tf.get_variable( 38 | initializer=tf.ones_initializer(), shape=self.local_count.shape, name='count', 39 | trainable=False, dtype=tf.float32) 40 | self.mean = tf.get_variable( 41 | initializer=tf.zeros_initializer(), shape=(self.size,), name='mean', 42 | trainable=False, dtype=tf.float32) 43 | self.std = tf.get_variable( 44 | initializer=tf.ones_initializer(), shape=(self.size,), name='std', 45 | trainable=False, dtype=tf.float32) 46 | self.count_pl = tf.placeholder(name='count_pl', shape=(1,), dtype=tf.float32) 47 | self.sum_pl = tf.placeholder(name='sum_pl', shape=(self.size,), dtype=tf.float32) 48 | self.sumsq_pl = tf.placeholder(name='sumsq_pl', shape=(self.size,), dtype=tf.float32) 49 | 50 | self.update_op = tf.group( 51 | self.count_tf.assign_add(self.count_pl), 52 | self.sum_tf.assign_add(self.sum_pl), 53 | self.sumsq_tf.assign_add(self.sumsq_pl) 54 | ) 55 | self.recompute_op = tf.group( 56 | tf.assign(self.mean, self.sum_tf / self.count_tf), 57 | tf.assign(self.std, tf.sqrt(tf.maximum( 58 | tf.square(self.eps), 59 | 
self.sumsq_tf / self.count_tf - tf.square(self.sum_tf / self.count_tf) 60 | ))), 61 | ) 62 | self.lock = threading.Lock() 63 | 64 | def update(self, v): 65 | v = v.reshape(-1, self.size) 66 | 67 | with self.lock: 68 | self.local_sum += v.sum(axis=0) 69 | self.local_sumsq += (np.square(v)).sum(axis=0) 70 | self.local_count[0] += v.shape[0] 71 | 72 | def normalize(self, v, clip_range=None): 73 | if clip_range is None: 74 | clip_range = self.default_clip_range 75 | mean = reshape_for_broadcasting(self.mean, v) 76 | std = reshape_for_broadcasting(self.std, v) 77 | return tf.clip_by_value((v - mean) / std, -clip_range, clip_range) 78 | 79 | def denormalize(self, v): 80 | mean = reshape_for_broadcasting(self.mean, v) 81 | std = reshape_for_broadcasting(self.std, v) 82 | return mean + v * std 83 | 84 | def _mpi_average(self, x): 85 | buf = np.zeros_like(x) 86 | MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 87 | buf /= MPI.COMM_WORLD.Get_size() 88 | return buf 89 | 90 | def synchronize(self, local_sum, local_sumsq, local_count, root=None): 91 | local_sum[...] = self._mpi_average(local_sum) 92 | local_sumsq[...] = self._mpi_average(local_sumsq) 93 | local_count[...] = self._mpi_average(local_count) 94 | return local_sum, local_sumsq, local_count 95 | 96 | def recompute_stats(self): 97 | with self.lock: 98 | # Copy over results. 99 | local_count = self.local_count.copy() 100 | local_sum = self.local_sum.copy() 101 | local_sumsq = self.local_sumsq.copy() 102 | 103 | # Reset. 104 | self.local_count[...] = 0 105 | self.local_sum[...] = 0 106 | self.local_sumsq[...] = 0 107 | 108 | # We perform the synchronization outside of the lock to keep the critical section as short 109 | # as possible. 110 | synced_sum, synced_sumsq, synced_count = self.synchronize( 111 | local_sum=local_sum, local_sumsq=local_sumsq, local_count=local_count) 112 | 113 | self.sess.run(self.update_op, feed_dict={ 114 | self.count_pl: synced_count, 115 | self.sum_pl: synced_sum, 116 | self.sumsq_pl: synced_sumsq, 117 | }) 118 | self.sess.run(self.recompute_op) 119 | 120 | 121 | class IdentityNormalizer: 122 | def __init__(self, size, std=1.): 123 | self.size = size 124 | self.mean = tf.zeros(self.size, tf.float32) 125 | self.std = std * tf.ones(self.size, tf.float32) 126 | 127 | def update(self, x): 128 | pass 129 | 130 | def normalize(self, x, clip_range=None): 131 | return x / self.std 132 | 133 | def denormalize(self, x): 134 | return self.std * x 135 | 136 | def synchronize(self): 137 | pass 138 | 139 | def recompute_stats(self): 140 | pass 141 | -------------------------------------------------------------------------------- /baselines/her/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 6 | 7 | import math 8 | 9 | from scipy.stats import rankdata 10 | 11 | import json 12 | 13 | def quaternion_to_euler_angle(array): 14 | w = array[0] 15 | x = array[1] 16 | y = array[2] 17 | z = array[3] 18 | ysqr = y * y 19 | t0 = +2.0 * (w * x + y * z) 20 | t1 = +1.0 - 2.0 * (x * x + ysqr) 21 | X = math.atan2(t0, t1) 22 | t2 = +2.0 * (w * y - z * x) 23 | t2 = +1.0 if t2 > +1.0 else t2 24 | t2 = -1.0 if t2 < -1.0 else t2 25 | Y = math.asin(t2) 26 | t3 = +2.0 * (w * z + x * y) 27 | t4 = +1.0 - 2.0 * (ysqr + z * z) 28 | Z = math.atan2(t3, t4) 29 | result = np.array([X, Y, Z]) 30 | return result 31 | 32 | class ReplayBuffer: 33 | def __init__(self, buffer_shapes, 
size_in_transitions, T, sample_transitions): 34 | """Creates a replay buffer. 35 | 36 | Args: 37 | buffer_shapes (dict of ints): the shape for all buffers that are used in the replay 38 | buffer 39 | size_in_transitions (int): the size of the buffer, measured in transitions 40 | T (int): the time horizon for episodes 41 | sample_transitions (function): a function that samples from the replay buffer 42 | """ 43 | self.buffer_shapes = buffer_shapes 44 | self.size = size_in_transitions // T 45 | self.T = T 46 | self.sample_transitions = sample_transitions 47 | 48 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 49 | self.buffers = {key: np.empty([self.size, *shape]) 50 | for key, shape in buffer_shapes.items()} 51 | 52 | # memory management 53 | self.current_size = 0 54 | self.n_transitions_stored = 0 55 | 56 | self.lock = threading.Lock() 57 | 58 | @property 59 | def full(self): 60 | with self.lock: 61 | return self.current_size == self.size 62 | 63 | def sample(self, batch_size): 64 | """Returns a dict {key: array(batch_size x shapes[key])} 65 | """ 66 | buffers = {} 67 | 68 | with self.lock: 69 | assert self.current_size > 0 70 | for key in self.buffers.keys(): 71 | buffers[key] = self.buffers[key][:self.current_size] 72 | 73 | buffers['o_2'] = buffers['o'][:, 1:, :] 74 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 75 | 76 | transitions = self.sample_transitions(buffers, batch_size) 77 | 78 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 79 | assert key in transitions, "key %s missing from transitions" % key 80 | 81 | return transitions 82 | 83 | def store_episode(self, episode_batch): 84 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 85 | """ 86 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 87 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 88 | batch_size = batch_sizes[0] 89 | 90 | with self.lock: 91 | idxs = self._get_storage_idx(batch_size) 92 | 93 | # load inputs into buffers 94 | for key in self.buffers.keys(): 95 | self.buffers[key][idxs] = episode_batch[key] 96 | 97 | self.n_transitions_stored += batch_size * self.T 98 | 99 | def get_current_episode_size(self): 100 | with self.lock: 101 | return self.current_size 102 | 103 | def get_current_size(self): 104 | with self.lock: 105 | return self.current_size * self.T 106 | 107 | def get_transitions_stored(self): 108 | with self.lock: 109 | return self.n_transitions_stored 110 | 111 | def clear_buffer(self): 112 | with self.lock: 113 | self.current_size = 0 114 | 115 | def _get_storage_idx(self, inc=None): 116 | inc = inc or 1 # size increment 117 | assert inc <= self.size, "Batch committed to replay is too large!" 118 | # go consecutively until you hit the end, and then go randomly. 119 | if self.current_size+inc <= self.size: 120 | idx = np.arange(self.current_size, self.current_size+inc) 121 | elif self.current_size < self.size: 122 | overflow = inc - (self.size - self.current_size) 123 | idx_a = np.arange(self.current_size, self.size) 124 | idx_b = np.random.randint(0, self.current_size, overflow) 125 | idx = np.concatenate([idx_a, idx_b]) 126 | else: 127 | idx = np.random.randint(0, self.size, inc) 128 | 129 | # update replay size 130 | self.current_size = min(self.size, self.current_size+inc) 131 | 132 | if inc == 1: 133 | idx = idx[0] 134 | return idx 135 | 136 | 137 | class ReplayBufferEnergy: 138 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions, prioritization, env_name): 139 | """Creates a replay buffer. 
140 | 141 | Args: 142 | buffer_shapes (dict of ints): the shape for all buffers that are used in the replay 143 | buffer 144 | size_in_transitions (int): the size of the buffer, measured in transitions 145 | T (int): the time horizon for episodes 146 | sample_transitions (function): a function that samples from the replay buffer 147 | """ 148 | self.buffer_shapes = buffer_shapes 149 | self.size = size_in_transitions // T 150 | self.T = T 151 | self.sample_transitions = sample_transitions 152 | 153 | self.buffers = {key: np.empty([self.size, *shape]) 154 | for key, shape in buffer_shapes.items()} 155 | self.buffers['e'] = np.empty([self.size, 1]) # energy 156 | self.buffers['p'] = np.empty([self.size, 1]) # priority/ranking 157 | 158 | self.prioritization = prioritization 159 | self.env_name = env_name 160 | 161 | # memory management 162 | self.current_size = 0 163 | self.n_transitions_stored = 0 164 | 165 | self.current_size_test = 0 166 | self.n_transitions_stored_test = 0 167 | 168 | self.lock = threading.Lock() 169 | 170 | @property 171 | def full(self): 172 | with self.lock: 173 | return self.current_size == self.size 174 | 175 | def sample(self, batch_size, rank_method, temperature): 176 | """Returns a dict {key: array(batch_size x shapes[key])} 177 | """ 178 | buffers = {} 179 | 180 | with self.lock: 181 | assert self.current_size > 0 182 | for key in self.buffers.keys(): 183 | buffers[key] = self.buffers[key][:self.current_size] 184 | 185 | buffers['o_2'] = buffers['o'][:, 1:, :] 186 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 187 | 188 | transitions = self.sample_transitions(buffers, batch_size, rank_method, temperature) 189 | 190 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 191 | if not key == 'p' and not key == 'e': 192 | assert key in transitions, "key %s missing from transitions" % key 193 | 194 | return transitions 195 | 196 | def store_episode(self, episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy): 197 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 198 | """ 199 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 200 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 201 | batch_size = batch_sizes[0] 202 | 203 | buffers = {} 204 | for key in episode_batch.keys(): 205 | buffers[key] = episode_batch[key] 206 | 207 | if self.prioritization == 'energy': 208 | if self.env_name in ['FetchPickAndPlace-v0', 'FetchSlide-v0', 'FetchPush-v0']: 209 | height = buffers['ag'][:, :, 2] 210 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 211 | height = height[:,1::] - height_0 212 | g, m, delta_t = 9.81, 1, 0.04 213 | potential_energy = g*m*height 214 | diff = np.diff(buffers['ag'], axis=1) 215 | velocity = diff / delta_t 216 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 217 | kinetic_energy = np.sum(kinetic_energy, axis=2) 218 | energy_totoal = w_potential*potential_energy + w_linear*kinetic_energy 219 | energy_diff = np.diff(energy_totoal, axis=1) 220 | energy_transition = energy_totoal.copy() 221 | energy_transition[:,1::] = energy_diff.copy() 222 | energy_transition = np.clip(energy_transition, 0, clip_energy) 223 | energy_transition_total = np.sum(energy_transition, axis=1) 224 | episode_batch['e'] = energy_transition_total.reshape(-1,1) 225 | elif self.env_name in ['HandManipulatePenRotate-v0', \ 226 | 'HandManipulateEggFull-v0', \ 227 | 'HandManipulateBlockFull-v0', \ 228 | 'HandManipulateBlockRotateXYZ-v0']: 229 | g, m, delta_t, inertia = 9.81, 1, 0.04, 1 
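# Hand environments: the achieved goal is 7-dimensional (3D position + 4D quaternion).
# The quaternion part is converted to Euler angles and finite-differenced to get an
# angular velocity (omega = dtheta / delta_t), giving a rotational term
# E_rot = 0.5 * inertia * omega**2 per axis. This is combined below with the potential
# (m*g*dh) and linear kinetic (0.5*m*v**2) terms via the w_* weights, and the
# per-transition energy increments are clipped to [0, clip_energy] before being summed
# into one trajectory energy. m, inertia and delta_t are nominal constants.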
230 | quaternion = buffers['ag'][:,:,3:].copy() 231 | angle = np.apply_along_axis(quaternion_to_euler_angle, 2, quaternion) 232 | diff_angle = np.diff(angle, axis=1) 233 | angular_velocity = diff_angle / delta_t 234 | rotational_energy = 0.5 * inertia * np.power(angular_velocity, 2) 235 | rotational_energy = np.sum(rotational_energy, axis=2) 236 | buffers['ag'] = buffers['ag'][:,:,:3] 237 | height = buffers['ag'][:, :, 2] 238 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 239 | height = height[:,1::] - height_0 240 | potential_energy = g*m*height 241 | diff = np.diff(buffers['ag'], axis=1) 242 | velocity = diff / delta_t 243 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 244 | kinetic_energy = np.sum(kinetic_energy, axis=2) 245 | energy_totoal = w_potential*potential_energy + w_linear*kinetic_energy + w_rotational*rotational_energy 246 | energy_diff = np.diff(energy_totoal, axis=1) 247 | energy_transition = energy_totoal.copy() 248 | energy_transition[:,1::] = energy_diff.copy() 249 | energy_transition = np.clip(energy_transition, 0, clip_energy) 250 | energy_transition_total = np.sum(energy_transition, axis=1) 251 | episode_batch['e'] = energy_transition_total.reshape(-1,1) 252 | else: 253 | print('Trajectory Energy Function Not Implemented') 254 | exit() 255 | 256 | with self.lock: 257 | idxs = self._get_storage_idx(batch_size) 258 | 259 | # load inputs into buffers 260 | for key in self.buffers.keys(): 261 | if not key == 'p': 262 | self.buffers[key][idxs] = episode_batch[key] 263 | 264 | self.n_transitions_stored += batch_size * self.T 265 | 266 | energy_transition_total = self.buffers['e'][:self.current_size] 267 | if rank_method == 'none': 268 | rank_method = 'dense' 269 | energy_rank = rankdata(energy_transition_total, method=rank_method) 270 | energy_rank = energy_rank - 1 271 | energy_rank = energy_rank.reshape(-1, 1) 272 | self.buffers['p'][:self.current_size] = energy_rank.copy() 273 | 274 | def get_current_episode_size(self): 275 | with self.lock: 276 | return self.current_size 277 | 278 | def get_current_size(self): 279 | with self.lock: 280 | return self.current_size * self.T 281 | 282 | def get_transitions_stored(self): 283 | with self.lock: 284 | return self.n_transitions_stored 285 | 286 | def clear_buffer(self): 287 | with self.lock: 288 | self.current_size = 0 289 | 290 | def _get_storage_idx(self, inc=None): 291 | inc = inc or 1 # size increment 292 | assert inc <= self.size, "Batch committed to replay is too large!" 293 | # go consecutively until you hit the end, and then go randomly. 294 | if self.current_size+inc <= self.size: 295 | idx = np.arange(self.current_size, self.current_size+inc) 296 | elif self.current_size < self.size: 297 | overflow = inc - (self.size - self.current_size) 298 | idx_a = np.arange(self.current_size, self.size) 299 | idx_b = np.random.randint(0, self.current_size, overflow) 300 | idx = np.concatenate([idx_a, idx_b]) 301 | else: 302 | idx = np.random.randint(0, self.size, inc) 303 | 304 | # update replay size 305 | self.current_size = min(self.size, self.current_size+inc) 306 | 307 | if inc == 1: 308 | idx = idx[0] 309 | return idx 310 | 311 | 312 | class PrioritizedReplayBuffer(ReplayBuffer): 313 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions, alpha, env_name): 314 | """Create Prioritized Replay buffer. 
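Args (in addition to ReplayBuffer):
    alpha (float): how much prioritization is used
        (0 - uniform sampling; larger values - stronger prioritization of high-TD-error transitions)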
315 | """ 316 | super(PrioritizedReplayBuffer, self).__init__(buffer_shapes, size_in_transitions, T, sample_transitions) 317 | assert alpha >= 0 318 | self._alpha = alpha 319 | 320 | it_capacity = 1 321 | self.size_in_transitions = size_in_transitions 322 | while it_capacity < size_in_transitions: 323 | it_capacity *= 2 324 | 325 | self._it_sum = SumSegmentTree(it_capacity) 326 | self._it_min = MinSegmentTree(it_capacity) 327 | self._max_priority = 1.0 328 | 329 | self.T = T 330 | self.buffers['td'] = np.zeros([self.size, self.T]) # accumulated td-error 331 | self.buffers['e'] = np.zeros([self.size, self.T]) # trajectory energy 332 | self.env_name = env_name 333 | 334 | def store_episode(self, episode_batch, dump_buffer): 335 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 336 | """ 337 | 338 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 339 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 340 | batch_size = batch_sizes[0] 341 | 342 | if dump_buffer: 343 | 344 | buffers = {} 345 | for key in episode_batch.keys(): 346 | buffers[key] = episode_batch[key] 347 | 348 | if self.env_name in ['FetchPickAndPlace-v0', 'FetchSlide-v0', 'FetchPush-v0']: 349 | height = buffers['ag'][:, :, 2] 350 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 351 | height = height[:,1::] - height_0 352 | g, m, delta_t = 9.81, 1, 0.04 353 | potential_energy = g*m*height 354 | diff = np.diff(buffers['ag'], axis=1) 355 | velocity = diff / delta_t 356 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 357 | kinetic_energy = np.sum(kinetic_energy, axis=2) 358 | energy_totoal = potential_energy + kinetic_energy 359 | energy_diff = np.diff(energy_totoal, axis=1) 360 | energy_transition = energy_totoal.copy() 361 | energy_transition[:,1::] = energy_diff.copy() 362 | episode_batch['e'] = energy_transition 363 | elif self.env_name in ['HandManipulatePenRotate-v0', \ 364 | 'HandManipulateEggFull-v0', \ 365 | 'HandManipulateBlockFull-v0', \ 366 | 'HandManipulateBlockRotateXYZ-v0']: 367 | g, m, delta_t, inertia = 9.81, 1, 0.04, 1 368 | quaternion = buffers['ag'][:,:,3:].copy() 369 | angle = np.apply_along_axis(quaternion_to_euler_angle, 2, quaternion) 370 | diff_angle = np.diff(angle, axis=1) 371 | angular_velocity = diff_angle / delta_t 372 | rotational_energy = 0.5 * inertia * np.power(angular_velocity, 2) 373 | rotational_energy = np.sum(rotational_energy, axis=2) 374 | buffers['ag'] = buffers['ag'][:,:,:3] 375 | height = buffers['ag'][:, :, 2] 376 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 377 | height = height[:,1::] - height_0 378 | potential_energy = g*m*height 379 | diff = np.diff(buffers['ag'], axis=1) 380 | velocity = diff / delta_t 381 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 382 | kinetic_energy = np.sum(kinetic_energy, axis=2) 383 | energy_totoal = potential_energy + kinetic_energy + rotational_energy 384 | energy_diff = np.diff(energy_totoal, axis=1) 385 | energy_transition = energy_totoal.copy() 386 | energy_transition[:,1::] = energy_diff.copy() 387 | episode_batch['e'] = energy_transition 388 | 389 | 390 | with self.lock: 391 | idxs = self._get_storage_idx(batch_size) 392 | 393 | # load inputs into buffers 394 | for key in self.buffers.keys(): 395 | if not key == 'td': 396 | if dump_buffer: 397 | self.buffers[key][idxs] = episode_batch[key] 398 | else: 399 | if not key == 'e': 400 | self.buffers[key][idxs] = episode_batch[key] 401 | 402 | self.n_transitions_stored += batch_size * 
self.T 403 | 404 | for idx in idxs: 405 | episode_idx = idx 406 | for t in range(episode_idx*self.T, (episode_idx+1)*self.T): 407 | assert (episode_idx+1)*self.T-1 < min(self.n_transitions_stored, self.size_in_transitions) 408 | self._it_sum[t] = self._max_priority ** self._alpha 409 | self._it_min[t] = self._max_priority ** self._alpha 410 | 411 | def dump_buffer(self, epoch): 412 | for i in range(self.current_size): 413 | entry = {"e": self.buffers['e'][i].tolist(), \ 414 | "td": self.buffers['td'][i].tolist(), \ 415 | "ag": self.buffers['ag'][i].tolist() } 416 | with open('buffer_epoch_{0}.txt'.format(epoch), 'a') as file: 417 | file.write(json.dumps(entry)) # use `json.loads` to do the reverse 418 | file.write("\n") 419 | 420 | print("dump buffer") 421 | 422 | 423 | def sample(self, batch_size, beta): 424 | """Returns a dict {key: array(batch_size x shapes[key])} 425 | """ 426 | 427 | """Sample a batch of experiences. 428 | 429 | compared to ReplayBuffer.sample 430 | it also returns importance weights and idxes 431 | of sampled experiences. 432 | 433 | Parameters 434 | ---------- 435 | batch_size: int 436 | How many transitions to sample. 437 | beta: float 438 | To what degree to use importance weights 439 | (0 - no corrections, 1 - full correction) 440 | """ 441 | buffers = {} 442 | 443 | with self.lock: 444 | assert self.current_size > 0 445 | for key in self.buffers.keys(): 446 | buffers[key] = self.buffers[key][:self.current_size] 447 | 448 | buffers['o_2'] = buffers['o'][:, 1:, :] 449 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 450 | 451 | transitions, weights, idxs = self.sample_transitions(self, buffers, batch_size, beta) 452 | 453 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 454 | if not key == 'td' and not key == 'e': 455 | assert key in transitions, "key %s missing from transitions" % key 456 | 457 | return (transitions, weights, idxs) 458 | 459 | 460 | def update_priorities(self, idxes, priorities): 461 | """Update priorities of sampled transitions. 462 | 463 | sets priority of transition at index idxes[i] in buffer 464 | to priorities[i]. 465 | 466 | Parameters 467 | ---------- 468 | idxes: [int] 469 | List of idxes of sampled transitions 470 | priorities: [float] 471 | List of updated priorities corresponding to 472 | transitions at the sampled idxes denoted by 473 | variable `idxes`. 474 | """ 475 | assert len(idxes) == len(priorities) 476 | for idx, priority in zip(idxes, priorities.flatten()): 477 | assert priority > 0 478 | assert 0 <= idx < self.n_transitions_stored 479 | self._it_sum[idx] = priority ** self._alpha 480 | self._it_min[idx] = priority ** self._alpha 481 | 482 | self._max_priority = max(self._max_priority, priority) 483 | -------------------------------------------------------------------------------- /baselines/her/rollout.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | import pickle 5 | from mujoco_py import MujocoException 6 | 7 | from baselines.her.util import convert_episode_to_batch_major, store_args 8 | 9 | class RolloutWorker: 10 | 11 | @store_args 12 | def __init__(self, make_env, policy, dims, logger, T, rollout_batch_size=1, 13 | exploit=False, use_target_net=False, compute_Q=False, noise_eps=0, 14 | random_eps=0, history_len=100, render=False, **kwargs): 15 | """Rollout worker generates experience by interacting with one or many environments. 
16 | 17 | Args: 18 | make_env (function): a factory function that creates a new instance of the environment 19 | when called 20 | policy (object): the policy that is used to act 21 | dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u) 22 | logger (object): the logger that is used by the rollout worker 23 | rollout_batch_size (int): the number of parallel rollouts that should be used 24 | exploit (boolean): whether or not to exploit, i.e. to act optimally according to the 25 | current policy without any exploration 26 | use_target_net (boolean): whether or not to use the target net for rollouts 27 | compute_Q (boolean): whether or not to compute the Q values alongside the actions 28 | noise_eps (float): scale of the additive Gaussian noise 29 | random_eps (float): probability of selecting a completely random action 30 | history_len (int): length of history for statistics smoothing 31 | render (boolean): whether or not to render the rollouts 32 | """ 33 | self.envs = [make_env() for _ in range(rollout_batch_size)] 34 | assert self.T > 0 35 | 36 | self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')] 37 | 38 | self.success_history = deque(maxlen=history_len) 39 | self.Q_history = deque(maxlen=history_len) 40 | 41 | self.n_episodes = 0 42 | self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals 43 | self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations 44 | self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals 45 | self.reset_all_rollouts() 46 | self.clear_history() 47 | 48 | def reset_rollout(self, i): 49 | """Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` 50 | and `g` arrays accordingly. 51 | """ 52 | obs = self.envs[i].reset() 53 | self.initial_o[i] = obs['observation'] 54 | self.initial_ag[i] = obs['achieved_goal'] 55 | self.g[i] = obs['desired_goal'] 56 | 57 | def reset_all_rollouts(self): 58 | """Resets all `rollout_batch_size` rollout workers. 59 | """ 60 | for i in range(self.rollout_batch_size): 61 | self.reset_rollout(i) 62 | 63 | def generate_rollouts(self): 64 | """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current 65 | policy acting on it accordingly. 66 | """ 67 | self.reset_all_rollouts() 68 | 69 | # compute observations 70 | o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations 71 | ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals 72 | o[:] = self.initial_o 73 | ag[:] = self.initial_ag 74 | 75 | # generate episodes 76 | obs, achieved_goals, acts, goals, successes = [], [], [], [], [] 77 | info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] 78 | Qs = [] 79 | for t in range(self.T): 80 | policy_output = self.policy.get_actions( 81 | o, ag, self.g, 82 | compute_Q=self.compute_Q, 83 | noise_eps=self.noise_eps if not self.exploit else 0., 84 | random_eps=self.random_eps if not self.exploit else 0., 85 | use_target_net=self.use_target_net) 86 | 87 | if self.compute_Q: 88 | u, Q = policy_output 89 | Qs.append(Q) 90 | else: 91 | u = policy_output 92 | 93 | if u.ndim == 1: 94 | # The non-batched case should still have a reasonable shape. 
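# (a single action vector is promoted to a batch of shape (1, dim_u) so the per-environment loop below can index it)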
95 | u = u.reshape(1, -1) 96 | 97 | o_new = np.empty((self.rollout_batch_size, self.dims['o'])) 98 | ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) 99 | success = np.zeros(self.rollout_batch_size) 100 | # compute new states and observations 101 | for i in range(self.rollout_batch_size): 102 | try: 103 | # We fully ignore the reward here because it will have to be re-computed 104 | # for HER. 105 | curr_o_new, _, _, info = self.envs[i].step(u[i]) 106 | if 'is_success' in info: 107 | success[i] = info['is_success'] 108 | o_new[i] = curr_o_new['observation'] 109 | ag_new[i] = curr_o_new['achieved_goal'] 110 | for idx, key in enumerate(self.info_keys): 111 | info_values[idx][t, i] = info[key] 112 | if self.render: 113 | self.envs[i].render() 114 | except MujocoException as e: 115 | return self.generate_rollouts() 116 | 117 | if np.isnan(o_new).any(): 118 | self.logger.warning('NaN caught during rollout generation. Trying again...') 119 | self.reset_all_rollouts() 120 | return self.generate_rollouts() 121 | 122 | obs.append(o.copy()) 123 | achieved_goals.append(ag.copy()) 124 | successes.append(success.copy()) 125 | acts.append(u.copy()) 126 | goals.append(self.g.copy()) 127 | o[...] = o_new 128 | ag[...] = ag_new 129 | obs.append(o.copy()) 130 | achieved_goals.append(ag.copy()) 131 | self.initial_o[:] = o 132 | 133 | successful = np.array(successes)[-1, :].copy() 134 | 135 | episode = dict(o=obs, 136 | u=acts, 137 | g=goals, 138 | ag=achieved_goals,) 139 | for key, value in zip(self.info_keys, info_values): 140 | episode['info_{}'.format(key)] = value 141 | 142 | # stats 143 | assert successful.shape == (self.rollout_batch_size,) 144 | success_rate = np.mean(successful) 145 | self.success_history.append(success_rate) 146 | if self.compute_Q: 147 | self.Q_history.append(np.mean(Qs)) 148 | self.n_episodes += self.rollout_batch_size 149 | 150 | return convert_episode_to_batch_major(episode) 151 | 152 | def clear_history(self): 153 | """Clears all histories that are used for statistics 154 | """ 155 | self.success_history.clear() 156 | self.Q_history.clear() 157 | 158 | def current_success_rate(self): 159 | return np.mean(self.success_history) 160 | 161 | def current_mean_Q(self): 162 | return np.mean(self.Q_history) 163 | 164 | def save_policy(self, path): 165 | """Pickles the current policy for later inspection. 166 | """ 167 | with open(path, 'wb') as f: 168 | pickle.dump(self.policy, f) 169 | 170 | def logs(self, prefix='worker'): 171 | """Generates a dictionary that contains all collected statistics. 172 | """ 173 | logs = [] 174 | logs += [('success_rate', np.mean(self.success_history))] 175 | if self.compute_Q: 176 | logs += [('mean_Q', np.mean(self.Q_history))] 177 | logs += [('episode', self.n_episodes)] 178 | 179 | if prefix is not '' and not prefix.endswith('/'): 180 | return [(prefix + '/' + key, val) for key, val in logs] 181 | else: 182 | return logs 183 | 184 | def seed(self, seed): 185 | """Seeds each environment with a distinct seed derived from the passed in global seed. 
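Environment i is seeded with seed + 1000 * i, so parallel rollouts do not share random number streams.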
186 | """ 187 | for idx, env in enumerate(self.envs): 188 | env.seed(seed + 1000 * idx) 189 | -------------------------------------------------------------------------------- /baselines/her/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import importlib 5 | import inspect 6 | import functools 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from baselines.common import tf_util as U 12 | import platform 13 | 14 | 15 | def store_args(method): 16 | """Stores provided method args as instance attributes. 17 | """ 18 | argspec = inspect.getfullargspec(method) 19 | defaults = {} 20 | if argspec.defaults is not None: 21 | defaults = dict( 22 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 23 | if argspec.kwonlydefaults is not None: 24 | defaults.update(argspec.kwonlydefaults) 25 | arg_names = argspec.args[1:] 26 | 27 | @functools.wraps(method) 28 | def wrapper(*positional_args, **keyword_args): 29 | self = positional_args[0] 30 | # Get default arg values 31 | args = defaults.copy() 32 | # Add provided arg values 33 | for name, value in zip(arg_names, positional_args[1:]): 34 | args[name] = value 35 | args.update(keyword_args) 36 | self.__dict__.update(args) 37 | return method(*positional_args, **keyword_args) 38 | 39 | return wrapper 40 | 41 | 42 | def import_function(spec): 43 | """Import a function identified by a string like "pkg.module:fn_name". 44 | """ 45 | mod_name, fn_name = spec.split(':') 46 | module = importlib.import_module(mod_name) 47 | fn = getattr(module, fn_name) 48 | return fn 49 | 50 | 51 | def flatten_grads(var_list, grads): 52 | """Flattens a variables and their gradients. 53 | """ 54 | return tf.concat([tf.reshape(grad, [U.numel(v)]) 55 | for (v, grad) in zip(var_list, grads)], 0) 56 | 57 | 58 | def nn(input, layers_sizes, reuse=None, flatten=False, name=""): 59 | """Creates a simple neural network 60 | """ 61 | for i, size in enumerate(layers_sizes): 62 | activation = tf.nn.relu if i < len(layers_sizes)-1 else None 63 | input = tf.layers.dense(inputs=input, 64 | units=size, 65 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 66 | reuse=reuse, 67 | name=name+'_'+str(i)) 68 | if activation: 69 | input = activation(input) 70 | if flatten: 71 | assert layers_sizes[-1] == 1 72 | input = tf.reshape(input, [-1]) 73 | return input 74 | 75 | 76 | def install_mpi_excepthook(): 77 | import sys 78 | from mpi4py import MPI 79 | old_hook = sys.excepthook 80 | 81 | def new_hook(a, b, c): 82 | old_hook(a, b, c) 83 | sys.stdout.flush() 84 | sys.stderr.flush() 85 | MPI.COMM_WORLD.Abort() 86 | sys.excepthook = new_hook 87 | 88 | 89 | def mpi_fork(n, binding="core"): 90 | """Re-launches the current script with workers 91 | Returns "parent" for original parent, "child" for MPI children 92 | """ 93 | if n <= 1: 94 | return "child" 95 | if os.getenv("IN_MPI") is None: 96 | env = os.environ.copy() 97 | env.update( 98 | MKL_NUM_THREADS="1", 99 | OMP_NUM_THREADS="1", 100 | IN_MPI="1" 101 | ) 102 | # "-bind-to core" is crucial for good performance 103 | if platform.system() == 'Darwin': 104 | args = [ 105 | "mpirun", 106 | "-np", 107 | str(n), 108 | "-allow-run-as-root", 109 | sys.executable 110 | ] 111 | else: 112 | args = [ 113 | "mpirun", 114 | "-np", 115 | str(n), 116 | "-bind-to", 117 | binding, # core or none 118 | "-allow-run-as-root", 119 | sys.executable 120 | ] 121 | args += sys.argv 122 | subprocess.check_call(args, env=env) 123 | return "parent" 124 
| else: 125 | install_mpi_excepthook() 126 | return "child" 127 | 128 | 129 | def convert_episode_to_batch_major(episode): 130 | """Converts an episode to have the batch dimension in the major (first) 131 | dimension. 132 | """ 133 | episode_batch = {} 134 | for key in episode.keys(): 135 | val = np.array(episode[key]).copy() 136 | # make inputs batch-major instead of time-major 137 | episode_batch[key] = val.swapaxes(0, 1) 138 | 139 | return episode_batch 140 | 141 | 142 | def transitions_in_episode_batch(episode_batch): 143 | """Number of transitions in a given episode batch. 144 | """ 145 | shape = episode_batch['u'].shape 146 | return shape[0] * shape[1] 147 | 148 | 149 | def reshape_for_broadcasting(source, target): 150 | """Reshapes a tensor (source) to have the correct shape and dtype of the target 151 | before broadcasting it with MPI. 152 | """ 153 | dim = len(target.get_shape()) 154 | shape = ([1] * (dim-1)) + [-1] 155 | return tf.reshape(tf.cast(source, target.dtype), shape) 156 | -------------------------------------------------------------------------------- /baselines/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | from collections import defaultdict 10 | 11 | LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv'] 12 | LOG_OUTPUT_FORMATS_MPI = ['log'] 13 | # Also valid: json, tensorboard 14 | 15 | DEBUG = 10 16 | INFO = 20 17 | WARN = 30 18 | ERROR = 40 19 | 20 | DISABLED = 50 21 | 22 | class KVWriter(object): 23 | def writekvs(self, kvs): 24 | raise NotImplementedError 25 | 26 | class SeqWriter(object): 27 | def writeseq(self, seq): 28 | raise NotImplementedError 29 | 30 | class HumanOutputFormat(KVWriter, SeqWriter): 31 | def __init__(self, filename_or_file): 32 | if isinstance(filename_or_file, str): 33 | self.file = open(filename_or_file, 'wt') 34 | self.own_file = True 35 | else: 36 | assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file 37 | self.file = filename_or_file 38 | self.own_file = False 39 | 40 | def writekvs(self, kvs): 41 | # Create strings for printing 42 | key2str = {} 43 | for (key, val) in sorted(kvs.items()): 44 | if isinstance(val, float): 45 | valstr = '%-8.3g' % (val,) 46 | else: 47 | valstr = str(val) 48 | key2str[self._truncate(key)] = self._truncate(valstr) 49 | 50 | # Find max widths 51 | if len(key2str) == 0: 52 | print('WARNING: tried to write empty key-value dict') 53 | return 54 | else: 55 | keywidth = max(map(len, key2str.keys())) 56 | valwidth = max(map(len, key2str.values())) 57 | 58 | # Write out the data 59 | dashes = '-' * (keywidth + valwidth + 7) 60 | lines = [dashes] 61 | for (key, val) in sorted(key2str.items()): 62 | lines.append('| %s%s | %s%s |' % ( 63 | key, 64 | ' ' * (keywidth - len(key)), 65 | val, 66 | ' ' * (valwidth - len(val)), 67 | )) 68 | lines.append(dashes) 69 | self.file.write('\n'.join(lines) + '\n') 70 | 71 | # Flush the output to the file 72 | self.file.flush() 73 | 74 | def _truncate(self, s): 75 | return s[:20] + '...' 
if len(s) > 23 else s 76 | 77 | def writeseq(self, seq): 78 | for arg in seq: 79 | self.file.write(arg) 80 | self.file.write('\n') 81 | self.file.flush() 82 | 83 | def close(self): 84 | if self.own_file: 85 | self.file.close() 86 | 87 | class JSONOutputFormat(KVWriter): 88 | def __init__(self, filename): 89 | self.file = open(filename, 'wt') 90 | 91 | def writekvs(self, kvs): 92 | for k, v in sorted(kvs.items()): 93 | if hasattr(v, 'dtype'): 94 | v = v.tolist() 95 | kvs[k] = float(v) 96 | self.file.write(json.dumps(kvs) + '\n') 97 | self.file.flush() 98 | 99 | def close(self): 100 | self.file.close() 101 | 102 | class CSVOutputFormat(KVWriter): 103 | def __init__(self, filename): 104 | self.file = open(filename, 'w+t') 105 | self.keys = [] 106 | self.sep = ',' 107 | 108 | def writekvs(self, kvs): 109 | # Add our current row to the history 110 | extra_keys = kvs.keys() - self.keys 111 | if extra_keys: 112 | self.keys.extend(extra_keys) 113 | self.file.seek(0) 114 | lines = self.file.readlines() 115 | self.file.seek(0) 116 | for (i, k) in enumerate(self.keys): 117 | if i > 0: 118 | self.file.write(',') 119 | self.file.write(k) 120 | self.file.write('\n') 121 | for line in lines[1:]: 122 | self.file.write(line[:-1]) 123 | self.file.write(self.sep * len(extra_keys)) 124 | self.file.write('\n') 125 | for (i, k) in enumerate(self.keys): 126 | if i > 0: 127 | self.file.write(',') 128 | v = kvs.get(k) 129 | if v is not None: 130 | self.file.write(str(v)) 131 | self.file.write('\n') 132 | self.file.flush() 133 | 134 | def close(self): 135 | self.file.close() 136 | 137 | 138 | class TensorBoardOutputFormat(KVWriter): 139 | """ 140 | Dumps key/value pairs into TensorBoard's numeric format. 141 | """ 142 | def __init__(self, dir): 143 | os.makedirs(dir, exist_ok=True) 144 | self.dir = dir 145 | self.step = 1 146 | prefix = 'events' 147 | path = osp.join(osp.abspath(dir), prefix) 148 | import tensorflow as tf 149 | from tensorflow.python import pywrap_tensorflow 150 | from tensorflow.core.util import event_pb2 151 | from tensorflow.python.util import compat 152 | self.tf = tf 153 | self.event_pb2 = event_pb2 154 | self.pywrap_tensorflow = pywrap_tensorflow 155 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 156 | 157 | def writekvs(self, kvs): 158 | def summary_val(k, v): 159 | kwargs = {'tag': k, 'simple_value': float(v)} 160 | return self.tf.Summary.Value(**kwargs) 161 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 162 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 163 | event.step = self.step # is there any reason why you'd want to specify the step? 
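# TensorBoard orders scalar points by their step value, so a monotonically increasing counter is written with each event.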
164 | self.writer.WriteEvent(event) 165 | self.writer.Flush() 166 | self.step += 1 167 | 168 | def close(self): 169 | if self.writer: 170 | self.writer.Close() 171 | self.writer = None 172 | 173 | def make_output_format(format, ev_dir, log_suffix=''): 174 | os.makedirs(ev_dir, exist_ok=True) 175 | if format == 'stdout': 176 | return HumanOutputFormat(sys.stdout) 177 | elif format == 'log': 178 | return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) 179 | elif format == 'json': 180 | return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) 181 | elif format == 'csv': 182 | return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) 183 | elif format == 'tensorboard': 184 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) 185 | else: 186 | raise ValueError('Unknown format specified: %s' % (format,)) 187 | 188 | # ================================================================ 189 | # API 190 | # ================================================================ 191 | 192 | def logkv(key, val): 193 | """ 194 | Log a value of some diagnostic 195 | Call this once for each diagnostic quantity, each iteration 196 | If called many times, last value will be used. 197 | """ 198 | Logger.CURRENT.logkv(key, val) 199 | 200 | def logkv_mean(key, val): 201 | """ 202 | The same as logkv(), but if called many times, values averaged. 203 | """ 204 | Logger.CURRENT.logkv_mean(key, val) 205 | 206 | def logkvs(d): 207 | """ 208 | Log a dictionary of key-value pairs 209 | """ 210 | for (k, v) in d.items(): 211 | logkv(k, v) 212 | 213 | def dumpkvs(): 214 | """ 215 | Write all of the diagnostics from the current iteration 216 | 217 | level: int. (see logger.py docs) If the global logger level is higher than 218 | the level argument here, don't print to stdout. 219 | """ 220 | Logger.CURRENT.dumpkvs() 221 | 222 | def getkvs(): 223 | return Logger.CURRENT.name2val 224 | 225 | 226 | def log(*args, level=INFO): 227 | """ 228 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 229 | """ 230 | Logger.CURRENT.log(*args, level=level) 231 | 232 | def debug(*args): 233 | log(*args, level=DEBUG) 234 | 235 | def info(*args): 236 | log(*args, level=INFO) 237 | 238 | def warn(*args): 239 | log(*args, level=WARN) 240 | 241 | def error(*args): 242 | log(*args, level=ERROR) 243 | 244 | 245 | def set_level(level): 246 | """ 247 | Set logging threshold on current logger. 248 | """ 249 | Logger.CURRENT.set_level(level) 250 | 251 | def get_dir(): 252 | """ 253 | Get directory that log files are being written to. 
254 | will be None if there is no output directory (i.e., if you didn't call start) 255 | """ 256 | return Logger.CURRENT.get_dir() 257 | 258 | record_tabular = logkv 259 | dump_tabular = dumpkvs 260 | 261 | class ProfileKV: 262 | """ 263 | Usage: 264 | with logger.ProfileKV("interesting_scope"): 265 | code 266 | """ 267 | def __init__(self, n): 268 | self.n = "wait_" + n 269 | def __enter__(self): 270 | self.t1 = time.time() 271 | def __exit__(self ,type, value, traceback): 272 | Logger.CURRENT.name2val[self.n] += time.time() - self.t1 273 | 274 | def profile(n): 275 | """ 276 | Usage: 277 | @profile("my_func") 278 | def my_func(): code 279 | """ 280 | def decorator_with_name(func): 281 | def func_wrapper(*args, **kwargs): 282 | with ProfileKV(n): 283 | return func(*args, **kwargs) 284 | return func_wrapper 285 | return decorator_with_name 286 | 287 | 288 | # ================================================================ 289 | # Backend 290 | # ================================================================ 291 | 292 | class Logger(object): 293 | DEFAULT = None # A logger with no output files. (See right below class definition) 294 | # So that you can still log to the terminal without setting up any output files 295 | CURRENT = None # Current logger being used by the free functions above 296 | 297 | def __init__(self, dir, output_formats): 298 | self.name2val = defaultdict(float) # values this iteration 299 | self.name2cnt = defaultdict(int) 300 | self.level = INFO 301 | self.dir = dir 302 | self.output_formats = output_formats 303 | 304 | # Logging API, forwarded 305 | # ---------------------------------------- 306 | def logkv(self, key, val): 307 | self.name2val[key] = val 308 | 309 | def logkv_mean(self, key, val): 310 | if val is None: 311 | self.name2val[key] = None 312 | return 313 | oldval, cnt = self.name2val[key], self.name2cnt[key] 314 | self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) 315 | self.name2cnt[key] = cnt + 1 316 | 317 | def dumpkvs(self): 318 | if self.level == DISABLED: return 319 | for fmt in self.output_formats: 320 | if isinstance(fmt, KVWriter): 321 | fmt.writekvs(self.name2val) 322 | self.name2val.clear() 323 | self.name2cnt.clear() 324 | 325 | def log(self, *args, level=INFO): 326 | if self.level <= level: 327 | self._do_log(args) 328 | 329 | # Configuration 330 | # ---------------------------------------- 331 | def set_level(self, level): 332 | self.level = level 333 | 334 | def get_dir(self): 335 | return self.dir 336 | 337 | def close(self): 338 | for fmt in self.output_formats: 339 | fmt.close() 340 | 341 | # Misc 342 | # ---------------------------------------- 343 | def _do_log(self, args): 344 | for fmt in self.output_formats: 345 | if isinstance(fmt, SeqWriter): 346 | fmt.writeseq(map(str, args)) 347 | 348 | Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) 349 | 350 | def configure(dir=None, format_strs=None): 351 | if dir is None: 352 | dir = os.getenv('OPENAI_LOGDIR') 353 | if dir is None: 354 | dir = osp.join(tempfile.gettempdir(), 355 | datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) 356 | assert isinstance(dir, str) 357 | os.makedirs(dir, exist_ok=True) 358 | 359 | log_suffix = '' 360 | from mpi4py import MPI 361 | rank = MPI.COMM_WORLD.Get_rank() 362 | if rank > 0: 363 | log_suffix = "-rank%03i" % rank 364 | 365 | if format_strs is None: 366 | strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI') 367 | format_strs = strs_mpi if rank>0 else 
strs 368 | if format_strs is not None: 369 | format_strs = format_strs.split(',') 370 | else: 371 | format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS 372 | 373 | output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] 374 | 375 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) 376 | log('Logging to %s'%dir) 377 | 378 | def reset(): 379 | if Logger.CURRENT is not Logger.DEFAULT: 380 | Logger.CURRENT.close() 381 | Logger.CURRENT = Logger.DEFAULT 382 | log('Reset logger') 383 | 384 | class scoped_configure(object): 385 | def __init__(self, dir=None, format_strs=None): 386 | self.dir = dir 387 | self.format_strs = format_strs 388 | self.prevlogger = None 389 | def __enter__(self): 390 | self.prevlogger = Logger.CURRENT 391 | configure(dir=self.dir, format_strs=self.format_strs) 392 | def __exit__(self, *args): 393 | Logger.CURRENT.close() 394 | Logger.CURRENT = self.prevlogger 395 | 396 | # ================================================================ 397 | 398 | def _demo(): 399 | info("hi") 400 | debug("shouldn't appear") 401 | set_level(DEBUG) 402 | debug("should appear") 403 | dir = "/tmp/testlogging" 404 | if os.path.exists(dir): 405 | shutil.rmtree(dir) 406 | configure(dir=dir) 407 | logkv("a", 3) 408 | logkv("b", 2.5) 409 | dumpkvs() 410 | logkv("b", -2.5) 411 | logkv("a", 5.5) 412 | dumpkvs() 413 | info("^^^ should see a = 5.5") 414 | logkv_mean("b", -22.5) 415 | logkv_mean("b", -44.4) 416 | logkv("a", 5.5) 417 | dumpkvs() 418 | info("^^^ should see b = 33.3") 419 | 420 | logkv("b", -2.5) 421 | dumpkvs() 422 | 423 | logkv("a", "longasslongasslongasslongasslongasslongassvalue") 424 | dumpkvs() 425 | 426 | 427 | # ================================================================ 428 | # Readers 429 | # ================================================================ 430 | 431 | def read_json(fname): 432 | import pandas 433 | ds = [] 434 | with open(fname, 'rt') as fh: 435 | for line in fh: 436 | ds.append(json.loads(line)) 437 | return pandas.DataFrame(ds) 438 | 439 | def read_csv(fname): 440 | import pandas 441 | return pandas.read_csv(fname, index_col=None, comment='#') 442 | 443 | def read_tb(path): 444 | """ 445 | path : a tensorboard file OR a directory, where we will find all TB files 446 | of the form events.* 447 | """ 448 | import pandas 449 | import numpy as np 450 | from glob import glob 451 | from collections import defaultdict 452 | import tensorflow as tf 453 | if osp.isdir(path): 454 | fnames = glob(osp.join(path, "events.*")) 455 | elif osp.basename(path).startswith("events."): 456 | fnames = [path] 457 | else: 458 | raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s"%path) 459 | tag2pairs = defaultdict(list) 460 | maxstep = 0 461 | for fname in fnames: 462 | for summary in tf.train.summary_iterator(fname): 463 | if summary.step > 0: 464 | for v in summary.summary.value: 465 | pair = (summary.step, v.simple_value) 466 | tag2pairs[v.tag].append(pair) 467 | maxstep = max(summary.step, maxstep) 468 | data = np.empty((maxstep, len(tag2pairs))) 469 | data[:] = np.nan 470 | tags = sorted(tag2pairs.keys()) 471 | for (colidx,tag) in enumerate(tags): 472 | pairs = tag2pairs[tag] 473 | for (step, value) in pairs: 474 | data[step-1, colidx] = value 475 | return pandas.DataFrame(data, columns=tags) 476 | 477 | if __name__ == "__main__": 478 | _demo() 479 | -------------------------------------------------------------------------------- /baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.bench.monitor import load_results 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | def rolling_window(a, window): 20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 21 | strides = a.strides + (a.strides[-1],) 22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 23 | 24 | def window_func(x, y, window, func): 25 | yw = rolling_window(y, window) 26 | yw_func = func(yw, axis=-1) 27 | return x[window-1:], yw_func 28 | 29 | def ts2xy(ts, xaxis): 30 | if xaxis == X_TIMESTEPS: 31 | x = np.cumsum(ts.l.values) 32 | y = ts.r.values 33 | elif xaxis == X_EPISODES: 34 | x = np.arange(len(ts)) 35 | y = ts.r.values 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 
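# the bench monitor records elapsed wall-clock time in seconds; dividing by 3600 plots the curves against hours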
38 | y = ts.r.values
39 | else:
40 | raise NotImplementedError
41 | return x, y
42 | 
43 | def plot_curves(xy_list, xaxis, title):
44 | plt.figure(figsize=(8,2))
45 | maxx = max(xy[0][-1] for xy in xy_list)
46 | minx = 0
47 | for (i, (x, y)) in enumerate(xy_list):
48 | color = COLORS[i]
49 | plt.scatter(x, y, s=2)
50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # returns the average of the last EPISODES_WINDOW episodes
51 | plt.plot(x, y_mean, color=color)
52 | plt.xlim(minx, maxx)
53 | plt.title(title)
54 | plt.xlabel(xaxis)
55 | plt.ylabel("Episode Rewards")
56 | plt.tight_layout()
57 | 
58 | def plot_results(dirs, num_timesteps, xaxis, task_name):
59 | tslist = []
60 | for dir in dirs:
61 | ts = load_results(dir)
62 | ts = ts[ts.l.cumsum() <= num_timesteps]
63 | tslist.append(ts)
64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist]
65 | plot_curves(xy_list, xaxis, task_name)
66 | 
67 | # Example usage in jupyter-notebook
68 | # from baselines import log_viewer
69 | # %matplotlib inline
70 | # log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout")
71 | # Here ./log is a directory containing the monitor.csv files
72 | 
73 | def main():
74 | import argparse
75 | import os
76 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
77 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
78 | parser.add_argument('--num_timesteps', type=int, default=int(10e6))
79 | parser.add_argument('--xaxis', help = 'Variable on X-axis', default = X_TIMESTEPS)
80 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
81 | args = parser.parse_args()
82 | args.dirs = [os.path.abspath(dir) for dir in args.dirs]
83 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name)
84 | plt.show()
85 | 
86 | if __name__ == '__main__':
87 | main()
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import sys
3 | 
4 | if sys.version_info.major != 3:
5 | print('This Python is only compatible with Python 3, but you are running '
6 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major))
7 | 
8 | 
9 | setup(name='baselines',
10 | packages=[package for package in find_packages()
11 | if package.startswith('baselines')],
12 | install_requires=[
13 | 'gym[mujoco,atari,classic_control,robotics]',
14 | 'scipy',
15 | 'tqdm',
16 | 'joblib',
17 | 'zmq',
18 | 'dill',
19 | 'progressbar2',
20 | 'mpi4py',
21 | 'cloudpickle',
22 | 'tensorflow>=1.4.0',
23 | 'click',
24 | 'opencv-python'
25 | ],
26 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
27 | author='OpenAI',
28 | url='https://github.com/openai/baselines',
29 | author_email='gym@openai.com',
30 | version='0.1.5')
31 | 
--------------------------------------------------------------------------------