├── .gitignore ├── LICENSE ├── README.md ├── baselines ├── __init__.py ├── common │ ├── __init__.py │ ├── atari_wrappers.py │ ├── cg.py │ ├── cmd_util.py │ ├── console_util.py │ ├── dataset.py │ ├── distributions.py │ ├── filters.py │ ├── math_util.py │ ├── misc_util.py │ ├── mpi_adam.py │ ├── mpi_fork.py │ ├── mpi_moments.py │ ├── mpi_running_mean_std.py │ ├── runners.py │ ├── running_mean_std.py │ ├── running_stat.py │ ├── schedules.py │ ├── segment_tree.py │ ├── tests │ │ ├── test_schedules.py │ │ ├── test_segment_tree.py │ │ └── test_tf_util.py │ ├── tf_util.py │ └── vec_env │ │ ├── __init__.py │ │ ├── dummy_vec_env.py │ │ ├── subproc_vec_env.py │ │ ├── vec_frame_stack.py │ │ └── vec_normalize.py ├── her │ ├── __init__.py │ ├── actor_critic.py │ ├── ddpg.py │ ├── experiment │ │ ├── __init__.py │ │ ├── config.py │ │ ├── play.py │ │ └── train.py │ ├── her.py │ ├── normalizer.py │ ├── replay_buffer.py │ ├── rollout.py │ └── util.py ├── logger.py └── results_plotter.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | *.txt 9 | 10 | # Setuptools distribution and build folders. 11 | /dist/ 12 | /build 13 | keys/ 14 | 15 | # Virtualenv 16 | /env 17 | 18 | 19 | *.sublime-project 20 | *.sublime-workspace 21 | 22 | *log/ 23 | *logs/ 24 | 25 | .idea 26 | 27 | logs/ 28 | logs_backup/ 29 | 30 | .ipynb_checkpoints 31 | ghostdriver.log 32 | 33 | htmlcov 34 | 35 | junk 36 | src 37 | 38 | *.egg-info 39 | .cache 40 | 41 | MUJOCO_LOG.TXT 42 | 43 | openai.sublime-project 44 | openai.sublime-workspace 45 | 46 | train/ 47 | td-error.txt 48 | 49 | model/ 50 | Baseline/__pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | __author__ = "Rui Zhao" 2 | __copyright__ = "Siemens AG, 2018" 3 | __licencse__ = "MIT" 4 | __version__ = "0.1" 5 | 6 | MIT License 7 | Copyright (c) 2018 Siemens AG 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Energy-Based Hindsight Experience Prioritization 2 | 3 | Here is the code for our paper "Energy-Based Hindsight Experience Prioritization". 
4 | 5 | The paper was published at the 2018 Conference on Robot Learning (CoRL 2018) as an oral presentation (7%). 6 | 7 | The paper is available in the Proceedings of Machine Learning Research: http://proceedings.mlr.press/v87/zhao18a.html 8 | 9 | The code was developed by Rui Zhao (Siemens AG & Ludwig Maximilian University of Munich). 10 | 11 | For details on Energy-Based Hindsight Experience Prioritization (EBP), please read the published paper. 12 | 13 | The code is based on OpenAI Baselines (link: https://github.com/openai/baselines). 14 | 15 | ## Prerequisites 16 | 17 | The code requires python3 (>=3.5) with the development headers. You'll also need the system packages CMake, OpenMPI and zlib. These can be installed as follows: 18 | 19 | ### Usage 20 | 21 | ```bash 22 | sudo apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev 23 | ``` 24 | 25 | To run the code, you need to install OpenAI Gym (link: https://github.com/openai/gym). 26 | We use the robotics environments in OpenAI Gym, which require the MuJoCo physics engine (link: http://www.mujoco.org/). 27 | 28 | The experiments were carried out on a 20-CPU server. 29 | We use 19 CPUs for training. 30 | If you are running the experiments on a laptop, please configure a smaller number of CPUs. 31 | Note that with fewer CPUs, performance will be affected. 32 | 33 | After installing the dependencies, you can reproduce the experimental results by running the following commands: 34 | ``` 35 | python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization none --n_epochs 50 --num_cpu 19 36 | python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization tderror --n_epochs 50 --num_cpu 19 37 | python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization energy --clip_energy 0.5 --n_epochs 50 --num_cpu 19 38 | ``` 39 | For FetchPickAndPlace-v0, we use a clip_energy parameter of 0.5. 40 | For the other three hand environments, we use clip_energy 2.5.
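If you run on a machine with fewer cores, the same commands apply with a smaller --num_cpu value; a minimal sketch (the CPU count here is only an example, adjust it to your hardware):
```
python baselines/her/experiment/train.py --env_name FetchPickAndPlace-v0 --prioritization energy --clip_energy 0.5 --n_epochs 50 --num_cpu 2
```
The hand-environment experiments below follow the same pattern.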
41 | 42 | ``` 43 | python baselines/her/experiment/train.py --env_name HandManipulateEggFull-v0 --prioritization none --n_epochs 200 --num_cpu 19 44 | python baselines/her/experiment/train.py --env_name HandManipulateEggFull-v0 --prioritization tderror --n_epochs 200 --num_cpu 19 45 | python baselines/her/experiment/train.py --env_name HandManipulateEggFull-v0 --prioritization energy --clip_energy 2.5 --n_epochs 200 --num_cpu 19 46 | ``` 47 | 48 | To test the learned policies, you can run the command: 49 | ``` 50 | python baselines/her/experiment/play.py /path/to/an/experiment/policy_latest.pkl 51 | ``` 52 | 53 | ## Citation: 54 | 55 | Citation of the arXiv version: 56 | 57 | ``` 58 | @article{zhao2018energy, 59 | title={Energy-Based Hindsight Experience Prioritization}, 60 | author={Zhao, Rui and Tresp, Volker}, 61 | journal={arXiv preprint arXiv:1810.01363}, 62 | year={2018} 63 | } 64 | ``` 65 | 66 | ## Licence: 67 | 68 | MIT 69 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruizhaogit/EnergyBasedPrioritization/2fd2f5bab0547848f4f76b837d16238435518dcc/baselines/__init__.py -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /baselines/common/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | from gym import spaces 5 | import cv2 6 | cv2.ocl.setUseOpenCL(False) 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 
12 | """ 13 | gym.Wrapper.__init__(self, env) 14 | self.noop_max = noop_max 15 | self.override_num_noops = None 16 | self.noop_action = 0 17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 18 | 19 | def reset(self, **kwargs): 20 | """ Do no-op action for a number of steps in [1, noop_max].""" 21 | self.env.reset(**kwargs) 22 | if self.override_num_noops is not None: 23 | noops = self.override_num_noops 24 | else: 25 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 26 | assert noops > 0 27 | obs = None 28 | for _ in range(noops): 29 | obs, _, done, _ = self.env.step(self.noop_action) 30 | if done: 31 | obs = self.env.reset(**kwargs) 32 | return obs 33 | 34 | def step(self, ac): 35 | return self.env.step(ac) 36 | 37 | class FireResetEnv(gym.Wrapper): 38 | def __init__(self, env): 39 | """Take action on reset for environments that are fixed until firing.""" 40 | gym.Wrapper.__init__(self, env) 41 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 42 | assert len(env.unwrapped.get_action_meanings()) >= 3 43 | 44 | def reset(self, **kwargs): 45 | self.env.reset(**kwargs) 46 | obs, _, done, _ = self.env.step(1) 47 | if done: 48 | self.env.reset(**kwargs) 49 | obs, _, done, _ = self.env.step(2) 50 | if done: 51 | self.env.reset(**kwargs) 52 | return obs 53 | 54 | def step(self, ac): 55 | return self.env.step(ac) 56 | 57 | class EpisodicLifeEnv(gym.Wrapper): 58 | def __init__(self, env): 59 | """Make end-of-life == end-of-episode, but only reset on true game over. 60 | Done by DeepMind for the DQN and co. since it helps value estimation. 61 | """ 62 | gym.Wrapper.__init__(self, env) 63 | self.lives = 0 64 | self.was_real_done = True 65 | 66 | def step(self, action): 67 | obs, reward, done, info = self.env.step(action) 68 | self.was_real_done = done 69 | # check current lives, make loss of life terminal, 70 | # then update lives to handle bonus lives 71 | lives = self.env.unwrapped.ale.lives() 72 | if lives < self.lives and lives > 0: 73 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 74 | # so its important to keep lives > 0, so that we only reset once 75 | # the environment advertises done. 76 | done = True 77 | self.lives = lives 78 | return obs, reward, done, info 79 | 80 | def reset(self, **kwargs): 81 | """Reset only when lives are exhausted. 82 | This way all states are still reachable even though lives are episodic, 83 | and the learner need not know about any of this behind-the-scenes. 
84 | """ 85 | if self.was_real_done: 86 | obs = self.env.reset(**kwargs) 87 | else: 88 | # no-op step to advance from terminal/lost life state 89 | obs, _, _, _ = self.env.step(0) 90 | self.lives = self.env.unwrapped.ale.lives() 91 | return obs 92 | 93 | class MaxAndSkipEnv(gym.Wrapper): 94 | def __init__(self, env, skip=4): 95 | """Return only every `skip`-th frame""" 96 | gym.Wrapper.__init__(self, env) 97 | # most recent raw observations (for max pooling across time steps) 98 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 99 | self._skip = skip 100 | 101 | def step(self, action): 102 | """Repeat action, sum reward, and max over last observations.""" 103 | total_reward = 0.0 104 | done = None 105 | for i in range(self._skip): 106 | obs, reward, done, info = self.env.step(action) 107 | if i == self._skip - 2: self._obs_buffer[0] = obs 108 | if i == self._skip - 1: self._obs_buffer[1] = obs 109 | total_reward += reward 110 | if done: 111 | break 112 | # Note that the observation on the done=True frame 113 | # doesn't matter 114 | max_frame = self._obs_buffer.max(axis=0) 115 | 116 | return max_frame, total_reward, done, info 117 | 118 | def reset(self, **kwargs): 119 | return self.env.reset(**kwargs) 120 | 121 | class ClipRewardEnv(gym.RewardWrapper): 122 | def __init__(self, env): 123 | gym.RewardWrapper.__init__(self, env) 124 | 125 | def reward(self, reward): 126 | """Bin reward to {+1, 0, -1} by its sign.""" 127 | return np.sign(reward) 128 | 129 | class WarpFrame(gym.ObservationWrapper): 130 | def __init__(self, env): 131 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 132 | gym.ObservationWrapper.__init__(self, env) 133 | self.width = 84 134 | self.height = 84 135 | self.observation_space = spaces.Box(low=0, high=255, 136 | shape=(self.height, self.width, 1), dtype=np.uint8) 137 | 138 | def observation(self, frame): 139 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 140 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 141 | return frame[:, :, None] 142 | 143 | class FrameStack(gym.Wrapper): 144 | def __init__(self, env, k): 145 | """Stack k last frames. 146 | 147 | Returns lazy array, which is much more memory efficient. 148 | 149 | See Also 150 | -------- 151 | baselines.common.atari_wrappers.LazyFrames 152 | """ 153 | gym.Wrapper.__init__(self, env) 154 | self.k = k 155 | self.frames = deque([], maxlen=k) 156 | shp = env.observation_space.shape 157 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 158 | 159 | def reset(self): 160 | ob = self.env.reset() 161 | for _ in range(self.k): 162 | self.frames.append(ob) 163 | return self._get_ob() 164 | 165 | def step(self, action): 166 | ob, reward, done, info = self.env.step(action) 167 | self.frames.append(ob) 168 | return self._get_ob(), reward, done, info 169 | 170 | def _get_ob(self): 171 | assert len(self.frames) == self.k 172 | return LazyFrames(list(self.frames)) 173 | 174 | class ScaledFloatFrame(gym.ObservationWrapper): 175 | def __init__(self, env): 176 | gym.ObservationWrapper.__init__(self, env) 177 | 178 | def observation(self, observation): 179 | # careful! This undoes the memory optimization, use 180 | # with smaller replay buffers only. 181 | return np.array(observation).astype(np.float32) / 255.0 182 | 183 | class LazyFrames(object): 184 | def __init__(self, frames): 185 | """This object ensures that common frames between the observations are only stored once. 
186 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 187 | buffers. 188 | 189 | This object should only be converted to numpy array before being passed to the model. 190 | 191 | You'd not believe how complex the previous solution was.""" 192 | self._frames = frames 193 | self._out = None 194 | 195 | def _force(self): 196 | if self._out is None: 197 | self._out = np.concatenate(self._frames, axis=2) 198 | self._frames = None 199 | return self._out 200 | 201 | def __array__(self, dtype=None): 202 | out = self._force() 203 | if dtype is not None: 204 | out = out.astype(dtype) 205 | return out 206 | 207 | def __len__(self): 208 | return len(self._force()) 209 | 210 | def __getitem__(self, i): 211 | return self._force()[i] 212 | 213 | def make_atari(env_id): 214 | env = gym.make(env_id) 215 | assert 'NoFrameskip' in env.spec.id 216 | env = NoopResetEnv(env, noop_max=30) 217 | env = MaxAndSkipEnv(env, skip=4) 218 | return env 219 | 220 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 221 | """Configure environment for DeepMind-style Atari. 222 | """ 223 | if episode_life: 224 | env = EpisodicLifeEnv(env) 225 | if 'FIRE' in env.unwrapped.get_action_meanings(): 226 | env = FireResetEnv(env) 227 | env = WarpFrame(env) 228 | if scale: 229 | env = ScaledFloatFrame(env) 230 | if clip_rewards: 231 | env = ClipRewardEnv(env) 232 | if frame_stack: 233 | env = FrameStack(env, 4) 234 | return env 235 | 236 | -------------------------------------------------------------------------------- /baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /baselines/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for scripts like run_atari.py. 3 | """ 4 | 5 | import os 6 | import gym 7 | from gym.wrappers import FlattenDictWrapper 8 | from baselines import logger 9 | from baselines.bench import Monitor 10 | from baselines.common import set_global_seeds 11 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 12 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | 14 | def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): 15 | """ 16 | Create a wrapped, monitored SubprocVecEnv for Atari. 
17 | """ 18 | if wrapper_kwargs is None: wrapper_kwargs = {} 19 | def make_env(rank): # pylint: disable=C0111 20 | def _thunk(): 21 | env = make_atari(env_id) 22 | env.seed(seed + rank) 23 | env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 24 | return wrap_deepmind(env, **wrapper_kwargs) 25 | return _thunk 26 | set_global_seeds(seed) 27 | return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) 28 | 29 | def make_mujoco_env(env_id, seed): 30 | """ 31 | Create a wrapped, monitored gym.Env for MuJoCo. 32 | """ 33 | set_global_seeds(seed) 34 | env = gym.make(env_id) 35 | env = Monitor(env, logger.get_dir()) 36 | env.seed(seed) 37 | return env 38 | 39 | def make_robotics_env(env_id, seed, rank=0): 40 | """ 41 | Create a wrapped, monitored gym.Env for MuJoCo. 42 | """ 43 | set_global_seeds(seed) 44 | env = gym.make(env_id) 45 | env = FlattenDictWrapper(env, ['observation', 'desired_goal']) 46 | env = Monitor( 47 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 48 | info_keywords=('is_success',)) 49 | env.seed(seed) 50 | return env 51 | 52 | def arg_parser(): 53 | """ 54 | Create an empty argparse.ArgumentParser. 55 | """ 56 | import argparse 57 | return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 58 | 59 | def atari_arg_parser(): 60 | """ 61 | Create an argparse.ArgumentParser for run_atari.py. 62 | """ 63 | parser = arg_parser() 64 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 65 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 66 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 67 | return parser 68 | 69 | def mujoco_arg_parser(): 70 | """ 71 | Create an argparse.ArgumentParser for run_mujoco.py. 72 | """ 73 | parser = arg_parser() 74 | parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') 75 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 76 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 77 | return parser 78 | 79 | def robotics_arg_parser(): 80 | """ 81 | Create an argparse.ArgumentParser for run_mujoco.py. 
82 | """ 83 | parser = arg_parser() 84 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') 85 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 86 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 87 | return parser 88 | -------------------------------------------------------------------------------- /baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, (float, np.float32, np.float64)): 20 | v = abs(x) 21 | if (v < 1e-4 or v > 1e+4) and v > 0: 22 | rep = "%7.2e" % x 23 | else: 24 | rep = "%7.5f" % x 25 | else: rep = str(x) 26 | return " "*(l - len(rep)) + rep 27 | 28 | color2num = dict( 29 | gray=30, 30 | red=31, 31 | green=32, 32 | yellow=33, 33 | blue=34, 34 | magenta=35, 35 | cyan=36, 36 | white=37, 37 | crimson=38 38 | ) 39 | 40 | def colorize(string, color, bold=False, highlight=False): 41 | attr = [] 42 | num = color2num[color] 43 | if highlight: num += 10 44 | attr.append(str(num)) 45 | if bold: attr.append('1') 46 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 47 | 48 | 49 | MESSAGE_DEPTH = 0 50 | 51 | @contextmanager 52 | def timed(msg): 53 | global MESSAGE_DEPTH #pylint: disable=W0603 54 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 55 | tstart = time.time() 56 | MESSAGE_DEPTH += 1 57 | yield 58 | MESSAGE_DEPTH -= 1 59 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 60 | -------------------------------------------------------------------------------- /baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | 
for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /baselines/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import baselines.common.tf_util as U 4 | from baselines.a2c.utils import fc 5 | from tensorflow.python.ops import math_ops 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def neglogp(self, x): 16 | # Usually it's easier to define the negative logprob 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | def logp(self, x): 25 | return - self.neglogp(x) 26 | 27 | class PdType(object): 28 | """ 29 | Parametrized family of probability distributions 30 | """ 31 | def pdclass(self): 32 | raise NotImplementedError 33 | def pdfromflat(self, flat): 34 | return self.pdclass()(flat) 35 | def pdfromlatent(self, latent_vector): 36 | raise NotImplementedError 37 | def param_shape(self): 38 | raise NotImplementedError 39 | def sample_shape(self): 40 | raise NotImplementedError 41 | def sample_dtype(self): 42 | raise NotImplementedError 43 | 44 | def param_placeholder(self, prepend_shape, name=None): 45 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 46 | def sample_placeholder(self, prepend_shape, name=None): 47 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 48 | 49 | class CategoricalPdType(PdType): 50 | def __init__(self, ncat): 51 | self.ncat = ncat 52 | def pdclass(self): 53 | return CategoricalPd 54 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 55 | pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) 56 | return self.pdfromflat(pdparam), pdparam 57 | 58 | def param_shape(self): 59 | return [self.ncat] 60 | def sample_shape(self): 61 | return [] 62 | def sample_dtype(self): 63 | return tf.int32 64 | 65 | 66 | class MultiCategoricalPdType(PdType): 67 | def __init__(self, nvec): 68 | self.ncats = nvec 69 | def pdclass(self): 70 | return MultiCategoricalPd 71 | def pdfromflat(self, flat): 72 | return MultiCategoricalPd(self.ncats, flat) 73 | def param_shape(self): 74 | return [sum(self.ncats)] 75 | def sample_shape(self): 76 | return [len(self.ncats)] 77 | def sample_dtype(self): 78 | return tf.int32 79 | 80 | class DiagGaussianPdType(PdType): 81 | def __init__(self, size): 82 | self.size = size 83 
| def pdclass(self): 84 | return DiagGaussianPd 85 | 86 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 87 | mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 88 | logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 89 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 90 | return self.pdfromflat(pdparam), mean 91 | 92 | def param_shape(self): 93 | return [2*self.size] 94 | def sample_shape(self): 95 | return [self.size] 96 | def sample_dtype(self): 97 | return tf.float32 98 | 99 | class BernoulliPdType(PdType): 100 | def __init__(self, size): 101 | self.size = size 102 | def pdclass(self): 103 | return BernoulliPd 104 | def param_shape(self): 105 | return [self.size] 106 | def sample_shape(self): 107 | return [self.size] 108 | def sample_dtype(self): 109 | return tf.int32 110 | 111 | # WRONG SECOND DERIVATIVES 112 | # class CategoricalPd(Pd): 113 | # def __init__(self, logits): 114 | # self.logits = logits 115 | # self.ps = tf.nn.softmax(logits) 116 | # @classmethod 117 | # def fromflat(cls, flat): 118 | # return cls(flat) 119 | # def flatparam(self): 120 | # return self.logits 121 | # def mode(self): 122 | # return U.argmax(self.logits, axis=-1) 123 | # def logp(self, x): 124 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 125 | # def kl(self, other): 126 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 127 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 128 | # def entropy(self): 129 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 130 | # def sample(self): 131 | # u = tf.random_uniform(tf.shape(self.logits)) 132 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 133 | 134 | class CategoricalPd(Pd): 135 | def __init__(self, logits): 136 | self.logits = logits 137 | def flatparam(self): 138 | return self.logits 139 | def mode(self): 140 | return tf.argmax(self.logits, axis=-1) 141 | def neglogp(self, x): 142 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 143 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 144 | # the implementation does not allow second-order derivatives... 
145 | one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 146 | return tf.nn.softmax_cross_entropy_with_logits( 147 | logits=self.logits, 148 | labels=one_hot_actions) 149 | def kl(self, other): 150 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) 151 | a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) 152 | ea0 = tf.exp(a0) 153 | ea1 = tf.exp(a1) 154 | z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) 155 | z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) 156 | p0 = ea0 / z0 157 | return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 158 | def entropy(self): 159 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) 160 | ea0 = tf.exp(a0) 161 | z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) 162 | p0 = ea0 / z0 163 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) 164 | def sample(self): 165 | u = tf.random_uniform(tf.shape(self.logits)) 166 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 167 | @classmethod 168 | def fromflat(cls, flat): 169 | return cls(flat) 170 | 171 | class MultiCategoricalPd(Pd): 172 | def __init__(self, nvec, flat): 173 | self.flat = flat 174 | self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1))) 175 | def flatparam(self): 176 | return self.flat 177 | def mode(self): 178 | return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 179 | def neglogp(self, x): 180 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) 181 | def kl(self, other): 182 | return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) 183 | def entropy(self): 184 | return tf.add_n([p.entropy() for p in self.categoricals]) 185 | def sample(self): 186 | return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 187 | @classmethod 188 | def fromflat(cls, flat): 189 | raise NotImplementedError 190 | 191 | class DiagGaussianPd(Pd): 192 | def __init__(self, flat): 193 | self.flat = flat 194 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 195 | self.mean = mean 196 | self.logstd = logstd 197 | self.std = tf.exp(logstd) 198 | def flatparam(self): 199 | return self.flat 200 | def mode(self): 201 | return self.mean 202 | def neglogp(self, x): 203 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 204 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 205 | + tf.reduce_sum(self.logstd, axis=-1) 206 | def kl(self, other): 207 | assert isinstance(other, DiagGaussianPd) 208 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 209 | def entropy(self): 210 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 211 | def sample(self): 212 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 213 | @classmethod 214 | def fromflat(cls, flat): 215 | return cls(flat) 216 | 217 | class BernoulliPd(Pd): 218 | def __init__(self, logits): 219 | self.logits = logits 220 | self.ps = tf.sigmoid(logits) 221 | def flatparam(self): 222 | return self.logits 223 | def mode(self): 224 | return tf.round(self.ps) 225 | def neglogp(self, x): 226 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 227 | def kl(self, other): 228 | return 
tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 229 | def entropy(self): 230 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 231 | def sample(self): 232 | u = tf.random_uniform(tf.shape(self.ps)) 233 | return tf.to_float(math_ops.less(u, self.ps)) 234 | @classmethod 235 | def fromflat(cls, flat): 236 | return cls(flat) 237 | 238 | def make_pdtype(ac_space): 239 | from gym import spaces 240 | if isinstance(ac_space, spaces.Box): 241 | assert len(ac_space.shape) == 1 242 | return DiagGaussianPdType(ac_space.shape[0]) 243 | elif isinstance(ac_space, spaces.Discrete): 244 | return CategoricalPdType(ac_space.n) 245 | elif isinstance(ac_space, spaces.MultiDiscrete): 246 | return MultiCategoricalPdType(ac_space.nvec) 247 | elif isinstance(ac_space, spaces.MultiBinary): 248 | return BernoulliPdType(ac_space.n) 249 | else: 250 | raise NotImplementedError 251 | 252 | def shape_el(v, i): 253 | maybe = v.get_shape()[i] 254 | if maybe is not None: 255 | return maybe 256 | else: 257 | return tf.shape(v)[i] 258 | 259 | @U.in_session 260 | def test_probtypes(): 261 | np.random.seed(0) 262 | 263 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 264 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 265 | validate_probtype(diag_gauss, pdparam_diag_gauss) 266 | 267 | pdparam_categorical = np.array([-.2, .3, .5]) 268 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 269 | validate_probtype(categorical, pdparam_categorical) 270 | 271 | nvec = [1,2,3] 272 | pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) 273 | multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 274 | validate_probtype(multicategorical, pdparam_multicategorical) 275 | 276 | pdparam_bernoulli = np.array([-.2, .3, .5]) 277 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 278 | validate_probtype(bernoulli, pdparam_bernoulli) 279 | 280 | 281 | def validate_probtype(probtype, pdparam): 282 | N = 100000 283 | # Check to see if mean negative log likelihood == differential entropy 284 | Mval = np.repeat(pdparam[None, :], N, axis=0) 285 | M = probtype.param_placeholder([N]) 286 | X = probtype.sample_placeholder([N]) 287 | pd = probtype.pdfromflat(M) 288 | calcloglik = U.function([X, M], pd.logp(X)) 289 | calcent = U.function([M], pd.entropy()) 290 | Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) 291 | logliks = calcloglik(Xval, Mval) 292 | entval_ll = - logliks.mean() #pylint: disable=E1101 293 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 294 | entval = calcent(Mval).mean() #pylint: disable=E1101 295 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 296 | 297 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 298 | M2 = probtype.param_placeholder([N]) 299 | pd2 = probtype.pdfromflat(M2) 300 | q = pdparam + np.random.randn(pdparam.size) * 0.1 301 | Mval2 = np.repeat(q[None, :], N, axis=0) 302 | calckl = U.function([M, M2], pd.kl(pd2)) 303 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 304 | logliks = calcloglik(Xval, Mval2) 305 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 306 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 307 | assert np.abs(klval - klval_ll) < 3 * 
klval_ll_stderr # within 3 sigmas 308 | print('ok on', probtype, pdparam) 309 | 310 | -------------------------------------------------------------------------------- /baselines/common/filters.py: -------------------------------------------------------------------------------- 1 | from .running_stat import RunningStat 2 | from collections import deque 3 | import numpy as np 4 | 5 | class Filter(object): 6 | def __call__(self, x, update=True): 7 | raise NotImplementedError 8 | def reset(self): 9 | pass 10 | 11 | class IdentityFilter(Filter): 12 | def __call__(self, x, update=True): 13 | return x 14 | 15 | class CompositionFilter(Filter): 16 | def __init__(self, fs): 17 | self.fs = fs 18 | def __call__(self, x, update=True): 19 | for f in self.fs: 20 | x = f(x) 21 | return x 22 | def output_shape(self, input_space): 23 | out = input_space.shape 24 | for f in self.fs: 25 | out = f.output_shape(out) 26 | return out 27 | 28 | class ZFilter(Filter): 29 | """ 30 | y = (x-mean)/std 31 | using running estimates of mean,std 32 | """ 33 | 34 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 35 | self.demean = demean 36 | self.destd = destd 37 | self.clip = clip 38 | 39 | self.rs = RunningStat(shape) 40 | 41 | def __call__(self, x, update=True): 42 | if update: self.rs.push(x) 43 | if self.demean: 44 | x = x - self.rs.mean 45 | if self.destd: 46 | x = x / (self.rs.std+1e-8) 47 | if self.clip: 48 | x = np.clip(x, -self.clip, self.clip) 49 | return x 50 | def output_shape(self, input_space): 51 | return input_space.shape 52 | 53 | class AddClock(Filter): 54 | def __init__(self): 55 | self.count = 0 56 | def reset(self): 57 | self.count = 0 58 | def __call__(self, x, update=True): 59 | return np.append(x, self.count/100.0) 60 | def output_shape(self, input_space): 61 | return (input_space.shape[0]+1,) 62 | 63 | class FlattenFilter(Filter): 64 | def __call__(self, x, update=True): 65 | return x.ravel() 66 | def output_shape(self, input_space): 67 | return (int(np.prod(input_space.shape)),) 68 | 69 | class Ind2OneHotFilter(Filter): 70 | def __init__(self, n): 71 | self.n = n 72 | def __call__(self, x, update=True): 73 | out = np.zeros(self.n) 74 | out[x] = 1 75 | return out 76 | def output_shape(self, input_space): 77 | return (input_space.n,) 78 | 79 | class DivFilter(Filter): 80 | def __init__(self, divisor): 81 | self.divisor = divisor 82 | def __call__(self, x, update=True): 83 | return x / self.divisor 84 | def output_shape(self, input_space): 85 | return input_space.shape 86 | 87 | class StackFilter(Filter): 88 | def __init__(self, length): 89 | self.stack = deque(maxlen=length) 90 | def reset(self): 91 | self.stack.clear() 92 | def __call__(self, x, update=True): 93 | self.stack.append(x) 94 | while len(self.stack) < self.stack.maxlen: 95 | self.stack.append(x) 96 | return np.concatenate(self.stack, axis=-1) 97 | def output_shape(self, input_space): 98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 99 | -------------------------------------------------------------------------------- /baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /baselines/common/misc_util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import os 4 | import pickle 5 | import random 6 | import tempfile 7 | import zipfile 8 | 9 | 10 | def zipsame(*seqs): 11 | L = len(seqs[0]) 12 | assert all(len(seq) == L for seq in seqs[1:]) 13 | return zip(*seqs) 14 | 15 | 16 | def unpack(seq, sizes): 17 | """ 18 | Unpack 'seq' into a sequence of lists, with lengths specified by 'sizes'. 19 | None = just one bare element, not a list 20 | 21 | Example: 22 | unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) 23 | """ 24 | seq = list(seq) 25 | it = iter(seq) 26 | assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) 27 | for size in sizes: 28 | if size is None: 29 | yield it.__next__() 30 | else: 31 | li = [] 32 | for _ in range(size): 33 | li.append(it.__next__()) 34 | yield li 35 | 36 | 37 | class EzPickle(object): 38 | """Objects that are pickled and unpickled via their constructor 39 | arguments. 40 | 41 | Example usage: 42 | 43 | class Dog(Animal, EzPickle): 44 | def __init__(self, furcolor, tailkind="bushy"): 45 | Animal.__init__() 46 | EzPickle.__init__(furcolor, tailkind) 47 | ... 48 | 49 | When this object is unpickled, a new Dog will be constructed by passing the provided 50 | furcolor and tailkind into the constructor. However, philosophers are still not sure 51 | whether it is still the same dog. 52 | 53 | This is generally needed only for environments which wrap C/C++ code, such as MuJoCo 54 | and Atari. 
55 | """ 56 | 57 | def __init__(self, *args, **kwargs): 58 | self._ezpickle_args = args 59 | self._ezpickle_kwargs = kwargs 60 | 61 | def __getstate__(self): 62 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 63 | 64 | def __setstate__(self, d): 65 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 66 | self.__dict__.update(out.__dict__) 67 | 68 | 69 | def set_global_seeds(i): 70 | try: 71 | import tensorflow as tf 72 | except ImportError: 73 | pass 74 | else: 75 | tf.set_random_seed(i) 76 | np.random.seed(i) 77 | random.seed(i) 78 | 79 | 80 | def pretty_eta(seconds_left): 81 | """Print the number of seconds in human readable format. 82 | 83 | Examples: 84 | 2 days 85 | 2 hours and 37 minutes 86 | less than a minute 87 | 88 | Paramters 89 | --------- 90 | seconds_left: int 91 | Number of seconds to be converted to the ETA 92 | Returns 93 | ------- 94 | eta: str 95 | String representing the pretty ETA. 96 | """ 97 | minutes_left = seconds_left // 60 98 | seconds_left %= 60 99 | hours_left = minutes_left // 60 100 | minutes_left %= 60 101 | days_left = hours_left // 24 102 | hours_left %= 24 103 | 104 | def helper(cnt, name): 105 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 106 | 107 | if days_left > 0: 108 | msg = helper(days_left, 'day') 109 | if hours_left > 0: 110 | msg += ' and ' + helper(hours_left, 'hour') 111 | return msg 112 | if hours_left > 0: 113 | msg = helper(hours_left, 'hour') 114 | if minutes_left > 0: 115 | msg += ' and ' + helper(minutes_left, 'minute') 116 | return msg 117 | if minutes_left > 0: 118 | return helper(minutes_left, 'minute') 119 | return 'less than a minute' 120 | 121 | 122 | class RunningAvg(object): 123 | def __init__(self, gamma, init_value=None): 124 | """Keep a running estimate of a quantity. This is a bit like mean 125 | but more sensitive to recent changes. 126 | 127 | Parameters 128 | ---------- 129 | gamma: float 130 | Must be between 0 and 1, where 0 is the most sensitive to recent 131 | changes. 132 | init_value: float or None 133 | Initial value of the estimate. If None, it will be set on the first update. 134 | """ 135 | self._value = init_value 136 | self._gamma = gamma 137 | 138 | def update(self, new_val): 139 | """Update the estimate. 140 | 141 | Parameters 142 | ---------- 143 | new_val: float 144 | new observated value of estimated quantity. 145 | """ 146 | if self._value is None: 147 | self._value = new_val 148 | else: 149 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 150 | 151 | def __float__(self): 152 | """Get the current estimate""" 153 | return self._value 154 | 155 | def boolean_flag(parser, name, default=False, help=None): 156 | """Add a boolean flag to argparse parser. 
157 | 158 | Parameters 159 | ---------- 160 | parser: argparse.Parser 161 | parser to add the flag to 162 | name: str 163 | -- will enable the flag, while --no- will disable it 164 | default: bool or None 165 | default value of the flag 166 | help: str 167 | help string for the flag 168 | """ 169 | dest = name.replace('-', '_') 170 | parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help) 171 | parser.add_argument("--no-" + name, action="store_false", dest=dest) 172 | 173 | 174 | def get_wrapper_by_name(env, classname): 175 | """Given an a gym environment possibly wrapped multiple times, returns a wrapper 176 | of class named classname or raises ValueError if no such wrapper was applied 177 | 178 | Parameters 179 | ---------- 180 | env: gym.Env of gym.Wrapper 181 | gym environment 182 | classname: str 183 | name of the wrapper 184 | 185 | Returns 186 | ------- 187 | wrapper: gym.Wrapper 188 | wrapper named classname 189 | """ 190 | currentenv = env 191 | while True: 192 | if classname == currentenv.class_name(): 193 | return currentenv 194 | elif isinstance(currentenv, gym.Wrapper): 195 | currentenv = currentenv.env 196 | else: 197 | raise ValueError("Couldn't find wrapper named %s" % classname) 198 | 199 | 200 | def relatively_safe_pickle_dump(obj, path, compression=False): 201 | """This is just like regular pickle dump, except from the fact that failure cases are 202 | different: 203 | 204 | - It's never possible that we end up with a pickle in corrupted state. 205 | - If a there was a different file at the path, that file will remain unchanged in the 206 | even of failure (provided that filesystem rename is atomic). 207 | - it is sometimes possible that we end up with useless temp file which needs to be 208 | deleted manually (it will be removed automatically on the next function call) 209 | 210 | The indended use case is periodic checkpoints of experiment state, such that we never 211 | corrupt previous checkpoints if the current one fails. 212 | 213 | Parameters 214 | ---------- 215 | obj: object 216 | object to pickle 217 | path: str 218 | path to the output file 219 | compression: bool 220 | if true pickle will be compressed 221 | """ 222 | temp_storage = path + ".relatively_safe" 223 | if compression: 224 | # Using gzip here would be simpler, but the size is limited to 2GB 225 | with tempfile.NamedTemporaryFile() as uncompressed_file: 226 | pickle.dump(obj, uncompressed_file) 227 | uncompressed_file.file.flush() 228 | with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: 229 | myzip.write(uncompressed_file.name, "data") 230 | else: 231 | with open(temp_storage, "wb") as f: 232 | pickle.dump(obj, f) 233 | os.rename(temp_storage, path) 234 | 235 | 236 | def pickle_load(path, compression=False): 237 | """Unpickle a possible compressed pickle. 238 | 239 | Parameters 240 | ---------- 241 | path: str 242 | path to the output file 243 | compression: bool 244 | if true assumes that pickle was compressed when created and attempts decompression. 
245 | 246 | Returns 247 | ------- 248 | obj: object 249 | the unpickled object 250 | """ 251 | 252 | if compression: 253 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 254 | with myzip.open("data") as f: 255 | return pickle.load(f) 256 | else: 257 | with open(path, "rb") as f: 258 | return pickle.load(f) 259 | -------------------------------------------------------------------------------- /baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches 
the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 3 | 4 | class RunningMeanStd(object): 5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 6 | def __init__(self, epsilon=1e-2, shape=()): 7 | 8 | self._sum = tf.get_variable( 9 | dtype=tf.float64, 10 | shape=shape, 11 | initializer=tf.constant_initializer(0.0), 12 | name="runningsum", trainable=False) 13 | self._sumsq = tf.get_variable( 14 | dtype=tf.float64, 15 | shape=shape, 16 | initializer=tf.constant_initializer(epsilon), 17 | name="runningsumsq", trainable=False) 18 | 
self._count = tf.get_variable( 19 | dtype=tf.float64, 20 | shape=(), 21 | initializer=tf.constant_initializer(epsilon), 22 | name="count", trainable=False) 23 | self.shape = shape 24 | 25 | self.mean = tf.to_float(self._sum / self._count) 26 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 27 | 28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 32 | updates=[tf.assign_add(self._sum, newsum), 33 | tf.assign_add(self._sumsq, newsumsq), 34 | tf.assign_add(self._count, newcount)]) 35 | 36 | 37 | def update(self, x): 38 | x = x.astype('float64') 39 | n = int(np.prod(self.shape)) 40 | totalvec = np.zeros(n*2+1, 'float64') 41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 44 | 45 | @U.in_session 46 | def test_runningmeanstd(): 47 | for (x1, x2, x3) in [ 48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 50 | ]: 51 | 52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 53 | U.initialize() 54 | 55 | x = np.concatenate([x1, x2, x3], axis=0) 56 | ms1 = [x.mean(axis=0), x.std(axis=0)] 57 | rms.update(x1) 58 | rms.update(x2) 59 | rms.update(x3) 60 | ms2 = [rms.mean.eval(), rms.std.eval()] 61 | 62 | assert np.allclose(ms1, ms2) 63 | 64 | @U.in_session 65 | def test_dist(): 66 | np.random.seed(0) 67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 69 | 70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 72 | 73 | comm = MPI.COMM_WORLD 74 | assert comm.Get_size()==2 75 | if comm.Get_rank()==0: 76 | x1,x2,x3 = p1,p2,p3 77 | elif comm.Get_rank()==1: 78 | x1,x2,x3 = q1,q2,q3 79 | else: 80 | assert False 81 | 82 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 83 | U.initialize() 84 | 85 | rms.update(x1) 86 | rms.update(x2) 87 | rms.update(x3) 88 | 89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 90 | 91 | def checkallclose(x,y): 92 | print(x,y) 93 | return np.allclose(x,y) 94 | 95 | assert checkallclose( 96 | bigvec.mean(axis=0), 97 | rms.mean.eval(), 98 | ) 99 | assert checkallclose( 100 | bigvec.std(axis=0), 101 | rms.std.eval(), 102 | ) 103 | 104 | 105 | if __name__ == "__main__": 106 | # Run with mpirun -np 2 python 107 | test_dist() 108 | -------------------------------------------------------------------------------- /baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | nenv = env.num_envs 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 
13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /baselines/common/running_mean_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | class RunningMeanStd(object): 3 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 4 | def __init__(self, epsilon=1e-4, shape=()): 5 | self.mean = np.zeros(shape, 'float64') 6 | self.var = np.ones(shape, 'float64') 7 | self.count = epsilon 8 | 9 | def update(self, x): 10 | batch_mean = np.mean(x, axis=0) 11 | batch_var = np.var(x, axis=0) 12 | batch_count = x.shape[0] 13 | self.update_from_moments(batch_mean, batch_var, batch_count) 14 | 15 | def update_from_moments(self, batch_mean, batch_var, batch_count): 16 | delta = batch_mean - self.mean 17 | tot_count = self.count + batch_count 18 | 19 | new_mean = self.mean + delta * batch_count / tot_count 20 | m_a = self.var * (self.count) 21 | m_b = batch_var * (batch_count) 22 | M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 23 | new_var = M2 / (self.count + batch_count) 24 | 25 | new_count = batch_count + self.count 26 | 27 | self.mean = new_mean 28 | self.var = new_var 29 | self.count = new_count 30 | 31 | def test_runningmeanstd(): 32 | for (x1, x2, x3) in [ 33 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 34 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 35 | ]: 36 | 37 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 38 | 39 | x = np.concatenate([x1, x2, x3], axis=0) 40 | ms1 = [x.mean(axis=0), x.var(axis=0)] 41 | rms.update(x1) 42 | rms.update(x2) 43 | rms.update(x3) 44 | ms2 = [rms.mean, rms.var] 45 | 46 | assert np.allclose(ms1, ms2) 47 | -------------------------------------------------------------------------------- /baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] 
= self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon-greedy exploration strategy 5 | - beta parameter for prioritized experience replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meaning that the schedule should output 44 | `value` when `t == time`. All the times must be sorted in 45 | increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)` such that `time_a <= t < time_b`, value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is the fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes the values to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for an example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals specified in 55 | `endpoints` this value is returned. If None then an AssertionError is 56 | raised when an outside value is requested.
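        Illustrative example (added, not part of the original docstring): with the
        default linear interpolation, PiecewiseSchedule([(0, 1.0), (100, 0.1)]).value(50)
        returns 0.55, and value(200) returns `outside_value` (or raises an AssertionError
        if `outside_value` is None).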
57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps have passed, final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as a regular array, but with two 11 | important differences: 12 | 13 | a) setting an item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient (O(log segment size)) 16 | `reduce` operation which reduces `operation` over 17 | a contiguous subsequence of items in the array. 18 | 19 | Parameters 20 | ---------- 21 | capacity: int 22 | Total size of the array - must be a power of two. 23 | operation: lambda obj, obj -> obj 24 | an operation for combining elements (e.g. sum, max); 25 | it must be associative over the set of 26 | possible values for array elements, with `neutral_element` as its identity 27 | neutral_element: obj 28 | neutral element for the operation above, e.g. float('-inf') 29 | for max and 0 for sum. 30 | """ 31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
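        # Layout note (added for clarity): the tree is stored as a flat list of length
        # 2 * capacity; index 1 is the root, node i has children 2*i and 2*i + 1, and
        # leaf j lives at index capacity + j. For example, with capacity=4 and a sum
        # operation, setting tree[2] = 1.0 and tree[3] = 3.0 yields the internal list
        # [0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 1.0, 3.0] (index 0 is unused).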
32 | self._capacity = capacity 33 | self._value = [neutral_element for _ in range(2 * capacity)] 34 | self._operation = operation 35 | 36 | def _reduce_helper(self, start, end, node, node_start, node_end): 37 | if start == node_start and end == node_end: 38 | return self._value[node] 39 | mid = (node_start + node_end) // 2 40 | if end <= mid: 41 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 42 | else: 43 | if mid + 1 <= start: 44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 45 | else: 46 | return self._operation( 47 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 49 | ) 50 | 51 | def reduce(self, start=0, end=None): 52 | """Returns result of applying `self.operation` 53 | to a contiguous subsequence of the array. 54 | 55 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1]))) 56 | 57 | Parameters 58 | ---------- 59 | start: int 60 | beginning of the subsequence 61 | end: int 62 | end of the subsequence (exclusive); None means the end of the array, and negative values count from the end, as in Python slicing 63 | 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end - 1] (i.e. `end` is exclusive)""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum 108 | 109 | If array values are probabilities, this function 110 | allows sampling indexes according to the discrete 111 | probability distribution efficiently.
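        Illustrative example (added): if the array is [0.5, 1.0, 1.0, 3.0], the prefix
        sums are [0.5, 1.5, 2.5, 5.5], so find_prefixsum_idx(0.55) returns 1 and
        find_prefixsum_idx(1.51) returns 2.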
112 | 113 | Parameters 114 | ---------- 115 | perfixsum: float 116 | upperbound on the sum of array prefix 117 | 118 | Returns 119 | ------- 120 | idx: int 121 | highest index satisfying the prefixsum constraint 122 | """ 123 | assert 0 <= prefixsum <= self.sum() + 1e-5 124 | idx = 1 125 | while idx < self._capacity: # while non-leaf 126 | if self._value[2 * idx] > prefixsum: 127 | idx = 2 * idx 128 | else: 129 | prefixsum -= self._value[2 * idx] 130 | idx = 2 * idx + 1 131 | return idx - self._capacity 132 | 133 | 134 | class MinSegmentTree(SegmentTree): 135 | def __init__(self, capacity): 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, 138 | operation=min, 139 | neutral_element=float('inf') 140 | ) 141 | 142 | def min(self, start=0, end=None): 143 | """Returns min(arr[start], ..., arr[end])""" 144 | 145 | return super(MinSegmentTree, self).reduce(start, end) 146 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 
54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /baselines/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | import copy 4 | import os 5 | import functools 6 | import collections 7 | import multiprocessing 8 | 9 | def switch(condition, then_expression, else_expression): 10 | """Switches between two operations depending on a scalar value (int or bool). 11 | Note that both `then_expression` and `else_expression` 12 | should be symbolic tensors of the *same shape*. 13 | 14 | # Arguments 15 | condition: scalar tensor. 16 | then_expression: TensorFlow operation. 17 | else_expression: TensorFlow operation. 
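    Illustrative example (added; the tensor names are made up):
        use_target = tf.placeholder(tf.bool, ())
        q = switch(use_target, target_q_tensor, main_q_tensor)  # both branches must have the same shape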
18 | """ 19 | x_shape = copy.copy(then_expression.get_shape()) 20 | x = tf.cond(tf.cast(condition, 'bool'), 21 | lambda: then_expression, 22 | lambda: else_expression) 23 | x.set_shape(x_shape) 24 | return x 25 | 26 | # ================================================================ 27 | # Extras 28 | # ================================================================ 29 | 30 | def lrelu(x, leak=0.2): 31 | f1 = 0.5 * (1 + leak) 32 | f2 = 0.5 * (1 - leak) 33 | return f1 * x + f2 * abs(x) 34 | 35 | # ================================================================ 36 | # Mathematical utils 37 | # ================================================================ 38 | 39 | def huber_loss(x, delta=1.0): 40 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 41 | return tf.where( 42 | tf.abs(x) < delta, 43 | tf.square(x) * 0.5, 44 | delta * (tf.abs(x) - 0.5 * delta) 45 | ) 46 | 47 | # ================================================================ 48 | # Global session 49 | # ================================================================ 50 | 51 | def make_session(num_cpu=None, make_default=False, graph=None): 52 | """Returns a session that will use CPU's only""" 53 | if num_cpu is None: 54 | num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) 55 | tf_config = tf.ConfigProto( 56 | inter_op_parallelism_threads=num_cpu, 57 | intra_op_parallelism_threads=num_cpu) 58 | tf_config.gpu_options.allocator_type = 'BFC' 59 | if make_default: 60 | return tf.InteractiveSession(config=tf_config, graph=graph) 61 | else: 62 | return tf.Session(config=tf_config, graph=graph) 63 | 64 | def single_threaded_session(): 65 | """Returns a session which will only use a single CPU""" 66 | return make_session(num_cpu=1) 67 | 68 | def in_session(f): 69 | @functools.wraps(f) 70 | def newfunc(*args, **kwargs): 71 | with tf.Session(): 72 | f(*args, **kwargs) 73 | return newfunc 74 | 75 | ALREADY_INITIALIZED = set() 76 | 77 | def initialize(): 78 | """Initialize all the uninitialized variables in the global scope.""" 79 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 80 | tf.get_default_session().run(tf.variables_initializer(new_variables)) 81 | ALREADY_INITIALIZED.update(new_variables) 82 | 83 | # ================================================================ 84 | # Model components 85 | # ================================================================ 86 | 87 | def normc_initializer(std=1.0, axis=0): 88 | def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 89 | out = np.random.randn(*shape).astype(np.float32) 90 | out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) 91 | return tf.constant(out) 92 | return _initializer 93 | 94 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 95 | summary_tag=None): 96 | with tf.variable_scope(name): 97 | stride_shape = [1, stride[0], stride[1], 1] 98 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 99 | 100 | # there are "num input feature maps * filter height * filter width" 101 | # inputs to each hidden unit 102 | fan_in = intprod(filter_shape[:3]) 103 | # each unit in the lower layer receives a gradient from: 104 | # "num output feature maps * filter height * filter width" / 105 | # pooling size 106 | fan_out = intprod(filter_shape[:2]) * num_filters 107 | # initialize weights with random weights 108 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 109 | 110 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 111 | collections=collections) 112 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), 113 | collections=collections) 114 | 115 | if summary_tag is not None: 116 | tf.summary.image(summary_tag, 117 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 118 | [2, 0, 1, 3]), 119 | max_images=10) 120 | 121 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 122 | 123 | # ================================================================ 124 | # Theano-like Function 125 | # ================================================================ 126 | 127 | def function(inputs, outputs, updates=None, givens=None): 128 | """Just like Theano function. Take a bunch of tensorflow placeholders and expressions 129 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 130 | values to be fed to the input's placeholders and produces the values of the expressions 131 | in outputs. 132 | 133 | Input values can be passed in the same order as inputs or can be provided as kwargs based 134 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 135 | 136 | Example: 137 | x = tf.placeholder(tf.int32, (), name="x") 138 | y = tf.placeholder(tf.int32, (), name="y") 139 | z = 3 * x + 2 * y 140 | lin = function([x, y], z, givens={y: 0}) 141 | 142 | with single_threaded_session(): 143 | initialize() 144 | 145 | assert lin(2) == 6 146 | assert lin(x=3) == 9 147 | assert lin(2, 2) == 10 148 | assert lin(x=2, y=3) == 12 149 | 150 | Parameters 151 | ---------- 152 | inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method] 153 | list of input arguments 154 | outputs: [tf.Variable] or tf.Variable 155 | list of outputs or a single output to be returned from function. Returned 156 | value will also have the same shape. 157 | """ 158 | if isinstance(outputs, list): 159 | return _Function(inputs, outputs, updates, givens=givens) 160 | elif isinstance(outputs, (dict, collections.OrderedDict)): 161 | f = _Function(inputs, outputs.values(), updates, givens=givens) 162 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 163 | else: 164 | f = _Function(inputs, [outputs], updates, givens=givens) 165 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 166 | 167 | 168 | class _Function(object): 169 | def __init__(self, inputs, outputs, updates, givens): 170 | for inpt in inputs: 171 | if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0): 172 | assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" 173 | self.inputs = inputs 174 | updates = updates or [] 175 | self.update_group = tf.group(*updates) 176 | self.outputs_update = list(outputs) + [self.update_group] 177 | self.givens = {} if givens is None else givens 178 | 179 | def _feed_input(self, feed_dict, inpt, value): 180 | if hasattr(inpt, 'make_feed_dict'): 181 | feed_dict.update(inpt.make_feed_dict(value)) 182 | else: 183 | feed_dict[inpt] = value 184 | 185 | def __call__(self, *args): 186 | assert len(args) <= len(self.inputs), "Too many arguments provided" 187 | feed_dict = {} 188 | # Update the args 189 | for inpt, value in zip(self.inputs, args): 190 | self._feed_input(feed_dict, inpt, value) 191 | # Update feed dict with givens. 
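        # Added note: a given acts as a default value -- it is only used when the caller
        # did not already feed that placeholder, which is what feed_dict.get(...) below implements.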
192 | for inpt in self.givens: 193 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 194 | results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 195 | return results 196 | 197 | # ================================================================ 198 | # Flat vectors 199 | # ================================================================ 200 | 201 | def var_shape(x): 202 | out = x.get_shape().as_list() 203 | assert all(isinstance(a, int) for a in out), \ 204 | "shape function assumes that shape is fully known" 205 | return out 206 | 207 | def numel(x): 208 | return intprod(var_shape(x)) 209 | 210 | def intprod(x): 211 | return int(np.prod(x)) 212 | 213 | def flatgrad(loss, var_list, clip_norm=None): 214 | grads = tf.gradients(loss, var_list) 215 | if clip_norm is not None: 216 | grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] 217 | return tf.concat(axis=0, values=[ 218 | tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) 219 | for (v, grad) in zip(var_list, grads) 220 | ]) 221 | 222 | class SetFromFlat(object): 223 | def __init__(self, var_list, dtype=tf.float32): 224 | assigns = [] 225 | shapes = list(map(var_shape, var_list)) 226 | total_size = np.sum([intprod(shape) for shape in shapes]) 227 | 228 | self.theta = theta = tf.placeholder(dtype, [total_size]) 229 | start = 0 230 | assigns = [] 231 | for (shape, v) in zip(shapes, var_list): 232 | size = intprod(shape) 233 | assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) 234 | start += size 235 | self.op = tf.group(*assigns) 236 | 237 | def __call__(self, theta): 238 | tf.get_default_session().run(self.op, feed_dict={self.theta: theta}) 239 | 240 | class GetFlat(object): 241 | def __init__(self, var_list): 242 | self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) 243 | 244 | def __call__(self): 245 | return tf.get_default_session().run(self.op) 246 | 247 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 248 | 249 | def get_placeholder(name, dtype, shape): 250 | if name in _PLACEHOLDER_CACHE: 251 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 252 | assert dtype1 == dtype and shape1 == shape 253 | return out 254 | else: 255 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 256 | _PLACEHOLDER_CACHE[name] = (out, dtype, shape) 257 | return out 258 | 259 | def get_placeholder_cached(name): 260 | return _PLACEHOLDER_CACHE[name][0] 261 | 262 | def flattenallbut0(x): 263 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 264 | 265 | 266 | # ================================================================ 267 | # Diagnostics 268 | # ================================================================ 269 | 270 | def display_var_info(vars): 271 | from baselines import logger 272 | count_params = 0 273 | for v in vars: 274 | name = v.name 275 | if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue 276 | v_params = np.prod(v.shape.as_list()) 277 | count_params += v_params 278 | if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print 279 | logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape))) 280 | 281 | logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) 282 | -------------------------------------------------------------------------------- /baselines/common/vec_env/__init__.py: 
-------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from baselines import logger 3 | 4 | class AlreadySteppingError(Exception): 5 | """ 6 | Raised when an asynchronous step is running while 7 | step_async() is called again. 8 | """ 9 | def __init__(self): 10 | msg = 'already running an async step' 11 | Exception.__init__(self, msg) 12 | 13 | class NotSteppingError(Exception): 14 | """ 15 | Raised when an asynchronous step is not running but 16 | step_wait() is called. 17 | """ 18 | def __init__(self): 19 | msg = 'not running an async step' 20 | Exception.__init__(self, msg) 21 | 22 | class VecEnv(ABC): 23 | """ 24 | An abstract asynchronous, vectorized environment. 25 | """ 26 | def __init__(self, num_envs, observation_space, action_space): 27 | self.num_envs = num_envs 28 | self.observation_space = observation_space 29 | self.action_space = action_space 30 | 31 | @abstractmethod 32 | def reset(self): 33 | """ 34 | Reset all the environments and return an array of 35 | observations, or a tuple of observation arrays. 36 | 37 | If step_async is still doing work, that work will 38 | be cancelled and step_wait() should not be called 39 | until step_async() is invoked again. 40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | def step_async(self, actions): 45 | """ 46 | Tell all the environments to start taking a step 47 | with the given actions. 48 | Call step_wait() to get the results of the step. 49 | 50 | You should not call this if a step_async run is 51 | already pending. 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def step_wait(self): 57 | """ 58 | Wait for the step taken with step_async(). 59 | 60 | Returns (obs, rews, dones, infos): 61 | - obs: an array of observations, or a tuple of 62 | arrays of observations. 63 | - rews: an array of rewards 64 | - dones: an array of "episode done" booleans 65 | - infos: a sequence of info objects 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | def close(self): 71 | """ 72 | Clean up the environments' resources. 
73 | """ 74 | pass 75 | 76 | def step(self, actions): 77 | self.step_async(actions) 78 | return self.step_wait() 79 | 80 | def render(self): 81 | logger.warn('Render not defined for %s'%self) 82 | 83 | @property 84 | def unwrapped(self): 85 | if isinstance(self, VecEnvWrapper): 86 | return self.venv.unwrapped 87 | else: 88 | return self 89 | 90 | class VecEnvWrapper(VecEnv): 91 | def __init__(self, venv, observation_space=None, action_space=None): 92 | self.venv = venv 93 | VecEnv.__init__(self, 94 | num_envs=venv.num_envs, 95 | observation_space=observation_space or venv.observation_space, 96 | action_space=action_space or venv.action_space) 97 | 98 | def step_async(self, actions): 99 | self.venv.step_async(actions) 100 | 101 | @abstractmethod 102 | def reset(self): 103 | pass 104 | 105 | @abstractmethod 106 | def step_wait(self): 107 | pass 108 | 109 | def close(self): 110 | return self.venv.close() 111 | 112 | def render(self): 113 | self.venv.render() 114 | 115 | class CloudpickleWrapper(object): 116 | """ 117 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 118 | """ 119 | def __init__(self, x): 120 | self.x = x 121 | def __getstate__(self): 122 | import cloudpickle 123 | return cloudpickle.dumps(self.x) 124 | def __setstate__(self, ob): 125 | import pickle 126 | self.x = pickle.loads(ob) 127 | -------------------------------------------------------------------------------- /baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from collections import OrderedDict 4 | from . import VecEnv 5 | 6 | class DummyVecEnv(VecEnv): 7 | def __init__(self, env_fns): 8 | self.envs = [fn() for fn in env_fns] 9 | env = self.envs[0] 10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 11 | shapes, dtypes = {}, {} 12 | self.keys = [] 13 | obs_space = env.observation_space 14 | if isinstance(obs_space, spaces.Dict): 15 | assert isinstance(obs_space.spaces, OrderedDict) 16 | for key, box in obs_space.spaces.items(): 17 | assert isinstance(box, spaces.Box) 18 | shapes[key] = box.shape 19 | dtypes[key] = box.dtype 20 | self.keys.append(key) 21 | else: 22 | box = obs_space 23 | assert isinstance(box, spaces.Box) 24 | self.keys = [None] 25 | shapes, dtypes = { None: box.shape }, { None: box.dtype } 26 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 27 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 28 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 29 | self.buf_infos = [{} for _ in range(self.num_envs)] 30 | self.actions = None 31 | 32 | def step_async(self, actions): 33 | self.actions = actions 34 | 35 | def step_wait(self): 36 | for e in range(self.num_envs): 37 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e]) 38 | if self.buf_dones[e]: 39 | obs = self.envs[e].reset() 40 | self._save_obs(e, obs) 41 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 42 | self.buf_infos.copy()) 43 | 44 | def reset(self): 45 | for e in range(self.num_envs): 46 | obs = self.envs[e].reset() 47 | self._save_obs(e, obs) 48 | return self._obs_from_buf() 49 | 50 | def close(self): 51 | return 52 | 53 | def _save_obs(self, e, obs): 54 | for k in self.keys: 55 | if k is None: 56 | self.buf_obs[k][e] = obs 57 | else: 58 | self.buf_obs[k][e] = obs[k] 59 | 60 | def 
_obs_from_buf(self): 61 | if self.keys==[None]: 62 | return self.buf_obs[None] 63 | else: 64 | return self.buf_obs 65 | -------------------------------------------------------------------------------- /baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 4 | 5 | 6 | def worker(remote, parent_remote, env_fn_wrapper): 7 | parent_remote.close() 8 | env = env_fn_wrapper.x() 9 | while True: 10 | cmd, data = remote.recv() 11 | if cmd == 'step': 12 | ob, reward, done, info = env.step(data) 13 | if done: 14 | ob = env.reset() 15 | remote.send((ob, reward, done, info)) 16 | elif cmd == 'reset': 17 | ob = env.reset() 18 | remote.send(ob) 19 | elif cmd == 'reset_task': 20 | ob = env.reset_task() 21 | remote.send(ob) 22 | elif cmd == 'close': 23 | remote.close() 24 | break 25 | elif cmd == 'get_spaces': 26 | remote.send((env.observation_space, env.action_space)) 27 | else: 28 | raise NotImplementedError 29 | 30 | 31 | class SubprocVecEnv(VecEnv): 32 | def __init__(self, env_fns, spaces=None): 33 | """ 34 | envs: list of gym environments to run in subprocesses 35 | """ 36 | self.waiting = False 37 | self.closed = False 38 | nenvs = len(env_fns) 39 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 40 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 41 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 42 | for p in self.ps: 43 | p.daemon = True # if the main process crashes, we should not cause things to hang 44 | p.start() 45 | for remote in self.work_remotes: 46 | remote.close() 47 | 48 | self.remotes[0].send(('get_spaces', None)) 49 | observation_space, action_space = self.remotes[0].recv() 50 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 51 | 52 | def step_async(self, actions): 53 | for remote, action in zip(self.remotes, actions): 54 | remote.send(('step', action)) 55 | self.waiting = True 56 | 57 | def step_wait(self): 58 | results = [remote.recv() for remote in self.remotes] 59 | self.waiting = False 60 | obs, rews, dones, infos = zip(*results) 61 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 62 | 63 | def reset(self): 64 | for remote in self.remotes: 65 | remote.send(('reset', None)) 66 | return np.stack([remote.recv() for remote in self.remotes]) 67 | 68 | def reset_task(self): 69 | for remote in self.remotes: 70 | remote.send(('reset_task', None)) 71 | return np.stack([remote.recv() for remote in self.remotes]) 72 | 73 | def close(self): 74 | if self.closed: 75 | return 76 | if self.waiting: 77 | for remote in self.remotes: 78 | remote.recv() 79 | for remote in self.remotes: 80 | remote.send(('close', None)) 81 | for p in self.ps: 82 | p.join() 83 | self.closed = True 84 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | class VecFrameStack(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, nstack): 10 | self.venv = venv 11 | self.nstack = nstack 12 | wos = venv.observation_space # wrapped ob space 13 | low = np.repeat(wos.low, 
self.nstack, axis=-1) 14 | high = np.repeat(wos.high, self.nstack, axis=-1) 15 | self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype) 16 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 17 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 18 | 19 | def step_wait(self): 20 | obs, rews, news, infos = self.venv.step_wait() 21 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 22 | for (i, new) in enumerate(news): 23 | if new: 24 | self.stackedobs[i] = 0 25 | self.stackedobs[..., -obs.shape[-1]:] = obs 26 | return self.stackedobs, rews, news, infos 27 | 28 | def reset(self): 29 | """ 30 | Reset all environments 31 | """ 32 | obs = self.venv.reset() 33 | self.stackedobs[...] = 0 34 | self.stackedobs[..., -obs.shape[-1]:] = obs 35 | return self.stackedobs 36 | 37 | def close(self): 38 | self.venv.close() 39 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | class VecNormalize(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 10 | VecEnvWrapper.__init__(self, venv) 11 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 12 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 13 | self.clipob = clipob 14 | self.cliprew = cliprew 15 | self.ret = np.zeros(self.num_envs) 16 | self.gamma = gamma 17 | self.epsilon = epsilon 18 | 19 | def step_wait(self): 20 | """ 21 | Apply sequence of actions to sequence of environments 22 | actions -> (observations, rewards, news) 23 | 24 | where 'news' is a boolean vector indicating whether each element is new. 25 | """ 26 | obs, rews, news, infos = self.venv.step_wait() 27 | self.ret = self.ret * self.gamma + rews 28 | obs = self._obfilt(obs) 29 | if self.ret_rms: 30 | self.ret_rms.update(self.ret) 31 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 32 | return obs, rews, news, infos 33 | 34 | def _obfilt(self, obs): 35 | if self.ob_rms: 36 | self.ob_rms.update(obs) 37 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 38 | return obs 39 | else: 40 | return obs 41 | 42 | def reset(self): 43 | """ 44 | Reset all environments 45 | """ 46 | obs = self.venv.reset() 47 | return self._obfilt(obs) 48 | -------------------------------------------------------------------------------- /baselines/her/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruizhaogit/EnergyBasedPrioritization/2fd2f5bab0547848f4f76b837d16238435518dcc/baselines/her/__init__.py -------------------------------------------------------------------------------- /baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from baselines.her.util import store_args, nn 3 | 4 | 5 | class ActorCritic: 6 | @store_args 7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, 8 | **kwargs): 9 | """The actor-critic network and related training code. 
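    Summary added for clarity: the constructor builds three tensors consumed by DDPG --
    `pi_tf` (the deterministic policy, squashed by tanh and scaled by max_u), `Q_pi_tf`
    (the critic evaluated at the policy's own action, used for the actor loss) and
    `Q_tf` (the critic evaluated at the replayed action, used for the critic loss).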
10 | 11 | Args: 12 | inputs_tf (dict of tensors): all necessary inputs for the network: the 13 | observation (o), the goal (g), and the action (u) 14 | dimo (int): the dimension of the observations 15 | dimg (int): the dimension of the goals 16 | dimu (int): the dimension of the actions 17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled 18 | accordingly 19 | o_stats (baselines.her.Normalizer): normalizer for observations 20 | g_stats (baselines.her.Normalizer): normalizer for goals 21 | hidden (int): number of hidden units that should be used in hidden layers 22 | layers (int): number of hidden layers 23 | """ 24 | self.o_tf = inputs_tf['o'] 25 | self.g_tf = inputs_tf['g'] 26 | self.u_tf = inputs_tf['u'] 27 | 28 | # Prepare inputs for actor and critic. 29 | o = self.o_stats.normalize(self.o_tf) 30 | g = self.g_stats.normalize(self.g_tf) 31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor 32 | 33 | # Networks. 34 | with tf.variable_scope('pi'): 35 | self.pi_tf = self.max_u * tf.tanh(nn( 36 | input_pi, [self.hidden] * self.layers + [self.dimu])) 37 | with tf.variable_scope('Q'): 38 | # for policy training 39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) 40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) 41 | # for critic training 42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) 43 | self._input_Q = input_Q # exposed for tests 44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) 45 | -------------------------------------------------------------------------------- /baselines/her/ddpg.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.contrib.staging import StagingArea 5 | from baselines import logger 6 | from baselines.her.util import ( 7 | import_function, store_args, flatten_grads, transitions_in_episode_batch) 8 | from baselines.her.normalizer import Normalizer 9 | from baselines.her.replay_buffer import ReplayBuffer, ReplayBufferEnergy, PrioritizedReplayBuffer 10 | from baselines.common.mpi_adam import MpiAdam 11 | import baselines.common.tf_util as U 12 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 13 | import json 14 | 15 | 16 | 17 | def dims_to_shapes(input_dims): 18 | return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()} 19 | 20 | 21 | class DDPG(object): 22 | @store_args 23 | def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, 24 | Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, 25 | rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, 26 | sample_transitions, gamma, temperature, prioritization, env_name, 27 | alpha, beta0, beta_iters, eps, max_timesteps, rank_method, reuse=False, **kwargs): 28 | """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). 29 | 30 | Args: 31 | input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the 32 | actions (u) 33 | buffer_size (int): number of transitions that are stored in the replay buffer 34 | hidden (int): number of units in the hidden layers 35 | layers (int): number of hidden layers 36 | network_class (str): the network class that should be used (e.g. 
'baselines.her.ActorCritic') 37 | polyak (float): coefficient for Polyak-averaging of the target network 38 | batch_size (int): batch size for training 39 | Q_lr (float): learning rate for the Q (critic) network 40 | pi_lr (float): learning rate for the pi (actor) network 41 | norm_eps (float): a small value used in the normalizer to avoid numerical instabilities 42 | norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] 43 | max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] 44 | action_l2 (float): coefficient for L2 penalty on the actions 45 | clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] 46 | scope (str): the scope used for the TensorFlow graph 47 | T (int): the time horizon for rollouts 48 | rollout_batch_size (int): number of parallel rollouts per DDPG agent 49 | subtract_goals (function): function that subtracts goals from each other 50 | relative_goals (boolean): whether or not relative goals should be fed into the network 51 | clip_pos_returns (boolean): whether or not positive returns should be clipped 52 | clip_return (float): clip returns to be in [-clip_return, clip_return] 53 | sample_transitions (function) function that samples from the replay buffer 54 | gamma (float): gamma used for Q learning updates 55 | reuse (boolean): whether or not the networks should be reused 56 | """ 57 | if self.clip_return is None: 58 | self.clip_return = np.inf 59 | 60 | self.create_actor_critic = import_function(self.network_class) 61 | 62 | input_shapes = dims_to_shapes(self.input_dims) 63 | self.dimo = self.input_dims['o'] 64 | self.dimg = self.input_dims['g'] 65 | self.dimu = self.input_dims['u'] 66 | 67 | self.prioritization = prioritization 68 | self.env_name = env_name 69 | self.temperature = temperature 70 | self.rank_method = rank_method 71 | 72 | # Prepare staging area for feeding data to the model. 73 | stage_shapes = OrderedDict() 74 | for key in sorted(self.input_dims.keys()): 75 | if key.startswith('info_'): 76 | continue 77 | stage_shapes[key] = (None, *input_shapes[key]) 78 | for key in ['o', 'g']: 79 | stage_shapes[key + '_2'] = stage_shapes[key] 80 | stage_shapes['r'] = (None,) 81 | stage_shapes['w'] = (None,) 82 | self.stage_shapes = stage_shapes 83 | 84 | # Create network. 85 | with tf.variable_scope(self.scope): 86 | self.staging_tf = StagingArea( 87 | dtypes=[tf.float32 for _ in self.stage_shapes.keys()], 88 | shapes=list(self.stage_shapes.values())) 89 | self.buffer_ph_tf = [ 90 | tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] 91 | self.stage_op = self.staging_tf.put(self.buffer_ph_tf) 92 | 93 | self._create_network(reuse=reuse) 94 | 95 | # Configure the replay buffer. 
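        # Added note: buffer_shapes maps each key to (time, dim); 'o' and 'ag' keep T+1
        # steps per episode while the other keys keep T, and buffer_size is rounded down
        # to a whole multiple of rollout_batch_size so that complete rollouts always fit.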
96 | buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) 97 | for key, val in input_shapes.items()} 98 | buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) 99 | buffer_shapes['ag'] = (self.T+1, self.dimg) 100 | buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size 101 | 102 | if self.prioritization == 'energy': 103 | self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T, self.sample_transitions, 104 | self.prioritization, self.env_name) 105 | elif self.prioritization == 'tderror': 106 | self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha, self.env_name) 107 | if beta_iters is None: 108 | beta_iters = max_timesteps 109 | self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0) 110 | else: 111 | self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) 112 | 113 | def _random_action(self, n): 114 | return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) 115 | 116 | def _preprocess_og(self, o, ag, g): 117 | if self.relative_goals: 118 | g_shape = g.shape 119 | g = g.reshape(-1, self.dimg) 120 | ag = ag.reshape(-1, self.dimg) 121 | g = self.subtract_goals(g, ag) 122 | g = g.reshape(*g_shape) 123 | o = np.clip(o, -self.clip_obs, self.clip_obs) 124 | g = np.clip(g, -self.clip_obs, self.clip_obs) 125 | return o, g 126 | 127 | def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, 128 | compute_Q=False): 129 | o, g = self._preprocess_og(o, ag, g) 130 | policy = self.target if use_target_net else self.main 131 | # values to compute 132 | vals = [policy.pi_tf] 133 | if compute_Q: 134 | vals += [policy.Q_pi_tf] 135 | # feed 136 | feed = { 137 | policy.o_tf: o.reshape(-1, self.dimo), 138 | policy.g_tf: g.reshape(-1, self.dimg), 139 | policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) 140 | } 141 | 142 | ret = self.sess.run(vals, feed_dict=feed) 143 | 144 | # action postprocessing 145 | u = ret[0] 146 | noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise 147 | u += noise 148 | u = np.clip(u, -self.max_u, self.max_u) 149 | u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy 150 | if u.shape[0] == 1: 151 | u = u[0] 152 | u = u.copy() 153 | ret[0] = u 154 | 155 | if len(ret) == 1: 156 | return ret[0] 157 | else: 158 | return ret 159 | 160 | def get_td_errors(self, o, g, u): 161 | o, g = self._preprocess_og(o, g, g) 162 | vals = [self.td_error_tf] 163 | r = np.ones((o.reshape(-1, self.dimo).shape[0],1)) 164 | 165 | feed = { 166 | self.target.o_tf: o.reshape(-1, self.dimo), 167 | self.target.g_tf: g.reshape(-1, self.dimg), 168 | self.bath_tf_r: r, 169 | self.main.o_tf: o.reshape(-1, self.dimo), 170 | self.main.g_tf: g.reshape(-1, self.dimg), 171 | self.main.u_tf: u.reshape(-1, self.dimu) 172 | } 173 | td_errors = self.sess.run(vals, feed_dict=feed) 174 | td_errors = td_errors.copy() 175 | 176 | return td_errors 177 | 178 | def store_episode(self, episode_batch, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy, update_stats=True): 179 | """ 180 | episode_batch: array of batch_size x (T or T+1) x dim_key 181 | 'o' is of size T+1, others are of size T 182 | """ 183 | if self.prioritization == 'tderror': 184 | self.buffer.store_episode(episode_batch, dump_buffer) 185 | elif self.prioritization == 'energy': 186 | 
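        # Added note (summarizing the EBP paper): the energy-based buffer derives a trajectory
        # energy from the achieved-goal sequence, combining potential, linear-kinetic and
        # rotational-kinetic terms weighted by w_potential / w_linear / w_rotational, clips it
        # at clip_energy, and later samples episodes with probability proportional to that
        # energy (or to its rank, depending on rank_method).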
self.buffer.store_episode(episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy) 187 | else: 188 | self.buffer.store_episode(episode_batch) 189 | 190 | if update_stats: 191 | # add transitions to normalizer 192 | episode_batch['o_2'] = episode_batch['o'][:, 1:, :] 193 | episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] 194 | num_normalizing_transitions = transitions_in_episode_batch(episode_batch) 195 | 196 | if self.prioritization == 'energy': 197 | if not self.buffer.current_size==0 and not len(episode_batch['ag'])==0: 198 | transitions = self.sample_transitions(episode_batch, num_normalizing_transitions, 'none', 1.0, True) 199 | elif self.prioritization == 'tderror': 200 | transitions, weights, episode_idxs = \ 201 | self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0) 202 | else: 203 | transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) 204 | 205 | 206 | o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] 207 | transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) 208 | 209 | self.o_stats.update(transitions['o']) 210 | self.g_stats.update(transitions['g']) 211 | 212 | self.o_stats.recompute_stats() 213 | self.g_stats.recompute_stats() 214 | 215 | def get_current_buffer_size(self): 216 | return self.buffer.get_current_size() 217 | 218 | def dump_buffer(self, epoch): 219 | self.buffer.dump_buffer(epoch) 220 | 221 | def _sync_optimizers(self): 222 | self.Q_adam.sync() 223 | self.pi_adam.sync() 224 | 225 | def _grads(self): 226 | # Avoid feed_dict here for performance! 227 | critic_loss, actor_loss, Q_grad, pi_grad, td_error = self.sess.run([ 228 | self.Q_loss_tf, 229 | self.main.Q_pi_tf, 230 | self.Q_grad_tf, 231 | self.pi_grad_tf, 232 | self.td_error_tf 233 | ]) 234 | return critic_loss, actor_loss, Q_grad, pi_grad, td_error 235 | 236 | def _update(self, Q_grad, pi_grad): 237 | self.Q_adam.update(Q_grad, self.Q_lr) 238 | self.pi_adam.update(pi_grad, self.pi_lr) 239 | 240 | def sample_batch(self, t): 241 | 242 | if self.prioritization == 'energy': 243 | transitions = self.buffer.sample(self.batch_size, self.rank_method, temperature=self.temperature) 244 | weights = np.ones_like(transitions['r']).copy() 245 | elif self.prioritization == 'tderror': 246 | transitions, weights, idxs = self.buffer.sample(self.batch_size, beta=self.beta_schedule.value(t)) 247 | else: 248 | transitions = self.buffer.sample(self.batch_size) 249 | weights = np.ones_like(transitions['r']).copy() 250 | 251 | o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] 252 | ag, ag_2 = transitions['ag'], transitions['ag_2'] 253 | transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) 254 | transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) 255 | 256 | transitions['w'] = weights.flatten().copy() # note: ordered dict 257 | transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] 258 | 259 | if self.prioritization == 'tderror': 260 | return (transitions_batch, idxs) 261 | else: 262 | return transitions_batch 263 | 264 | def stage_batch(self, t, batch=None): # 265 | if batch is None: 266 | if self.prioritization == 'tderror': 267 | batch, idxs = self.sample_batch(t) 268 | else: 269 | batch = self.sample_batch(t) 270 | assert len(self.buffer_ph_tf) == len(batch) 271 | self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) 272 | 273 | if self.prioritization == 'tderror': 274 | return idxs 275 | 276 | 
def train(self, t, dump_buffer, stage=True): 277 | if not self.buffer.current_size==0: 278 | if stage: 279 | if self.prioritization == 'tderror': 280 | idxs = self.stage_batch(t) 281 | else: 282 | self.stage_batch(t) 283 | critic_loss, actor_loss, Q_grad, pi_grad, td_error = self._grads() 284 | if self.prioritization == 'tderror': 285 | new_priorities = np.abs(td_error) + self.eps # td_error 286 | 287 | if dump_buffer: 288 | T = self.buffer.buffers['u'].shape[1] 289 | episode_idxs = idxs // T 290 | t_samples = idxs % T 291 | batch_size = td_error.shape[0] 292 | with self.buffer.lock: 293 | for i in range(batch_size): 294 | self.buffer.buffers['td'][episode_idxs[i]][t_samples[i]] = td_error[i] 295 | 296 | self.buffer.update_priorities(idxs, new_priorities) 297 | self._update(Q_grad, pi_grad) 298 | return critic_loss, actor_loss 299 | 300 | def _init_target_net(self): 301 | self.sess.run(self.init_target_net_op) 302 | 303 | def update_target_net(self): 304 | self.sess.run(self.update_target_net_op) 305 | 306 | def clear_buffer(self): 307 | self.buffer.clear_buffer() 308 | 309 | def _vars(self, scope): 310 | res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) 311 | assert len(res) > 0 312 | return res 313 | 314 | def _global_vars(self, scope): 315 | res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) 316 | return res 317 | 318 | def _create_network(self, reuse=False): 319 | logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) 320 | 321 | self.sess = tf.get_default_session() 322 | if self.sess is None: 323 | self.sess = tf.InteractiveSession() 324 | 325 | # running averages 326 | with tf.variable_scope('o_stats') as vs: 327 | if reuse: 328 | vs.reuse_variables() 329 | self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) 330 | with tf.variable_scope('g_stats') as vs: 331 | if reuse: 332 | vs.reuse_variables() 333 | self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) 334 | 335 | # mini-batch sampling. 336 | batch = self.staging_tf.get() 337 | batch_tf = OrderedDict([(key, batch[i]) 338 | for i, key in enumerate(self.stage_shapes.keys())]) 339 | batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) 340 | batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) 341 | 342 | # networks 343 | with tf.variable_scope('main') as vs: 344 | if reuse: 345 | vs.reuse_variables() 346 | self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) 347 | vs.reuse_variables() 348 | with tf.variable_scope('target') as vs: 349 | if reuse: 350 | vs.reuse_variables() 351 | target_batch_tf = batch_tf.copy() 352 | target_batch_tf['o'] = batch_tf['o_2'] 353 | target_batch_tf['g'] = batch_tf['g_2'] 354 | self.target = self.create_actor_critic( 355 | target_batch_tf, net_type='target', **self.__dict__) 356 | vs.reuse_variables() 357 | assert len(self._vars("main")) == len(self._vars("target")) 358 | 359 | # loss functions 360 | target_Q_pi_tf = self.target.Q_pi_tf 361 | clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) 362 | target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) 363 | 364 | self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf 365 | self.errors_tf = tf.square(self.td_error_tf) 366 | self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf) 367 | self.Q_loss_tf = tf.reduce_mean(self.errors_tf) 368 | 369 | self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) 370 | self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) 371 | Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) 372 | pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) 373 | assert len(self._vars('main/Q')) == len(Q_grads_tf) 374 | assert len(self._vars('main/pi')) == len(pi_grads_tf) 375 | self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) 376 | self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) 377 | self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) 378 | self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) 379 | 380 | # optimizers 381 | self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) 382 | self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) 383 | 384 | # polyak averaging 385 | self.main_vars = self._vars('main/Q') + self._vars('main/pi') 386 | self.target_vars = self._vars('target/Q') + self._vars('target/pi') 387 | self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') 388 | self.init_target_net_op = list( 389 | map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) 390 | self.update_target_net_op = list( 391 | map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) 392 | 393 | # initialize all variables 394 | tf.variables_initializer(self._global_vars('')).run() 395 | self._sync_optimizers() 396 | self._init_target_net() 397 | 398 | def logs(self, prefix=''): 399 | logs = [] 400 | logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] 401 | logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] 402 | logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] 403 | logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] 404 | 405 | if prefix is not '' and not prefix.endswith('/'): 406 | return [(prefix + '/' + key, val) for key, val in logs] 407 | else: 408 | return logs 409 | 410 | def __getstate__(self): 411 | """Our policies can be loaded from pkl, but after unpickling you cannot continue training. 412 | """ 413 | excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 414 | 'main', 'target', 'lock', 'env', 'sample_transitions', 415 | 'stage_shapes', 'create_actor_critic'] 416 | 417 | state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} 418 | state['buffer_size'] = self.buffer_size 419 | state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) 420 | return state 421 | 422 | def __setstate__(self, state): 423 | if 'sample_transitions' not in state: 424 | # We don't need this for playing the policy. 
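# (sample_transitions and env_name are only needed for training; a policy restored
# from a pickle can still act through get_actions, as done in experiment/play.py.)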
425 | state['sample_transitions'] = None 426 | state['env_name'] = None # No need for playing the policy 427 | 428 | self.__init__(**state) 429 | # set up stats (they are overwritten in __init__) 430 | for k, v in state.items(): 431 | if k[-6:] == '_stats': 432 | self.__dict__[k] = v 433 | # load TF variables 434 | vars = [x for x in self._global_vars('') if 'buffer' not in x.name] 435 | assert(len(vars) == len(state["tf"])) 436 | node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] 437 | self.sess.run(node) 438 | -------------------------------------------------------------------------------- /baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruizhaogit/EnergyBasedPrioritization/2fd2f5bab0547848f4f76b837d16238435518dcc/baselines/her/experiment/__init__.py -------------------------------------------------------------------------------- /baselines/her/experiment/config.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import json 4 | import os 5 | import gym 6 | 7 | from baselines import logger 8 | from baselines.her.ddpg import DDPG 9 | from baselines.her.her import make_sample_her_transitions, \ 10 | make_sample_her_transitions_energy, \ 11 | make_sample_her_transitions_prioritized_replay 12 | 13 | 14 | DEFAULT_ENV_PARAMS = { 15 | 'FetchReach-v0': { 16 | 'n_cycles': 10, 17 | }, 18 | } 19 | 20 | 21 | DEFAULT_PARAMS = { 22 | # env 23 | 'max_u': 1., # max absolute value of actions on different coordinates 24 | # ddpg 25 | 'layers': 3, # number of layers in the critic/actor networks 26 | 'hidden': 256, # number of neurons in each hidden layers 27 | 'network_class': 'baselines.her.actor_critic:ActorCritic', 28 | 'Q_lr': 0.001, # critic learning rate 29 | 'pi_lr': 0.001, # actor learning rate 30 | 'buffer_size': int(1E6), # int(1E6) int(1E6) bug for experience replay 31 | 'polyak': 0.95, # polyak averaging coefficient 32 | 'action_l2': 1.0, # quadratic penalty on actions (before rescaling by max_u) 33 | 'clip_obs': 200., 34 | 'scope': 'ddpg', # can be tweaked for testing 35 | 'relative_goals': False, 36 | # training 37 | 'n_cycles': 50, # per epoch 38 | 'rollout_batch_size': 2, # per mpi thread 39 | 'n_batches': 40, # training batches per cycle 40 | 'batch_size': 256, # per mpi thread, measured in transitions and reduced to even multiple of chunk_length. 
41 | 'n_test_rollouts': 10, # number of test rollouts per epoch, each consists of rollout_batch_size rollouts 42 | 'test_with_polyak': False, # run test episodes with the target network 43 | # exploration 44 | 'random_eps': 0.3, # percentage of time a random action is taken 45 | 'noise_eps': 0.2, # std of gaussian noise added to not-completely-random actions as a percentage of max_u 46 | # HER 47 | 'replay_strategy': 'future', # supported modes: future, none 48 | 'replay_k': 4, # number of additional goals used for replay, only used if replay_strategy=future 49 | # normalization 50 | 'norm_eps': 0.01, # epsilon used for observation normalization 51 | 'norm_clip': 5, # normalized observations are clipped to this value 52 | 53 | # prioritized_replay (tderror) 54 | 'alpha': 0.6, # 0.6 55 | 'beta0': 0.4, # 0.4 56 | 'beta_iters': None, # None 57 | 'eps': 1e-6, 58 | 59 | # energy-based prioritization 60 | 'w_potential': 1.0, 61 | 'w_linear': 1.0, 62 | 'w_rotational': 1.0, 63 | } 64 | 65 | 66 | CACHED_ENVS = {} 67 | def cached_make_env(make_env): 68 | """ 69 | Only creates a new environment from the provided function if one has not yet already been 70 | created. This is useful here because we need to infer certain properties of the env, e.g. 71 | its observation and action spaces, without any intent of actually using it. 72 | """ 73 | if make_env not in CACHED_ENVS: 74 | env = make_env() 75 | CACHED_ENVS[make_env] = env 76 | return CACHED_ENVS[make_env] 77 | 78 | 79 | def prepare_params(kwargs): 80 | # DDPG params 81 | ddpg_params = dict() 82 | 83 | env_name = kwargs['env_name'] 84 | def make_env(): 85 | return gym.make(env_name) 86 | kwargs['make_env'] = make_env 87 | tmp_env = cached_make_env(kwargs['make_env']) 88 | assert hasattr(tmp_env, '_max_episode_steps') 89 | kwargs['T'] = tmp_env._max_episode_steps 90 | tmp_env.reset() 91 | kwargs['max_u'] = np.array(kwargs['max_u']) if type(kwargs['max_u']) == list else kwargs['max_u'] 92 | kwargs['gamma'] = 1. - 1. / kwargs['T'] 93 | if 'lr' in kwargs: 94 | kwargs['pi_lr'] = kwargs['lr'] 95 | kwargs['Q_lr'] = kwargs['lr'] 96 | del kwargs['lr'] 97 | for name in ['buffer_size', 'hidden', 'layers', 98 | 'network_class', 99 | 'polyak', 100 | 'batch_size', 'Q_lr', 'pi_lr', 101 | 'norm_eps', 'norm_clip', 'max_u', 102 | 'action_l2', 'clip_obs', 'scope', 'relative_goals', 103 | 'alpha', 'beta0', 'beta_iters', 'eps']: 104 | ddpg_params[name] = kwargs[name] 105 | kwargs['_' + name] = kwargs[name] 106 | del kwargs[name] 107 | kwargs['ddpg_params'] = ddpg_params 108 | 109 | return kwargs 110 | 111 | 112 | def log_params(params, logger=logger): 113 | for key in sorted(params.keys()): 114 | logger.info('{}: {}'.format(key, params[key])) 115 | 116 | 117 | def configure_her(params): 118 | env = cached_make_env(params['make_env']) 119 | env.reset() 120 | def reward_fun(ag_2, g, info): # vectorized 121 | return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info) 122 | 123 | # Prepare configuration for HER.
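# replay_k sets the HER ratio: with the default replay_k = 4 the samplers built below
# use future_p = 1 - 1/(1 + 4) = 0.8, i.e. roughly 80% of replayed transitions have
# their goal replaced by a future achieved goal from the same episode.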
124 | her_params = { 125 | 'reward_fun': reward_fun, 126 | } 127 | for name in ['replay_strategy', 'replay_k']: 128 | her_params[name] = params[name] 129 | params['_' + name] = her_params[name] 130 | del params[name] 131 | 132 | if params['prioritization'] == 'energy': 133 | sample_her_transitions = make_sample_her_transitions_energy(**her_params) 134 | elif params['prioritization'] == 'tderror': 135 | sample_her_transitions = make_sample_her_transitions_prioritized_replay(**her_params) 136 | else: 137 | sample_her_transitions = make_sample_her_transitions(**her_params) 138 | 139 | return sample_her_transitions 140 | 141 | 142 | def simple_goal_subtract(a, b): 143 | assert a.shape == b.shape 144 | return a - b 145 | 146 | 147 | def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): 148 | sample_her_transitions = configure_her(params) 149 | # Extract relevant parameters. 150 | gamma = params['gamma'] 151 | rollout_batch_size = params['rollout_batch_size'] 152 | ddpg_params = params['ddpg_params'] 153 | temperature = params['temperature'] 154 | prioritization = params['prioritization'] 155 | env_name = params['env_name'] 156 | max_timesteps = params['max_timesteps'] 157 | rank_method = params['rank_method'] 158 | 159 | input_dims = dims.copy() 160 | 161 | # DDPG agent 162 | env = cached_make_env(params['make_env']) 163 | env.reset() 164 | ddpg_params.update({'input_dims': input_dims, # agent takes an input observations 165 | 'T': params['T'], 166 | 'clip_pos_returns': True, # clip positive returns 167 | 'clip_return': (1. / (1. - gamma)) if clip_return else np.inf, # max abs of return 168 | 'rollout_batch_size': rollout_batch_size, 169 | 'subtract_goals': simple_goal_subtract, 170 | 'sample_transitions': sample_her_transitions, 171 | 'gamma': gamma, 172 | 'temperature': temperature, 173 | 'prioritization': prioritization, 174 | 'env_name': env_name, 175 | 'max_timesteps': max_timesteps, 176 | 'rank_method': rank_method, 177 | }) 178 | ddpg_params['info'] = { 179 | 'env_name': params['env_name'], 180 | } 181 | policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi) 182 | return policy 183 | 184 | 185 | def configure_dims(params): 186 | env = cached_make_env(params['make_env']) 187 | env.reset() 188 | obs, _, _, info = env.step(env.action_space.sample()) 189 | 190 | dims = { 191 | 'o': obs['observation'].shape[0], 192 | 'u': env.action_space.shape[0], 193 | 'g': obs['desired_goal'].shape[0], 194 | } 195 | for key, value in info.items(): 196 | value = np.array(value) 197 | if value.ndim == 0: 198 | value = value.reshape(1) 199 | dims['info_{}'.format(key)] = value.shape[0] 200 | return dims 201 | -------------------------------------------------------------------------------- /baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | import click 2 | import numpy as np 3 | import pickle 4 | 5 | from baselines import logger 6 | from baselines.common import set_global_seeds 7 | import baselines.her.experiment.config as config 8 | from baselines.her.rollout import RolloutWorker 9 | 10 | 11 | @click.command() 12 | @click.argument('policy_file', type=str) 13 | @click.option('--seed', type=int, default=0) 14 | @click.option('--n_test_rollouts', type=int, default=20) 15 | @click.option('--render', type=int, default=1) 16 | def main(policy_file, seed, n_test_rollouts, render): 17 | set_global_seeds(seed) 18 | 19 | # Load policy. 
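# The pickle is written by RolloutWorker.save_policy; DDPG.__setstate__ rebuilds the
# TF graph and restores the saved variables, so the loaded policy can act but cannot
# resume training (see ddpg.py).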
20 | with open(policy_file, 'rb') as f: 21 | policy = pickle.load(f) 22 | env_name = policy.info['env_name'] 23 | 24 | # Prepare params. 25 | params = config.DEFAULT_PARAMS 26 | if env_name in config.DEFAULT_ENV_PARAMS: 27 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 28 | params['env_name'] = env_name 29 | params = config.prepare_params(params) 30 | config.log_params(params, logger=logger) 31 | 32 | dims = config.configure_dims(params) 33 | 34 | eval_params = { 35 | 'exploit': True, 36 | 'use_target_net': params['test_with_polyak'], 37 | 'compute_Q': True, 38 | 'rollout_batch_size': 1, 39 | 'render': bool(render), 40 | } 41 | 42 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']: 43 | eval_params[name] = params[name] 44 | 45 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 46 | evaluator.seed(seed) 47 | 48 | # Run evaluation. 49 | evaluator.clear_history() 50 | for _ in range(n_test_rollouts): 51 | evaluator.generate_rollouts() 52 | 53 | # record logs 54 | for key, val in evaluator.logs('test'): 55 | logger.record_tabular(key, np.mean(val)) 56 | logger.dump_tabular() 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /baselines/her/experiment/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import click 5 | import numpy as np 6 | import json 7 | from mpi4py import MPI 8 | 9 | from baselines import logger 10 | from baselines.common import set_global_seeds 11 | from baselines.common.mpi_moments import mpi_moments 12 | import baselines.her.experiment.config as config 13 | from baselines.her.rollout import RolloutWorker 14 | from baselines.her.util import mpi_fork 15 | 16 | import os.path as osp 17 | import tempfile 18 | import datetime 19 | 20 | 21 | def mpi_average(value): 22 | if value == []: 23 | value = [0.] 
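# mpi_moments averages across all MPI workers; indexing [0] below keeps only the mean.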
24 | if not isinstance(value, list): 25 | value = [value] 26 | return mpi_moments(np.array(value))[0] 27 | 28 | 29 | def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, 30 | policy_save_interval, save_policies, num_cpu, dump_buffer, w_potential, w_linear, 31 | w_rotational, rank_method, clip_energy, **kwargs): 32 | rank = MPI.COMM_WORLD.Get_rank() 33 | 34 | latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') 35 | best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl') 36 | periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl') 37 | 38 | logger.info("Training...") 39 | best_success_rate = -1 40 | t = 1 41 | for epoch in range(n_epochs): 42 | # train 43 | rollout_worker.clear_history() 44 | for cycle in range(n_cycles): 45 | episode = rollout_worker.generate_rollouts() 46 | policy.store_episode(episode, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy) 47 | for batch in range(n_batches): 48 | t = ((epoch*n_cycles*n_batches)+(cycle*n_batches)+batch)*num_cpu 49 | policy.train(t, dump_buffer) 50 | 51 | policy.update_target_net() 52 | 53 | # test 54 | evaluator.clear_history() 55 | for _ in range(n_test_rollouts): 56 | evaluator.generate_rollouts() 57 | 58 | # record logs 59 | logger.record_tabular('epoch', epoch) 60 | for key, val in evaluator.logs('test'): 61 | logger.record_tabular(key, mpi_average(val)) 62 | for key, val in rollout_worker.logs('train'): 63 | logger.record_tabular(key, mpi_average(val)) 64 | for key, val in policy.logs(): 65 | logger.record_tabular(key, mpi_average(val)) 66 | 67 | if rank == 0: 68 | logger.dump_tabular() 69 | 70 | if dump_buffer: 71 | policy.dump_buffer(epoch) 72 | 73 | # save the policy if it's better than the previous ones 74 | success_rate = mpi_average(evaluator.current_success_rate()) 75 | if rank == 0 and success_rate >= best_success_rate and save_policies: 76 | best_success_rate = success_rate 77 | logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) 78 | evaluator.save_policy(best_policy_path) 79 | evaluator.save_policy(latest_policy_path) 80 | if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: 81 | policy_path = periodic_policy_path.format(epoch) 82 | logger.info('Saving periodic policy to {} ...'.format(policy_path)) 83 | evaluator.save_policy(policy_path) 84 | 85 | # make sure that different threads have different seeds 86 | local_uniform = np.random.uniform(size=(1,)) 87 | root_uniform = local_uniform.copy() 88 | MPI.COMM_WORLD.Bcast(root_uniform, root=0) 89 | if rank != 0: 90 | assert local_uniform[0] != root_uniform[0] 91 | 92 | 93 | def launch( 94 | env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, 95 | temperature, prioritization, binding, logging, version, dump_buffer, n_cycles, rank_method, 96 | w_potential, w_linear, w_rotational, clip_energy, override_params={}, save_policies=True): 97 | 98 | # Fork for multi-CPU MPI implementation. 
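# mpi_fork (baselines/her/util.py) re-launches this script with num_cpu MPI workers;
# the parent process exits immediately, and each worker continues below with its own
# MPI rank and a rank-dependent random seed (set further down).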
99 | if num_cpu > 1: 100 | whoami = mpi_fork(num_cpu, binding) 101 | if whoami == 'parent': 102 | sys.exit(0) 103 | import baselines.common.tf_util as U 104 | U.single_threaded_session().__enter__() 105 | rank = MPI.COMM_WORLD.Get_rank() 106 | 107 | # Configure logging 108 | 109 | if logging: 110 | logdir = 'logs/'+str(env_name)+'-temperature'+str(temperature)+\ 111 | '-prioritization'+str(prioritization)+'-replay_strategy'+str(replay_strategy)+\ 112 | '-n_epochs'+str(n_epochs)+'-num_cpu'+str(num_cpu)+'-seed'+str(seed)+\ 113 | '-n_cycles'+str(n_cycles)+'-rank_method'+str(rank_method)+\ 114 | '-w_potential'+str(w_potential)+'-w_linear'+str(w_linear)+'-w_rotational'+str(w_rotational)+\ 115 | '-clip_energy'+str(clip_energy)+\ 116 | '-version'+str(version) 117 | else: 118 | logdir = osp.join(tempfile.gettempdir(), 119 | datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) 120 | 121 | if rank == 0: 122 | if logdir or logger.get_dir() is None: 123 | logger.configure(dir=logdir) 124 | else: 125 | logger.configure() 126 | logdir = logger.get_dir() 127 | assert logdir is not None 128 | os.makedirs(logdir, exist_ok=True) 129 | 130 | # Seed everything. 131 | rank_seed = seed + 1000000 * rank 132 | set_global_seeds(rank_seed) 133 | 134 | # Prepare params. 135 | params = config.DEFAULT_PARAMS 136 | params['env_name'] = env_name 137 | params['replay_strategy'] = replay_strategy 138 | params['temperature'] = temperature 139 | params['prioritization'] = prioritization 140 | params['binding'] = binding 141 | params['max_timesteps'] = n_epochs * params['n_cycles'] * params['n_batches'] * num_cpu 142 | params['version'] = version 143 | params['dump_buffer'] = dump_buffer 144 | params['n_cycles'] = n_cycles 145 | params['rank_method'] = rank_method 146 | params['w_potential'] = w_potential 147 | params['w_linear'] = w_linear 148 | params['w_rotational'] = w_rotational 149 | params['clip_energy'] = clip_energy 150 | params['n_epochs'] = n_epochs 151 | params['num_cpu'] = num_cpu 152 | 153 | if params['dump_buffer']: 154 | params['alpha'] =0 155 | 156 | if env_name in config.DEFAULT_ENV_PARAMS: 157 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 158 | params.update(**override_params) # makes it possible to override any parameter 159 | with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: 160 | json.dump(params, f) 161 | params = config.prepare_params(params) 162 | config.log_params(params, logger=logger) 163 | 164 | dims = config.configure_dims(params) 165 | policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) 166 | 167 | rollout_params = { 168 | 'exploit': False, 169 | 'use_target_net': False, 170 | 'use_demo_states': True, 171 | 'compute_Q': False, 172 | 'T': params['T'], 173 | } 174 | 175 | eval_params = { 176 | 'exploit': True, 177 | 'use_target_net': params['test_with_polyak'], 178 | 'use_demo_states': False, 179 | 'compute_Q': True, 180 | 'T': params['T'], 181 | } 182 | 183 | for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: 184 | rollout_params[name] = params[name] 185 | eval_params[name] = params[name] 186 | 187 | rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) 188 | rollout_worker.seed(rank_seed) 189 | 190 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 191 | evaluator.seed(rank_seed) 192 | 193 | train( 194 | logdir=logdir, policy=policy, rollout_worker=rollout_worker, 195 | evaluator=evaluator, 
n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], 196 | n_cycles=params['n_cycles'], n_batches=params['n_batches'], 197 | policy_save_interval=policy_save_interval, save_policies=save_policies, 198 | num_cpu=num_cpu, dump_buffer=dump_buffer, w_potential=params['w_potential'], 199 | w_linear=params['w_linear'], w_rotational=params['w_rotational'], rank_method=rank_method, 200 | clip_energy=clip_energy) 201 | 202 | 203 | @click.command() 204 | @click.option('--env_name', type=click.Choice(['FetchPickAndPlace-v0', 'HandManipulateBlockFull-v0', \ 205 | 'HandManipulateEggFull-v0', 'HandManipulatePenRotate-v0']), default='FetchPickAndPlace-v0', help='the name of the OpenAI Gym \ 206 | environment that you want to train on. We tested EBP on four challenging robotic manipulation tasks, including: \ 207 | FetchPickAndPlace-v0, HandManipulateBlockFull-v0, HandManipulateEggFull-v0, HandManipulatePenRotate-v0') 208 | @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run') 209 | @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)') 210 | @click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code') 211 | @click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.') 212 | @click.option('--replay_strategy', type=click.Choice(['future', 'final', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') 213 | @click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped') 214 | @click.option('--temperature', type=float, default=1.0, help='temperature value for Energy-Based Prioritization (EBP)') 215 | @click.option('--prioritization', type=click.Choice(['none', 'energy', 'tderror']), default='energy', help='the prioritization strategy to be used. 
"energy" uses EBP;\ 216 | "none" is vanilla HER; tderror is Prioritized Experience Replay.') 217 | @click.option('--binding', type=click.Choice(['none', 'core']), default='core', help='configure mpi using bind-to none or core.') 218 | @click.option('--logging', type=bool, default=False, help='whether or not logging') 219 | @click.option('--version', type=int, default=0, help='version') 220 | @click.option('--dump_buffer', type=bool, default=False, help='dump buffer contains achieved goals, energy, tderrors for analysis') 221 | @click.option('--n_cycles', type=int, default=50, help='n_cycles') 222 | @click.option('--rank_method', type=click.Choice(['none', 'min', 'dense', 'average']), default='none', help='energy ranking method') 223 | @click.option('--w_potential', type=float, default=1.0, help='w_potential') 224 | @click.option('--w_linear', type=float, default=1.0, help='w_linear') 225 | @click.option('--w_rotational', type=float, default=1.0, help='w_rotational') 226 | @click.option('--clip_energy', type=float, default=999, help='clip_energy') 227 | 228 | def main(**kwargs): 229 | launch(**kwargs) 230 | 231 | if __name__ == '__main__': 232 | main() 233 | -------------------------------------------------------------------------------- /baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from scipy.stats import rankdata 4 | 5 | import random 6 | 7 | 8 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 9 | """Creates a sample function that can be used for HER experience replay. 10 | 11 | Args: 12 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', 13 | regular DDPG experience replay is used 14 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 15 | as many HER replays as regular replays are used) 16 | reward_fun (function): function to re-compute the reward with substituted goals 17 | """ 18 | if (replay_strategy == 'future') or (replay_strategy == 'final'): 19 | future_p = 1 - (1. / (1 + replay_k)) 20 | else: # 'replay_strategy' == 'none' 21 | future_p = 0 22 | 23 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 24 | """episode_batch is {key: array(buffer_size x T x dim_key)} 25 | """ 26 | T = episode_batch['u'].shape[1] 27 | rollout_batch_size = episode_batch['u'].shape[0] 28 | batch_size = batch_size_in_transitions 29 | 30 | # Select which episodes and time steps to use. 31 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 32 | t_samples = np.random.randint(T, size=batch_size) 33 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 34 | for key in episode_batch.keys()} 35 | 36 | # Select future time indexes proportional with probability future_p. These 37 | # will be used for HER replay by substituting in future goals. 38 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 39 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 40 | future_offset = future_offset.astype(int) 41 | future_t = (t_samples + 1 + future_offset)[her_indexes] 42 | 43 | if replay_strategy == 'final': 44 | future_t[:] = T 45 | 46 | # Replace goal with achieved goal but only for the previously-selected 47 | # HER transitions (as defined by her_indexes). For the other transitions, 48 | # keep the original goal. 
49 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 50 | transitions['g'][her_indexes] = future_ag 51 | 52 | # Reconstruct info dictionary for reward computation. 53 | info = {} 54 | for key, value in transitions.items(): 55 | if key.startswith('info_'): 56 | info[key.replace('info_', '')] = value 57 | 58 | # Re-compute reward since we may have substituted the goal. 59 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 60 | reward_params['info'] = info 61 | transitions['r'] = reward_fun(**reward_params) 62 | 63 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 64 | for k in transitions.keys()} 65 | 66 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 67 | 68 | return transitions 69 | 70 | return _sample_her_transitions 71 | 72 | 73 | def make_sample_her_transitions_energy(replay_strategy, replay_k, reward_fun): 74 | 75 | if (replay_strategy == 'future') or (replay_strategy == 'final'): 76 | future_p = 1 - (1. / (1 + replay_k)) 77 | else: 78 | future_p = 0 79 | 80 | 81 | def _sample_her_transitions(episode_batch, batch_size_in_transitions, rank_method, temperature, update_stats=False): 82 | 83 | T = episode_batch['u'].shape[1] 84 | rollout_batch_size = episode_batch['u'].shape[0] 85 | batch_size = batch_size_in_transitions 86 | 87 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 88 | t_samples = np.random.randint(T, size=batch_size) 89 | 90 | if not update_stats: 91 | if rank_method == 'none': 92 | energy_trajectory = episode_batch['e'] 93 | else: 94 | energy_trajectory = episode_batch['p'] 95 | p_trajectory = np.power(energy_trajectory, 1/(temperature+1e-2)) 96 | p_trajectory = p_trajectory / p_trajectory.sum() 97 | episode_idxs_energy = np.random.choice(rollout_batch_size, size=batch_size, replace=True, p=p_trajectory.flatten()) 98 | episode_idxs = episode_idxs_energy 99 | 100 | 101 | transitions = {} 102 | for key in episode_batch.keys(): 103 | if not key =='p' and not key == 's' and not key == 'e': 104 | transitions[key] = episode_batch[key][episode_idxs, t_samples].copy() 105 | 106 | # Select future time indexes proportional with probability future_p. These 107 | # will be used for HER replay by substituting in future goals. 108 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 109 | 110 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 111 | future_offset = future_offset.astype(int) 112 | future_t = (t_samples + 1 + future_offset)[her_indexes] 113 | 114 | if replay_strategy == 'final': 115 | future_t[:] = T 116 | 117 | # Replace goal with achieved goal but only for the previously-selected 118 | # HER transitions (as defined by her_indexes). For the other transitions, 119 | # keep the original goal. 120 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 121 | 122 | transitions['g'][her_indexes] = future_ag 123 | 124 | # Reconstruct info dictionary for reward computation. 125 | info = {} 126 | for key, value in transitions.items(): 127 | if key.startswith('info_'): 128 | info[key.replace('info_', '')] = value 129 | 130 | # Re-compute reward since we may have substituted the goal. 
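# reward_fun is the environment's compute_reward (wired up in experiment/config.py); for
# the Gym robotics tasks used here it is sparse by default (0 when the goal is reached,
# -1 otherwise), which is why it has to be recomputed for the substituted goals.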
131 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 132 | reward_params['info'] = info 133 | 134 | transitions['r'] = reward_fun(**reward_params) 135 | 136 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 137 | for k in transitions.keys()} 138 | 139 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 140 | 141 | return transitions 142 | 143 | return _sample_her_transitions 144 | 145 | 146 | def make_sample_her_transitions_prioritized_replay(replay_strategy, replay_k, reward_fun): 147 | 148 | if (replay_strategy == 'future') or (replay_strategy == 'final'): 149 | future_p = 1 - (1. / (1 + replay_k)) 150 | else: 151 | future_p = 0 152 | 153 | def _sample_proportional(self, rollout_batch_size, batch_size, T): 154 | episode_idxs = [] 155 | t_samples = [] 156 | for _ in range(batch_size): 157 | self.n_transitions_stored = min(self.n_transitions_stored, self.size_in_transitions) 158 | mass = random.random() * self._it_sum.sum(0, self.n_transitions_stored - 1) 159 | idx = self._it_sum.find_prefixsum_idx(mass) 160 | assert idx < self.n_transitions_stored 161 | episode_idx = idx//T 162 | assert episode_idx < rollout_batch_size 163 | t_sample = idx%T 164 | episode_idxs.append(episode_idx) 165 | t_samples.append(t_sample) 166 | 167 | return (episode_idxs, t_samples) 168 | 169 | def _sample_her_transitions(self, episode_batch, batch_size_in_transitions, beta): 170 | """episode_batch is {key: array(buffer_size x T x dim_key)} 171 | """ 172 | 173 | T = episode_batch['u'].shape[1] 174 | rollout_batch_size = episode_batch['u'].shape[0] 175 | batch_size = batch_size_in_transitions 176 | 177 | if rollout_batch_size < self.current_size: 178 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 179 | t_samples = np.random.randint(T, size=batch_size) 180 | else: 181 | assert beta >= 0 182 | episode_idxs, t_samples = _sample_proportional(self, rollout_batch_size, batch_size, T) 183 | episode_idxs = np.array(episode_idxs) 184 | t_samples = np.array(t_samples) 185 | 186 | weights = [] 187 | p_min = self._it_min.min() / self._it_sum.sum() 188 | max_weight = (p_min * self.n_transitions_stored) ** (-beta) 189 | 190 | for episode_idx, t_sample in zip(episode_idxs, t_samples): 191 | p_sample = self._it_sum[episode_idx*T+t_sample] / self._it_sum.sum() 192 | weight = (p_sample * self.n_transitions_stored) ** (-beta) 193 | weights.append(weight / max_weight) 194 | 195 | weights = np.array(weights) 196 | 197 | transitions = {} 198 | for key in episode_batch.keys(): 199 | if not key == "td" and not key == "e": 200 | episode_batch_key = episode_batch[key].copy() 201 | transitions[key] = episode_batch_key[episode_idxs, t_samples].copy() 202 | 203 | # Select future time indexes proportional with probability future_p. These 204 | # will be used for HER replay by substituting in future goals. 205 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 206 | 207 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 208 | future_offset = future_offset.astype(int) 209 | future_t = (t_samples + 1 + future_offset)[her_indexes] 210 | 211 | if replay_strategy == 'final': 212 | future_t[:] = T 213 | 214 | # Replace goal with achieved goal but only for the previously-selected 215 | # HER transitions (as defined by her_indexes). For the other transitions, 216 | # keep the original goal. 217 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 218 | 219 | # Reconstruct info dictionary for reward computation. 
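# (The importance weights computed above follow standard prioritized experience replay:
# w_i = (N * P(i))**(-beta), normalized by the largest weight, so beta = 0 disables the
# correction and beta = 1 fully compensates for the non-uniform sampling.)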
220 | info = {} 221 | for key, value in transitions.items(): 222 | if key.startswith('info_'): 223 | info[key.replace('info_', '')] = value 224 | 225 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 226 | reward_params['info'] = info 227 | 228 | transitions['g'][her_indexes] = future_ag 229 | 230 | # Re-compute reward since we may have substituted the goal. 231 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 232 | reward_params['info'] = info 233 | 234 | transitions['r'] = reward_fun(**reward_params) 235 | 236 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 237 | for k in transitions.keys()} 238 | 239 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 240 | 241 | idxs = episode_idxs * T + t_samples 242 | 243 | return (transitions, weights, idxs) 244 | 245 | return _sample_her_transitions 246 | -------------------------------------------------------------------------------- /baselines/her/normalizer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | from mpi4py import MPI 5 | import tensorflow as tf 6 | 7 | from baselines.her.util import reshape_for_broadcasting 8 | 9 | 10 | class Normalizer: 11 | def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): 12 | """A normalizer that ensures that observations are approximately distributed according to 13 | a standard Normal distribution (i.e. have mean zero and variance one). 14 | 15 | Args: 16 | size (int): the size of the observation to be normalized 17 | eps (float): a small constant that avoids underflows 18 | default_clip_range (float): normalized observations are clipped to be in 19 | [-default_clip_range, default_clip_range] 20 | sess (object): the TensorFlow session to be used 21 | """ 22 | self.size = size 23 | self.eps = eps 24 | self.default_clip_range = default_clip_range 25 | self.sess = sess if sess is not None else tf.get_default_session() 26 | 27 | self.local_sum = np.zeros(self.size, np.float32) 28 | self.local_sumsq = np.zeros(self.size, np.float32) 29 | self.local_count = np.zeros(1, np.float32) 30 | 31 | self.sum_tf = tf.get_variable( 32 | initializer=tf.zeros_initializer(), shape=self.local_sum.shape, name='sum', 33 | trainable=False, dtype=tf.float32) 34 | self.sumsq_tf = tf.get_variable( 35 | initializer=tf.zeros_initializer(), shape=self.local_sumsq.shape, name='sumsq', 36 | trainable=False, dtype=tf.float32) 37 | self.count_tf = tf.get_variable( 38 | initializer=tf.ones_initializer(), shape=self.local_count.shape, name='count', 39 | trainable=False, dtype=tf.float32) 40 | self.mean = tf.get_variable( 41 | initializer=tf.zeros_initializer(), shape=(self.size,), name='mean', 42 | trainable=False, dtype=tf.float32) 43 | self.std = tf.get_variable( 44 | initializer=tf.ones_initializer(), shape=(self.size,), name='std', 45 | trainable=False, dtype=tf.float32) 46 | self.count_pl = tf.placeholder(name='count_pl', shape=(1,), dtype=tf.float32) 47 | self.sum_pl = tf.placeholder(name='sum_pl', shape=(self.size,), dtype=tf.float32) 48 | self.sumsq_pl = tf.placeholder(name='sumsq_pl', shape=(self.size,), dtype=tf.float32) 49 | 50 | self.update_op = tf.group( 51 | self.count_tf.assign_add(self.count_pl), 52 | self.sum_tf.assign_add(self.sum_pl), 53 | self.sumsq_tf.assign_add(self.sumsq_pl) 54 | ) 55 | self.recompute_op = tf.group( 56 | tf.assign(self.mean, self.sum_tf / self.count_tf), 57 | tf.assign(self.std, tf.sqrt(tf.maximum( 58 | tf.square(self.eps), 59 | 
self.sumsq_tf / self.count_tf - tf.square(self.sum_tf / self.count_tf) 60 | ))), 61 | ) 62 | self.lock = threading.Lock() 63 | 64 | def update(self, v): 65 | v = v.reshape(-1, self.size) 66 | 67 | with self.lock: 68 | self.local_sum += v.sum(axis=0) 69 | self.local_sumsq += (np.square(v)).sum(axis=0) 70 | self.local_count[0] += v.shape[0] 71 | 72 | def normalize(self, v, clip_range=None): 73 | if clip_range is None: 74 | clip_range = self.default_clip_range 75 | mean = reshape_for_broadcasting(self.mean, v) 76 | std = reshape_for_broadcasting(self.std, v) 77 | return tf.clip_by_value((v - mean) / std, -clip_range, clip_range) 78 | 79 | def denormalize(self, v): 80 | mean = reshape_for_broadcasting(self.mean, v) 81 | std = reshape_for_broadcasting(self.std, v) 82 | return mean + v * std 83 | 84 | def _mpi_average(self, x): 85 | buf = np.zeros_like(x) 86 | MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 87 | buf /= MPI.COMM_WORLD.Get_size() 88 | return buf 89 | 90 | def synchronize(self, local_sum, local_sumsq, local_count, root=None): 91 | local_sum[...] = self._mpi_average(local_sum) 92 | local_sumsq[...] = self._mpi_average(local_sumsq) 93 | local_count[...] = self._mpi_average(local_count) 94 | return local_sum, local_sumsq, local_count 95 | 96 | def recompute_stats(self): 97 | with self.lock: 98 | # Copy over results. 99 | local_count = self.local_count.copy() 100 | local_sum = self.local_sum.copy() 101 | local_sumsq = self.local_sumsq.copy() 102 | 103 | # Reset. 104 | self.local_count[...] = 0 105 | self.local_sum[...] = 0 106 | self.local_sumsq[...] = 0 107 | 108 | # We perform the synchronization outside of the lock to keep the critical section as short 109 | # as possible. 110 | synced_sum, synced_sumsq, synced_count = self.synchronize( 111 | local_sum=local_sum, local_sumsq=local_sumsq, local_count=local_count) 112 | 113 | self.sess.run(self.update_op, feed_dict={ 114 | self.count_pl: synced_count, 115 | self.sum_pl: synced_sum, 116 | self.sumsq_pl: synced_sumsq, 117 | }) 118 | self.sess.run(self.recompute_op) 119 | 120 | 121 | class IdentityNormalizer: 122 | def __init__(self, size, std=1.): 123 | self.size = size 124 | self.mean = tf.zeros(self.size, tf.float32) 125 | self.std = std * tf.ones(self.size, tf.float32) 126 | 127 | def update(self, x): 128 | pass 129 | 130 | def normalize(self, x, clip_range=None): 131 | return x / self.std 132 | 133 | def denormalize(self, x): 134 | return self.std * x 135 | 136 | def synchronize(self): 137 | pass 138 | 139 | def recompute_stats(self): 140 | pass 141 | -------------------------------------------------------------------------------- /baselines/her/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 6 | 7 | import math 8 | 9 | from scipy.stats import rankdata 10 | 11 | import json 12 | 13 | def quaternion_to_euler_angle(array): 14 | w = array[0] 15 | x = array[1] 16 | y = array[2] 17 | z = array[3] 18 | ysqr = y * y 19 | t0 = +2.0 * (w * x + y * z) 20 | t1 = +1.0 - 2.0 * (x * x + ysqr) 21 | X = math.atan2(t0, t1) 22 | t2 = +2.0 * (w * y - z * x) 23 | t2 = +1.0 if t2 > +1.0 else t2 24 | t2 = -1.0 if t2 < -1.0 else t2 25 | Y = math.asin(t2) 26 | t3 = +2.0 * (w * z + x * y) 27 | t4 = +1.0 - 2.0 * (ysqr + z * z) 28 | Z = math.atan2(t3, t4) 29 | result = np.array([X, Y, Z]) 30 | return result 31 | 32 | class ReplayBuffer: 33 | def __init__(self, buffer_shapes, 
size_in_transitions, T, sample_transitions): 34 | """Creates a replay buffer. 35 | 36 | Args: 37 | buffer_shapes (dict of ints): the shape for all buffers that are used in the replay 38 | buffer 39 | size_in_transitions (int): the size of the buffer, measured in transitions 40 | T (int): the time horizon for episodes 41 | sample_transitions (function): a function that samples from the replay buffer 42 | """ 43 | self.buffer_shapes = buffer_shapes 44 | self.size = size_in_transitions // T 45 | self.T = T 46 | self.sample_transitions = sample_transitions 47 | 48 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 49 | self.buffers = {key: np.empty([self.size, *shape]) 50 | for key, shape in buffer_shapes.items()} 51 | 52 | # memory management 53 | self.current_size = 0 54 | self.n_transitions_stored = 0 55 | 56 | self.lock = threading.Lock() 57 | 58 | @property 59 | def full(self): 60 | with self.lock: 61 | return self.current_size == self.size 62 | 63 | def sample(self, batch_size): 64 | """Returns a dict {key: array(batch_size x shapes[key])} 65 | """ 66 | buffers = {} 67 | 68 | with self.lock: 69 | assert self.current_size > 0 70 | for key in self.buffers.keys(): 71 | buffers[key] = self.buffers[key][:self.current_size] 72 | 73 | buffers['o_2'] = buffers['o'][:, 1:, :] 74 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 75 | 76 | transitions = self.sample_transitions(buffers, batch_size) 77 | 78 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 79 | assert key in transitions, "key %s missing from transitions" % key 80 | 81 | return transitions 82 | 83 | def store_episode(self, episode_batch): 84 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 85 | """ 86 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 87 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 88 | batch_size = batch_sizes[0] 89 | 90 | with self.lock: 91 | idxs = self._get_storage_idx(batch_size) 92 | 93 | # load inputs into buffers 94 | for key in self.buffers.keys(): 95 | self.buffers[key][idxs] = episode_batch[key] 96 | 97 | self.n_transitions_stored += batch_size * self.T 98 | 99 | def get_current_episode_size(self): 100 | with self.lock: 101 | return self.current_size 102 | 103 | def get_current_size(self): 104 | with self.lock: 105 | return self.current_size * self.T 106 | 107 | def get_transitions_stored(self): 108 | with self.lock: 109 | return self.n_transitions_stored 110 | 111 | def clear_buffer(self): 112 | with self.lock: 113 | self.current_size = 0 114 | 115 | def _get_storage_idx(self, inc=None): 116 | inc = inc or 1 # size increment 117 | assert inc <= self.size, "Batch committed to replay is too large!" 118 | # go consecutively until you hit the end, and then go randomly. 119 | if self.current_size+inc <= self.size: 120 | idx = np.arange(self.current_size, self.current_size+inc) 121 | elif self.current_size < self.size: 122 | overflow = inc - (self.size - self.current_size) 123 | idx_a = np.arange(self.current_size, self.size) 124 | idx_b = np.random.randint(0, self.current_size, overflow) 125 | idx = np.concatenate([idx_a, idx_b]) 126 | else: 127 | idx = np.random.randint(0, self.size, inc) 128 | 129 | # update replay size 130 | self.current_size = min(self.size, self.current_size+inc) 131 | 132 | if inc == 1: 133 | idx = idx[0] 134 | return idx 135 | 136 | 137 | class ReplayBufferEnergy: 138 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions, prioritization, env_name): 139 | """Creates a replay buffer. 
140 | 141 | Args: 142 | buffer_shapes (dict of ints): the shape for all buffers that are used in the replay 143 | buffer 144 | size_in_transitions (int): the size of the buffer, measured in transitions 145 | T (int): the time horizon for episodes 146 | sample_transitions (function): a function that samples from the replay buffer 147 | """ 148 | self.buffer_shapes = buffer_shapes 149 | self.size = size_in_transitions // T 150 | self.T = T 151 | self.sample_transitions = sample_transitions 152 | 153 | self.buffers = {key: np.empty([self.size, *shape]) 154 | for key, shape in buffer_shapes.items()} 155 | self.buffers['e'] = np.empty([self.size, 1]) # energy 156 | self.buffers['p'] = np.empty([self.size, 1]) # priority/ranking 157 | 158 | self.prioritization = prioritization 159 | self.env_name = env_name 160 | 161 | # memory management 162 | self.current_size = 0 163 | self.n_transitions_stored = 0 164 | 165 | self.current_size_test = 0 166 | self.n_transitions_stored_test = 0 167 | 168 | self.lock = threading.Lock() 169 | 170 | @property 171 | def full(self): 172 | with self.lock: 173 | return self.current_size == self.size 174 | 175 | def sample(self, batch_size, rank_method, temperature): 176 | """Returns a dict {key: array(batch_size x shapes[key])} 177 | """ 178 | buffers = {} 179 | 180 | with self.lock: 181 | assert self.current_size > 0 182 | for key in self.buffers.keys(): 183 | buffers[key] = self.buffers[key][:self.current_size] 184 | 185 | buffers['o_2'] = buffers['o'][:, 1:, :] 186 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 187 | 188 | transitions = self.sample_transitions(buffers, batch_size, rank_method, temperature) 189 | 190 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 191 | if not key == 'p' and not key == 'e': 192 | assert key in transitions, "key %s missing from transitions" % key 193 | 194 | return transitions 195 | 196 | def store_episode(self, episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy): 197 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 198 | """ 199 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 200 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 201 | batch_size = batch_sizes[0] 202 | 203 | buffers = {} 204 | for key in episode_batch.keys(): 205 | buffers[key] = episode_batch[key] 206 | 207 | if self.prioritization == 'energy': 208 | if self.env_name in ['FetchPickAndPlace-v0', 'FetchSlide-v0', 'FetchPush-v0']: 209 | height = buffers['ag'][:, :, 2] 210 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 211 | height = height[:,1::] - height_0 212 | g, m, delta_t = 9.81, 1, 0.04 213 | potential_energy = g*m*height 214 | diff = np.diff(buffers['ag'], axis=1) 215 | velocity = diff / delta_t 216 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 217 | kinetic_energy = np.sum(kinetic_energy, axis=2) 218 | energy_totoal = w_potential*potential_energy + w_linear*kinetic_energy 219 | energy_diff = np.diff(energy_totoal, axis=1) 220 | energy_transition = energy_totoal.copy() 221 | energy_transition[:,1::] = energy_diff.copy() 222 | energy_transition = np.clip(energy_transition, 0, clip_energy) 223 | energy_transition_total = np.sum(energy_transition, axis=1) 224 | episode_batch['e'] = energy_transition_total.reshape(-1,1) 225 | elif self.env_name in ['HandManipulatePenRotate-v0', \ 226 | 'HandManipulateEggFull-v0', \ 227 | 'HandManipulateBlockFull-v0', \ 228 | 'HandManipulateBlockRotateXYZ-v0']: 229 | g, m, delta_t, inertia = 9.81, 1, 0.04, 1 
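# Hand environments: the achieved goal is 7-dimensional (3D position + 4D quaternion).
# The quaternion part is converted to Euler angles and finite-differenced to get an
# angular velocity (omega = dtheta / delta_t), giving a rotational term
# E_rot = 0.5 * inertia * omega**2 per axis. This is combined below with the potential
# (m*g*dh) and linear kinetic (0.5*m*v**2) terms via the w_* weights, and the
# per-transition energy increments are clipped to [0, clip_energy] before being summed
# into one trajectory energy. m, inertia and delta_t are nominal constants.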
230 | quaternion = buffers['ag'][:,:,3:].copy() 231 | angle = np.apply_along_axis(quaternion_to_euler_angle, 2, quaternion) 232 | diff_angle = np.diff(angle, axis=1) 233 | angular_velocity = diff_angle / delta_t 234 | rotational_energy = 0.5 * inertia * np.power(angular_velocity, 2) 235 | rotational_energy = np.sum(rotational_energy, axis=2) 236 | buffers['ag'] = buffers['ag'][:,:,:3] 237 | height = buffers['ag'][:, :, 2] 238 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 239 | height = height[:,1::] - height_0 240 | potential_energy = g*m*height 241 | diff = np.diff(buffers['ag'], axis=1) 242 | velocity = diff / delta_t 243 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 244 | kinetic_energy = np.sum(kinetic_energy, axis=2) 245 | energy_totoal = w_potential*potential_energy + w_linear*kinetic_energy + w_rotational*rotational_energy 246 | energy_diff = np.diff(energy_totoal, axis=1) 247 | energy_transition = energy_totoal.copy() 248 | energy_transition[:,1::] = energy_diff.copy() 249 | energy_transition = np.clip(energy_transition, 0, clip_energy) 250 | energy_transition_total = np.sum(energy_transition, axis=1) 251 | episode_batch['e'] = energy_transition_total.reshape(-1,1) 252 | else: 253 | print('Trajectory Energy Function Not Implemented') 254 | exit() 255 | 256 | with self.lock: 257 | idxs = self._get_storage_idx(batch_size) 258 | 259 | # load inputs into buffers 260 | for key in self.buffers.keys(): 261 | if not key == 'p': 262 | self.buffers[key][idxs] = episode_batch[key] 263 | 264 | self.n_transitions_stored += batch_size * self.T 265 | 266 | energy_transition_total = self.buffers['e'][:self.current_size] 267 | if rank_method == 'none': 268 | rank_method = 'dense' 269 | energy_rank = rankdata(energy_transition_total, method=rank_method) 270 | energy_rank = energy_rank - 1 271 | energy_rank = energy_rank.reshape(-1, 1) 272 | self.buffers['p'][:self.current_size] = energy_rank.copy() 273 | 274 | def get_current_episode_size(self): 275 | with self.lock: 276 | return self.current_size 277 | 278 | def get_current_size(self): 279 | with self.lock: 280 | return self.current_size * self.T 281 | 282 | def get_transitions_stored(self): 283 | with self.lock: 284 | return self.n_transitions_stored 285 | 286 | def clear_buffer(self): 287 | with self.lock: 288 | self.current_size = 0 289 | 290 | def _get_storage_idx(self, inc=None): 291 | inc = inc or 1 # size increment 292 | assert inc <= self.size, "Batch committed to replay is too large!" 293 | # go consecutively until you hit the end, and then go randomly. 294 | if self.current_size+inc <= self.size: 295 | idx = np.arange(self.current_size, self.current_size+inc) 296 | elif self.current_size < self.size: 297 | overflow = inc - (self.size - self.current_size) 298 | idx_a = np.arange(self.current_size, self.size) 299 | idx_b = np.random.randint(0, self.current_size, overflow) 300 | idx = np.concatenate([idx_a, idx_b]) 301 | else: 302 | idx = np.random.randint(0, self.size, inc) 303 | 304 | # update replay size 305 | self.current_size = min(self.size, self.current_size+inc) 306 | 307 | if inc == 1: 308 | idx = idx[0] 309 | return idx 310 | 311 | 312 | class PrioritizedReplayBuffer(ReplayBuffer): 313 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions, alpha, env_name): 314 | """Create Prioritized Replay buffer. 
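Args (in addition to ReplayBuffer):
    alpha (float): how much prioritization is used
        (0 - uniform sampling; larger values - stronger prioritization of high-TD-error transitions)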
315 | """ 316 | super(PrioritizedReplayBuffer, self).__init__(buffer_shapes, size_in_transitions, T, sample_transitions) 317 | assert alpha >= 0 318 | self._alpha = alpha 319 | 320 | it_capacity = 1 321 | self.size_in_transitions = size_in_transitions 322 | while it_capacity < size_in_transitions: 323 | it_capacity *= 2 324 | 325 | self._it_sum = SumSegmentTree(it_capacity) 326 | self._it_min = MinSegmentTree(it_capacity) 327 | self._max_priority = 1.0 328 | 329 | self.T = T 330 | self.buffers['td'] = np.zeros([self.size, self.T]) # accumulated td-error 331 | self.buffers['e'] = np.zeros([self.size, self.T]) # trajectory energy 332 | self.env_name = env_name 333 | 334 | def store_episode(self, episode_batch, dump_buffer): 335 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 336 | """ 337 | 338 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 339 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 340 | batch_size = batch_sizes[0] 341 | 342 | if dump_buffer: 343 | 344 | buffers = {} 345 | for key in episode_batch.keys(): 346 | buffers[key] = episode_batch[key] 347 | 348 | if self.env_name in ['FetchPickAndPlace-v0', 'FetchSlide-v0', 'FetchPush-v0']: 349 | height = buffers['ag'][:, :, 2] 350 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 351 | height = height[:,1::] - height_0 352 | g, m, delta_t = 9.81, 1, 0.04 353 | potential_energy = g*m*height 354 | diff = np.diff(buffers['ag'], axis=1) 355 | velocity = diff / delta_t 356 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 357 | kinetic_energy = np.sum(kinetic_energy, axis=2) 358 | energy_totoal = potential_energy + kinetic_energy 359 | energy_diff = np.diff(energy_totoal, axis=1) 360 | energy_transition = energy_totoal.copy() 361 | energy_transition[:,1::] = energy_diff.copy() 362 | episode_batch['e'] = energy_transition 363 | elif self.env_name in ['HandManipulatePenRotate-v0', \ 364 | 'HandManipulateEggFull-v0', \ 365 | 'HandManipulateBlockFull-v0', \ 366 | 'HandManipulateBlockRotateXYZ-v0']: 367 | g, m, delta_t, inertia = 9.81, 1, 0.04, 1 368 | quaternion = buffers['ag'][:,:,3:].copy() 369 | angle = np.apply_along_axis(quaternion_to_euler_angle, 2, quaternion) 370 | diff_angle = np.diff(angle, axis=1) 371 | angular_velocity = diff_angle / delta_t 372 | rotational_energy = 0.5 * inertia * np.power(angular_velocity, 2) 373 | rotational_energy = np.sum(rotational_energy, axis=2) 374 | buffers['ag'] = buffers['ag'][:,:,:3] 375 | height = buffers['ag'][:, :, 2] 376 | height_0 = np.repeat(height[:,0].reshape(-1,1), height[:,1::].shape[1], axis=1) 377 | height = height[:,1::] - height_0 378 | potential_energy = g*m*height 379 | diff = np.diff(buffers['ag'], axis=1) 380 | velocity = diff / delta_t 381 | kinetic_energy = 0.5 * m * np.power(velocity, 2) 382 | kinetic_energy = np.sum(kinetic_energy, axis=2) 383 | energy_totoal = potential_energy + kinetic_energy + rotational_energy 384 | energy_diff = np.diff(energy_totoal, axis=1) 385 | energy_transition = energy_totoal.copy() 386 | energy_transition[:,1::] = energy_diff.copy() 387 | episode_batch['e'] = energy_transition 388 | 389 | 390 | with self.lock: 391 | idxs = self._get_storage_idx(batch_size) 392 | 393 | # load inputs into buffers 394 | for key in self.buffers.keys(): 395 | if not key == 'td': 396 | if dump_buffer: 397 | self.buffers[key][idxs] = episode_batch[key] 398 | else: 399 | if not key == 'e': 400 | self.buffers[key][idxs] = episode_batch[key] 401 | 402 | self.n_transitions_stored += batch_size * 
self.T 403 | 404 | for idx in idxs: 405 | episode_idx = idx 406 | for t in range(episode_idx*self.T, (episode_idx+1)*self.T): 407 | assert (episode_idx+1)*self.T-1 < min(self.n_transitions_stored, self.size_in_transitions) 408 | self._it_sum[t] = self._max_priority ** self._alpha 409 | self._it_min[t] = self._max_priority ** self._alpha 410 | 411 | def dump_buffer(self, epoch): 412 | for i in range(self.current_size): 413 | entry = {"e": self.buffers['e'][i].tolist(), \ 414 | "td": self.buffers['td'][i].tolist(), \ 415 | "ag": self.buffers['ag'][i].tolist() } 416 | with open('buffer_epoch_{0}.txt'.format(epoch), 'a') as file: 417 | file.write(json.dumps(entry)) # use `json.loads` to do the reverse 418 | file.write("\n") 419 | 420 | print("dump buffer") 421 | 422 | 423 | def sample(self, batch_size, beta): 424 | """Returns a dict {key: array(batch_size x shapes[key])} 425 | """ 426 | 427 | """Sample a batch of experiences. 428 | 429 | compared to ReplayBuffer.sample 430 | it also returns importance weights and idxes 431 | of sampled experiences. 432 | 433 | Parameters 434 | ---------- 435 | batch_size: int 436 | How many transitions to sample. 437 | beta: float 438 | To what degree to use importance weights 439 | (0 - no corrections, 1 - full correction) 440 | """ 441 | buffers = {} 442 | 443 | with self.lock: 444 | assert self.current_size > 0 445 | for key in self.buffers.keys(): 446 | buffers[key] = self.buffers[key][:self.current_size] 447 | 448 | buffers['o_2'] = buffers['o'][:, 1:, :] 449 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 450 | 451 | transitions, weights, idxs = self.sample_transitions(self, buffers, batch_size, beta) 452 | 453 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 454 | if not key == 'td' and not key == 'e': 455 | assert key in transitions, "key %s missing from transitions" % key 456 | 457 | return (transitions, weights, idxs) 458 | 459 | 460 | def update_priorities(self, idxes, priorities): 461 | """Update priorities of sampled transitions. 462 | 463 | sets priority of transition at index idxes[i] in buffer 464 | to priorities[i]. 465 | 466 | Parameters 467 | ---------- 468 | idxes: [int] 469 | List of idxes of sampled transitions 470 | priorities: [float] 471 | List of updated priorities corresponding to 472 | transitions at the sampled idxes denoted by 473 | variable `idxes`. 474 | """ 475 | assert len(idxes) == len(priorities) 476 | for idx, priority in zip(idxes, priorities.flatten()): 477 | assert priority > 0 478 | assert 0 <= idx < self.n_transitions_stored 479 | self._it_sum[idx] = priority ** self._alpha 480 | self._it_min[idx] = priority ** self._alpha 481 | 482 | self._max_priority = max(self._max_priority, priority) 483 | -------------------------------------------------------------------------------- /baselines/her/rollout.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | import pickle 5 | from mujoco_py import MujocoException 6 | 7 | from baselines.her.util import convert_episode_to_batch_major, store_args 8 | 9 | class RolloutWorker: 10 | 11 | @store_args 12 | def __init__(self, make_env, policy, dims, logger, T, rollout_batch_size=1, 13 | exploit=False, use_target_net=False, compute_Q=False, noise_eps=0, 14 | random_eps=0, history_len=100, render=False, **kwargs): 15 | """Rollout worker generates experience by interacting with one or many environments. 
16 | 17 | Args: 18 | make_env (function): a factory function that creates a new instance of the environment 19 | when called 20 | policy (object): the policy that is used to act 21 | dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u) 22 | logger (object): the logger that is used by the rollout worker 23 | rollout_batch_size (int): the number of parallel rollouts that should be used 24 | exploit (boolean): whether or not to exploit, i.e. to act optimally according to the 25 | current policy without any exploration 26 | use_target_net (boolean): whether or not to use the target net for rollouts 27 | compute_Q (boolean): whether or not to compute the Q values alongside the actions 28 | noise_eps (float): scale of the additive Gaussian noise 29 | random_eps (float): probability of selecting a completely random action 30 | history_len (int): length of history for statistics smoothing 31 | render (boolean): whether or not to render the rollouts 32 | """ 33 | self.envs = [make_env() for _ in range(rollout_batch_size)] 34 | assert self.T > 0 35 | 36 | self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')] 37 | 38 | self.success_history = deque(maxlen=history_len) 39 | self.Q_history = deque(maxlen=history_len) 40 | 41 | self.n_episodes = 0 42 | self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals 43 | self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations 44 | self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals 45 | self.reset_all_rollouts() 46 | self.clear_history() 47 | 48 | def reset_rollout(self, i): 49 | """Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` 50 | and `g` arrays accordingly. 51 | """ 52 | obs = self.envs[i].reset() 53 | self.initial_o[i] = obs['observation'] 54 | self.initial_ag[i] = obs['achieved_goal'] 55 | self.g[i] = obs['desired_goal'] 56 | 57 | def reset_all_rollouts(self): 58 | """Resets all `rollout_batch_size` rollout workers. 59 | """ 60 | for i in range(self.rollout_batch_size): 61 | self.reset_rollout(i) 62 | 63 | def generate_rollouts(self): 64 | """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current 65 | policy acting on it accordingly. 66 | """ 67 | self.reset_all_rollouts() 68 | 69 | # compute observations 70 | o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations 71 | ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals 72 | o[:] = self.initial_o 73 | ag[:] = self.initial_ag 74 | 75 | # generate episodes 76 | obs, achieved_goals, acts, goals, successes = [], [], [], [], [] 77 | info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] 78 | Qs = [] 79 | for t in range(self.T): 80 | policy_output = self.policy.get_actions( 81 | o, ag, self.g, 82 | compute_Q=self.compute_Q, 83 | noise_eps=self.noise_eps if not self.exploit else 0., 84 | random_eps=self.random_eps if not self.exploit else 0., 85 | use_target_net=self.use_target_net) 86 | 87 | if self.compute_Q: 88 | u, Q = policy_output 89 | Qs.append(Q) 90 | else: 91 | u = policy_output 92 | 93 | if u.ndim == 1: 94 | # The non-batched case should still have a reasonable shape. 
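# (a single action vector is promoted to a batch of shape (1, dim_u) so the per-environment loop below can index it)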
95 | u = u.reshape(1, -1) 96 | 97 | o_new = np.empty((self.rollout_batch_size, self.dims['o'])) 98 | ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) 99 | success = np.zeros(self.rollout_batch_size) 100 | # compute new states and observations 101 | for i in range(self.rollout_batch_size): 102 | try: 103 | # We fully ignore the reward here because it will have to be re-computed 104 | # for HER. 105 | curr_o_new, _, _, info = self.envs[i].step(u[i]) 106 | if 'is_success' in info: 107 | success[i] = info['is_success'] 108 | o_new[i] = curr_o_new['observation'] 109 | ag_new[i] = curr_o_new['achieved_goal'] 110 | for idx, key in enumerate(self.info_keys): 111 | info_values[idx][t, i] = info[key] 112 | if self.render: 113 | self.envs[i].render() 114 | except MujocoException as e: 115 | return self.generate_rollouts() 116 | 117 | if np.isnan(o_new).any(): 118 | self.logger.warning('NaN caught during rollout generation. Trying again...') 119 | self.reset_all_rollouts() 120 | return self.generate_rollouts() 121 | 122 | obs.append(o.copy()) 123 | achieved_goals.append(ag.copy()) 124 | successes.append(success.copy()) 125 | acts.append(u.copy()) 126 | goals.append(self.g.copy()) 127 | o[...] = o_new 128 | ag[...] = ag_new 129 | obs.append(o.copy()) 130 | achieved_goals.append(ag.copy()) 131 | self.initial_o[:] = o 132 | 133 | successful = np.array(successes)[-1, :].copy() 134 | 135 | episode = dict(o=obs, 136 | u=acts, 137 | g=goals, 138 | ag=achieved_goals,) 139 | for key, value in zip(self.info_keys, info_values): 140 | episode['info_{}'.format(key)] = value 141 | 142 | # stats 143 | assert successful.shape == (self.rollout_batch_size,) 144 | success_rate = np.mean(successful) 145 | self.success_history.append(success_rate) 146 | if self.compute_Q: 147 | self.Q_history.append(np.mean(Qs)) 148 | self.n_episodes += self.rollout_batch_size 149 | 150 | return convert_episode_to_batch_major(episode) 151 | 152 | def clear_history(self): 153 | """Clears all histories that are used for statistics 154 | """ 155 | self.success_history.clear() 156 | self.Q_history.clear() 157 | 158 | def current_success_rate(self): 159 | return np.mean(self.success_history) 160 | 161 | def current_mean_Q(self): 162 | return np.mean(self.Q_history) 163 | 164 | def save_policy(self, path): 165 | """Pickles the current policy for later inspection. 166 | """ 167 | with open(path, 'wb') as f: 168 | pickle.dump(self.policy, f) 169 | 170 | def logs(self, prefix='worker'): 171 | """Generates a dictionary that contains all collected statistics. 172 | """ 173 | logs = [] 174 | logs += [('success_rate', np.mean(self.success_history))] 175 | if self.compute_Q: 176 | logs += [('mean_Q', np.mean(self.Q_history))] 177 | logs += [('episode', self.n_episodes)] 178 | 179 | if prefix is not '' and not prefix.endswith('/'): 180 | return [(prefix + '/' + key, val) for key, val in logs] 181 | else: 182 | return logs 183 | 184 | def seed(self, seed): 185 | """Seeds each environment with a distinct seed derived from the passed in global seed. 
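Environment i is seeded with seed + 1000 * i, so parallel rollouts do not share random number streams.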
186 | """ 187 | for idx, env in enumerate(self.envs): 188 | env.seed(seed + 1000 * idx) 189 | -------------------------------------------------------------------------------- /baselines/her/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import importlib 5 | import inspect 6 | import functools 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from baselines.common import tf_util as U 12 | import platform 13 | 14 | 15 | def store_args(method): 16 | """Stores provided method args as instance attributes. 17 | """ 18 | argspec = inspect.getfullargspec(method) 19 | defaults = {} 20 | if argspec.defaults is not None: 21 | defaults = dict( 22 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 23 | if argspec.kwonlydefaults is not None: 24 | defaults.update(argspec.kwonlydefaults) 25 | arg_names = argspec.args[1:] 26 | 27 | @functools.wraps(method) 28 | def wrapper(*positional_args, **keyword_args): 29 | self = positional_args[0] 30 | # Get default arg values 31 | args = defaults.copy() 32 | # Add provided arg values 33 | for name, value in zip(arg_names, positional_args[1:]): 34 | args[name] = value 35 | args.update(keyword_args) 36 | self.__dict__.update(args) 37 | return method(*positional_args, **keyword_args) 38 | 39 | return wrapper 40 | 41 | 42 | def import_function(spec): 43 | """Import a function identified by a string like "pkg.module:fn_name". 44 | """ 45 | mod_name, fn_name = spec.split(':') 46 | module = importlib.import_module(mod_name) 47 | fn = getattr(module, fn_name) 48 | return fn 49 | 50 | 51 | def flatten_grads(var_list, grads): 52 | """Flattens a variables and their gradients. 53 | """ 54 | return tf.concat([tf.reshape(grad, [U.numel(v)]) 55 | for (v, grad) in zip(var_list, grads)], 0) 56 | 57 | 58 | def nn(input, layers_sizes, reuse=None, flatten=False, name=""): 59 | """Creates a simple neural network 60 | """ 61 | for i, size in enumerate(layers_sizes): 62 | activation = tf.nn.relu if i < len(layers_sizes)-1 else None 63 | input = tf.layers.dense(inputs=input, 64 | units=size, 65 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 66 | reuse=reuse, 67 | name=name+'_'+str(i)) 68 | if activation: 69 | input = activation(input) 70 | if flatten: 71 | assert layers_sizes[-1] == 1 72 | input = tf.reshape(input, [-1]) 73 | return input 74 | 75 | 76 | def install_mpi_excepthook(): 77 | import sys 78 | from mpi4py import MPI 79 | old_hook = sys.excepthook 80 | 81 | def new_hook(a, b, c): 82 | old_hook(a, b, c) 83 | sys.stdout.flush() 84 | sys.stderr.flush() 85 | MPI.COMM_WORLD.Abort() 86 | sys.excepthook = new_hook 87 | 88 | 89 | def mpi_fork(n, binding="core"): 90 | """Re-launches the current script with workers 91 | Returns "parent" for original parent, "child" for MPI children 92 | """ 93 | if n <= 1: 94 | return "child" 95 | if os.getenv("IN_MPI") is None: 96 | env = os.environ.copy() 97 | env.update( 98 | MKL_NUM_THREADS="1", 99 | OMP_NUM_THREADS="1", 100 | IN_MPI="1" 101 | ) 102 | # "-bind-to core" is crucial for good performance 103 | if platform.system() == 'Darwin': 104 | args = [ 105 | "mpirun", 106 | "-np", 107 | str(n), 108 | "-allow-run-as-root", 109 | sys.executable 110 | ] 111 | else: 112 | args = [ 113 | "mpirun", 114 | "-np", 115 | str(n), 116 | "-bind-to", 117 | binding, # core or none 118 | "-allow-run-as-root", 119 | sys.executable 120 | ] 121 | args += sys.argv 122 | subprocess.check_call(args, env=env) 123 | return "parent" 124 
| else: 125 | install_mpi_excepthook() 126 | return "child" 127 | 128 | 129 | def convert_episode_to_batch_major(episode): 130 | """Converts an episode to have the batch dimension in the major (first) 131 | dimension. 132 | """ 133 | episode_batch = {} 134 | for key in episode.keys(): 135 | val = np.array(episode[key]).copy() 136 | # make inputs batch-major instead of time-major 137 | episode_batch[key] = val.swapaxes(0, 1) 138 | 139 | return episode_batch 140 | 141 | 142 | def transitions_in_episode_batch(episode_batch): 143 | """Number of transitions in a given episode batch. 144 | """ 145 | shape = episode_batch['u'].shape 146 | return shape[0] * shape[1] 147 | 148 | 149 | def reshape_for_broadcasting(source, target): 150 | """Reshapes a tensor (source) to have the correct shape and dtype of the target 151 | before broadcasting it with MPI. 152 | """ 153 | dim = len(target.get_shape()) 154 | shape = ([1] * (dim-1)) + [-1] 155 | return tf.reshape(tf.cast(source, target.dtype), shape) 156 | -------------------------------------------------------------------------------- /baselines/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | from collections import defaultdict 10 | 11 | LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv'] 12 | LOG_OUTPUT_FORMATS_MPI = ['log'] 13 | # Also valid: json, tensorboard 14 | 15 | DEBUG = 10 16 | INFO = 20 17 | WARN = 30 18 | ERROR = 40 19 | 20 | DISABLED = 50 21 | 22 | class KVWriter(object): 23 | def writekvs(self, kvs): 24 | raise NotImplementedError 25 | 26 | class SeqWriter(object): 27 | def writeseq(self, seq): 28 | raise NotImplementedError 29 | 30 | class HumanOutputFormat(KVWriter, SeqWriter): 31 | def __init__(self, filename_or_file): 32 | if isinstance(filename_or_file, str): 33 | self.file = open(filename_or_file, 'wt') 34 | self.own_file = True 35 | else: 36 | assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file 37 | self.file = filename_or_file 38 | self.own_file = False 39 | 40 | def writekvs(self, kvs): 41 | # Create strings for printing 42 | key2str = {} 43 | for (key, val) in sorted(kvs.items()): 44 | if isinstance(val, float): 45 | valstr = '%-8.3g' % (val,) 46 | else: 47 | valstr = str(val) 48 | key2str[self._truncate(key)] = self._truncate(valstr) 49 | 50 | # Find max widths 51 | if len(key2str) == 0: 52 | print('WARNING: tried to write empty key-value dict') 53 | return 54 | else: 55 | keywidth = max(map(len, key2str.keys())) 56 | valwidth = max(map(len, key2str.values())) 57 | 58 | # Write out the data 59 | dashes = '-' * (keywidth + valwidth + 7) 60 | lines = [dashes] 61 | for (key, val) in sorted(key2str.items()): 62 | lines.append('| %s%s | %s%s |' % ( 63 | key, 64 | ' ' * (keywidth - len(key)), 65 | val, 66 | ' ' * (valwidth - len(val)), 67 | )) 68 | lines.append(dashes) 69 | self.file.write('\n'.join(lines) + '\n') 70 | 71 | # Flush the output to the file 72 | self.file.flush() 73 | 74 | def _truncate(self, s): 75 | return s[:20] + '...' 
if len(s) > 23 else s 76 | 77 | def writeseq(self, seq): 78 | for arg in seq: 79 | self.file.write(arg) 80 | self.file.write('\n') 81 | self.file.flush() 82 | 83 | def close(self): 84 | if self.own_file: 85 | self.file.close() 86 | 87 | class JSONOutputFormat(KVWriter): 88 | def __init__(self, filename): 89 | self.file = open(filename, 'wt') 90 | 91 | def writekvs(self, kvs): 92 | for k, v in sorted(kvs.items()): 93 | if hasattr(v, 'dtype'): 94 | v = v.tolist() 95 | kvs[k] = float(v) 96 | self.file.write(json.dumps(kvs) + '\n') 97 | self.file.flush() 98 | 99 | def close(self): 100 | self.file.close() 101 | 102 | class CSVOutputFormat(KVWriter): 103 | def __init__(self, filename): 104 | self.file = open(filename, 'w+t') 105 | self.keys = [] 106 | self.sep = ',' 107 | 108 | def writekvs(self, kvs): 109 | # Add our current row to the history 110 | extra_keys = kvs.keys() - self.keys 111 | if extra_keys: 112 | self.keys.extend(extra_keys) 113 | self.file.seek(0) 114 | lines = self.file.readlines() 115 | self.file.seek(0) 116 | for (i, k) in enumerate(self.keys): 117 | if i > 0: 118 | self.file.write(',') 119 | self.file.write(k) 120 | self.file.write('\n') 121 | for line in lines[1:]: 122 | self.file.write(line[:-1]) 123 | self.file.write(self.sep * len(extra_keys)) 124 | self.file.write('\n') 125 | for (i, k) in enumerate(self.keys): 126 | if i > 0: 127 | self.file.write(',') 128 | v = kvs.get(k) 129 | if v is not None: 130 | self.file.write(str(v)) 131 | self.file.write('\n') 132 | self.file.flush() 133 | 134 | def close(self): 135 | self.file.close() 136 | 137 | 138 | class TensorBoardOutputFormat(KVWriter): 139 | """ 140 | Dumps key/value pairs into TensorBoard's numeric format. 141 | """ 142 | def __init__(self, dir): 143 | os.makedirs(dir, exist_ok=True) 144 | self.dir = dir 145 | self.step = 1 146 | prefix = 'events' 147 | path = osp.join(osp.abspath(dir), prefix) 148 | import tensorflow as tf 149 | from tensorflow.python import pywrap_tensorflow 150 | from tensorflow.core.util import event_pb2 151 | from tensorflow.python.util import compat 152 | self.tf = tf 153 | self.event_pb2 = event_pb2 154 | self.pywrap_tensorflow = pywrap_tensorflow 155 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 156 | 157 | def writekvs(self, kvs): 158 | def summary_val(k, v): 159 | kwargs = {'tag': k, 'simple_value': float(v)} 160 | return self.tf.Summary.Value(**kwargs) 161 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 162 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 163 | event.step = self.step # is there any reason why you'd want to specify the step? 
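# TensorBoard orders scalar points by their step value, so a monotonically increasing counter is written with each event.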
164 | self.writer.WriteEvent(event) 165 | self.writer.Flush() 166 | self.step += 1 167 | 168 | def close(self): 169 | if self.writer: 170 | self.writer.Close() 171 | self.writer = None 172 | 173 | def make_output_format(format, ev_dir, log_suffix=''): 174 | os.makedirs(ev_dir, exist_ok=True) 175 | if format == 'stdout': 176 | return HumanOutputFormat(sys.stdout) 177 | elif format == 'log': 178 | return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) 179 | elif format == 'json': 180 | return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) 181 | elif format == 'csv': 182 | return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) 183 | elif format == 'tensorboard': 184 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) 185 | else: 186 | raise ValueError('Unknown format specified: %s' % (format,)) 187 | 188 | # ================================================================ 189 | # API 190 | # ================================================================ 191 | 192 | def logkv(key, val): 193 | """ 194 | Log a value of some diagnostic 195 | Call this once for each diagnostic quantity, each iteration 196 | If called many times, last value will be used. 197 | """ 198 | Logger.CURRENT.logkv(key, val) 199 | 200 | def logkv_mean(key, val): 201 | """ 202 | The same as logkv(), but if called many times, values averaged. 203 | """ 204 | Logger.CURRENT.logkv_mean(key, val) 205 | 206 | def logkvs(d): 207 | """ 208 | Log a dictionary of key-value pairs 209 | """ 210 | for (k, v) in d.items(): 211 | logkv(k, v) 212 | 213 | def dumpkvs(): 214 | """ 215 | Write all of the diagnostics from the current iteration 216 | 217 | level: int. (see logger.py docs) If the global logger level is higher than 218 | the level argument here, don't print to stdout. 219 | """ 220 | Logger.CURRENT.dumpkvs() 221 | 222 | def getkvs(): 223 | return Logger.CURRENT.name2val 224 | 225 | 226 | def log(*args, level=INFO): 227 | """ 228 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 229 | """ 230 | Logger.CURRENT.log(*args, level=level) 231 | 232 | def debug(*args): 233 | log(*args, level=DEBUG) 234 | 235 | def info(*args): 236 | log(*args, level=INFO) 237 | 238 | def warn(*args): 239 | log(*args, level=WARN) 240 | 241 | def error(*args): 242 | log(*args, level=ERROR) 243 | 244 | 245 | def set_level(level): 246 | """ 247 | Set logging threshold on current logger. 248 | """ 249 | Logger.CURRENT.set_level(level) 250 | 251 | def get_dir(): 252 | """ 253 | Get directory that log files are being written to. 
254 | will be None if there is no output directory (i.e., if you didn't call start) 255 | """ 256 | return Logger.CURRENT.get_dir() 257 | 258 | record_tabular = logkv 259 | dump_tabular = dumpkvs 260 | 261 | class ProfileKV: 262 | """ 263 | Usage: 264 | with logger.ProfileKV("interesting_scope"): 265 | code 266 | """ 267 | def __init__(self, n): 268 | self.n = "wait_" + n 269 | def __enter__(self): 270 | self.t1 = time.time() 271 | def __exit__(self ,type, value, traceback): 272 | Logger.CURRENT.name2val[self.n] += time.time() - self.t1 273 | 274 | def profile(n): 275 | """ 276 | Usage: 277 | @profile("my_func") 278 | def my_func(): code 279 | """ 280 | def decorator_with_name(func): 281 | def func_wrapper(*args, **kwargs): 282 | with ProfileKV(n): 283 | return func(*args, **kwargs) 284 | return func_wrapper 285 | return decorator_with_name 286 | 287 | 288 | # ================================================================ 289 | # Backend 290 | # ================================================================ 291 | 292 | class Logger(object): 293 | DEFAULT = None # A logger with no output files. (See right below class definition) 294 | # So that you can still log to the terminal without setting up any output files 295 | CURRENT = None # Current logger being used by the free functions above 296 | 297 | def __init__(self, dir, output_formats): 298 | self.name2val = defaultdict(float) # values this iteration 299 | self.name2cnt = defaultdict(int) 300 | self.level = INFO 301 | self.dir = dir 302 | self.output_formats = output_formats 303 | 304 | # Logging API, forwarded 305 | # ---------------------------------------- 306 | def logkv(self, key, val): 307 | self.name2val[key] = val 308 | 309 | def logkv_mean(self, key, val): 310 | if val is None: 311 | self.name2val[key] = None 312 | return 313 | oldval, cnt = self.name2val[key], self.name2cnt[key] 314 | self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) 315 | self.name2cnt[key] = cnt + 1 316 | 317 | def dumpkvs(self): 318 | if self.level == DISABLED: return 319 | for fmt in self.output_formats: 320 | if isinstance(fmt, KVWriter): 321 | fmt.writekvs(self.name2val) 322 | self.name2val.clear() 323 | self.name2cnt.clear() 324 | 325 | def log(self, *args, level=INFO): 326 | if self.level <= level: 327 | self._do_log(args) 328 | 329 | # Configuration 330 | # ---------------------------------------- 331 | def set_level(self, level): 332 | self.level = level 333 | 334 | def get_dir(self): 335 | return self.dir 336 | 337 | def close(self): 338 | for fmt in self.output_formats: 339 | fmt.close() 340 | 341 | # Misc 342 | # ---------------------------------------- 343 | def _do_log(self, args): 344 | for fmt in self.output_formats: 345 | if isinstance(fmt, SeqWriter): 346 | fmt.writeseq(map(str, args)) 347 | 348 | Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) 349 | 350 | def configure(dir=None, format_strs=None): 351 | if dir is None: 352 | dir = os.getenv('OPENAI_LOGDIR') 353 | if dir is None: 354 | dir = osp.join(tempfile.gettempdir(), 355 | datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) 356 | assert isinstance(dir, str) 357 | os.makedirs(dir, exist_ok=True) 358 | 359 | log_suffix = '' 360 | from mpi4py import MPI 361 | rank = MPI.COMM_WORLD.Get_rank() 362 | if rank > 0: 363 | log_suffix = "-rank%03i" % rank 364 | 365 | if format_strs is None: 366 | strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI') 367 | format_strs = strs_mpi if rank>0 else 
strs 368 | if format_strs is not None: 369 | format_strs = format_strs.split(',') 370 | else: 371 | format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS 372 | 373 | output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] 374 | 375 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) 376 | log('Logging to %s'%dir) 377 | 378 | def reset(): 379 | if Logger.CURRENT is not Logger.DEFAULT: 380 | Logger.CURRENT.close() 381 | Logger.CURRENT = Logger.DEFAULT 382 | log('Reset logger') 383 | 384 | class scoped_configure(object): 385 | def __init__(self, dir=None, format_strs=None): 386 | self.dir = dir 387 | self.format_strs = format_strs 388 | self.prevlogger = None 389 | def __enter__(self): 390 | self.prevlogger = Logger.CURRENT 391 | configure(dir=self.dir, format_strs=self.format_strs) 392 | def __exit__(self, *args): 393 | Logger.CURRENT.close() 394 | Logger.CURRENT = self.prevlogger 395 | 396 | # ================================================================ 397 | 398 | def _demo(): 399 | info("hi") 400 | debug("shouldn't appear") 401 | set_level(DEBUG) 402 | debug("should appear") 403 | dir = "/tmp/testlogging" 404 | if os.path.exists(dir): 405 | shutil.rmtree(dir) 406 | configure(dir=dir) 407 | logkv("a", 3) 408 | logkv("b", 2.5) 409 | dumpkvs() 410 | logkv("b", -2.5) 411 | logkv("a", 5.5) 412 | dumpkvs() 413 | info("^^^ should see a = 5.5") 414 | logkv_mean("b", -22.5) 415 | logkv_mean("b", -44.4) 416 | logkv("a", 5.5) 417 | dumpkvs() 418 | info("^^^ should see b = 33.3") 419 | 420 | logkv("b", -2.5) 421 | dumpkvs() 422 | 423 | logkv("a", "longasslongasslongasslongasslongasslongassvalue") 424 | dumpkvs() 425 | 426 | 427 | # ================================================================ 428 | # Readers 429 | # ================================================================ 430 | 431 | def read_json(fname): 432 | import pandas 433 | ds = [] 434 | with open(fname, 'rt') as fh: 435 | for line in fh: 436 | ds.append(json.loads(line)) 437 | return pandas.DataFrame(ds) 438 | 439 | def read_csv(fname): 440 | import pandas 441 | return pandas.read_csv(fname, index_col=None, comment='#') 442 | 443 | def read_tb(path): 444 | """ 445 | path : a tensorboard file OR a directory, where we will find all TB files 446 | of the form events.* 447 | """ 448 | import pandas 449 | import numpy as np 450 | from glob import glob 451 | from collections import defaultdict 452 | import tensorflow as tf 453 | if osp.isdir(path): 454 | fnames = glob(osp.join(path, "events.*")) 455 | elif osp.basename(path).startswith("events."): 456 | fnames = [path] 457 | else: 458 | raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s"%path) 459 | tag2pairs = defaultdict(list) 460 | maxstep = 0 461 | for fname in fnames: 462 | for summary in tf.train.summary_iterator(fname): 463 | if summary.step > 0: 464 | for v in summary.summary.value: 465 | pair = (summary.step, v.simple_value) 466 | tag2pairs[v.tag].append(pair) 467 | maxstep = max(summary.step, maxstep) 468 | data = np.empty((maxstep, len(tag2pairs))) 469 | data[:] = np.nan 470 | tags = sorted(tag2pairs.keys()) 471 | for (colidx,tag) in enumerate(tags): 472 | pairs = tag2pairs[tag] 473 | for (step, value) in pairs: 474 | data[step-1, colidx] = value 475 | return pandas.DataFrame(data, columns=tags) 476 | 477 | if __name__ == "__main__": 478 | _demo() 479 | -------------------------------------------------------------------------------- /baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.bench.monitor import load_results 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | def rolling_window(a, window): 20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 21 | strides = a.strides + (a.strides[-1],) 22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 23 | 24 | def window_func(x, y, window, func): 25 | yw = rolling_window(y, window) 26 | yw_func = func(yw, axis=-1) 27 | return x[window-1:], yw_func 28 | 29 | def ts2xy(ts, xaxis): 30 | if xaxis == X_TIMESTEPS: 31 | x = np.cumsum(ts.l.values) 32 | y = ts.r.values 33 | elif xaxis == X_EPISODES: 34 | x = np.arange(len(ts)) 35 | y = ts.r.values 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 
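# the bench monitor records elapsed wall-clock time in seconds; dividing by 3600 plots the curves against hours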
38 | y = ts.r.values
39 | else:
40 | raise NotImplementedError
41 | return x, y
42 | 
43 | def plot_curves(xy_list, xaxis, title):
44 | plt.figure(figsize=(8,2))
45 | maxx = max(xy[0][-1] for xy in xy_list)
46 | minx = 0
47 | for (i, (x, y)) in enumerate(xy_list):
48 | color = COLORS[i]
49 | plt.scatter(x, y, s=2)
50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # returns the average of the last EPISODES_WINDOW episodes
51 | plt.plot(x, y_mean, color=color)
52 | plt.xlim(minx, maxx)
53 | plt.title(title)
54 | plt.xlabel(xaxis)
55 | plt.ylabel("Episode Rewards")
56 | plt.tight_layout()
57 | 
58 | def plot_results(dirs, num_timesteps, xaxis, task_name):
59 | tslist = []
60 | for dir in dirs:
61 | ts = load_results(dir)
62 | ts = ts[ts.l.cumsum() <= num_timesteps]
63 | tslist.append(ts)
64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist]
65 | plot_curves(xy_list, xaxis, task_name)
66 | 
67 | # Example usage in jupyter-notebook
68 | # from baselines import log_viewer
69 | # %matplotlib inline
70 | # log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout")
71 | # Here ./log is a directory containing the monitor.csv files
72 | 
73 | def main():
74 | import argparse
75 | import os
76 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
77 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
78 | parser.add_argument('--num_timesteps', type=int, default=int(10e6))
79 | parser.add_argument('--xaxis', help = 'Variable on X-axis', default = X_TIMESTEPS)
80 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
81 | args = parser.parse_args()
82 | args.dirs = [os.path.abspath(dir) for dir in args.dirs]
83 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name)
84 | plt.show()
85 | 
86 | if __name__ == '__main__':
87 | main()
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import sys
3 | 
4 | if sys.version_info.major != 3:
5 | print('This Python is only compatible with Python 3, but you are running '
6 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major))
7 | 
8 | 
9 | setup(name='baselines',
10 | packages=[package for package in find_packages()
11 | if package.startswith('baselines')],
12 | install_requires=[
13 | 'gym[mujoco,atari,classic_control,robotics]',
14 | 'scipy',
15 | 'tqdm',
16 | 'joblib',
17 | 'zmq',
18 | 'dill',
19 | 'progressbar2',
20 | 'mpi4py',
21 | 'cloudpickle',
22 | 'tensorflow>=1.4.0',
23 | 'click',
24 | 'opencv-python'
25 | ],
26 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
27 | author='OpenAI',
28 | url='https://github.com/openai/baselines',
29 | author_email='gym@openai.com',
30 | version='0.1.5')
31 | 
--------------------------------------------------------------------------------