├── baselines ├── __init__.py ├── a2c │ ├── __init__.py │ ├── README.md │ ├── run_atari.py │ ├── policies.py │ ├── a2c.py │ └── utils.py ├── acktr │ ├── __init__.py │ ├── README.md │ ├── running_stat.py │ ├── run_mujoco.py │ ├── run_atari.py │ ├── value_functions.py │ ├── filters.py │ ├── policies.py │ ├── kfac_utils.py │ ├── acktr_cont.py │ └── utils.py ├── ddpg │ ├── __init__.py │ ├── README.md │ ├── util.py │ ├── noise.py │ ├── models.py │ ├── memory.py │ ├── main.py │ └── training.py ├── ppo1 │ ├── __init__.py │ ├── README.md │ ├── run_mujoco.py │ ├── run_atari.py │ ├── cnn_policy.py │ └── mlp_policy.py ├── trpo_mpi │ ├── __init__.py │ ├── README.md │ ├── run_mujoco.py │ ├── run_atari.py │ └── nosharing_cnn_policy.py ├── deepq │ ├── experiments │ │ ├── __init__.py │ │ ├── atari │ │ │ ├── __init__.py │ │ │ ├── download_model.py │ │ │ ├── enjoy.py │ │ │ ├── model.py │ │ │ └── wang2015_eval.py │ │ ├── enjoy_cartpole.py │ │ ├── enjoy_mountaincar.py │ │ ├── enjoy_pong.py │ │ ├── train_mountaincar.py │ │ ├── train_cartpole.py │ │ ├── train_pong.py │ │ └── custom_cartpole.py │ ├── __init__.py │ ├── README.md │ ├── models.py │ └── replay_buffer.py ├── bench │ ├── __init__.py │ ├── monitor.py │ └── benchmarks.py ├── common │ ├── __init__.py │ ├── vec_env │ │ ├── __init__.py │ │ └── subproc_vec_env.py │ ├── mpi_fork.py │ ├── tests │ │ ├── test_schedules.py │ │ ├── test_tf_util.py │ │ └── test_segment_tree.py │ ├── cg.py │ ├── console_util.py │ ├── mpi_moments.py │ ├── dataset.py │ ├── math_util.py │ ├── mpi_adam.py │ ├── mpi_running_mean_std.py │ ├── schedules.py │ ├── segment_tree.py │ ├── azure_utils.py │ ├── atari_wrappers.py │ └── atari_wrappers_deprecated.py └── logger.py ├── data ├── logo.jpg └── cartpole.gif ├── .gitignore ├── README.md ├── setup.py └── LICENSE /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acktr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/logo.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mimoralea/baselines/master/data/logo.jpg -------------------------------------------------------------------------------- /data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/baselines/master/data/cartpole.gif -------------------------------------------------------------------------------- /baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * 3 | 4 | -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.common.console_util import * 2 | from baselines.common.dataset import Dataset 3 | from baselines.common.math_util import * 4 | from baselines.common.misc_util import * 5 | -------------------------------------------------------------------------------- /baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -------------------------------------------------------------------------------- /baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | 4 | from baselines.deepq.simple import learn, load # noqa 5 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 6 | -------------------------------------------------------------------------------- /baselines/acktr/README.md: -------------------------------------------------------------------------------- 1 | # ACKTR 2 | 3 | - Original paper: https://arxiv.org/abs/1708.05144 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.acktr.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -------------------------------------------------------------------------------- /baselines/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | 3 | - Original paper: https://arxiv.org/abs/1509.02971 4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 5 | - `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.py~ 4 | .DS_Store 5 | .idea 6 | 7 | # Setuptools distribution and build folders. 
8 | /dist/ 9 | /build 10 | keys/ 11 | 12 | # Virtualenv 13 | /env 14 | 15 | 16 | *.sublime-project 17 | *.sublime-workspace 18 | 19 | .idea 20 | 21 | logs/ 22 | 23 | .ipynb_checkpoints 24 | ghostdriver.log 25 | 26 | htmlcov 27 | 28 | junk 29 | src 30 | 31 | *.egg-info 32 | .cache 33 | -------------------------------------------------------------------------------- /baselines/ppo1/README.md: -------------------------------------------------------------------------------- 1 | # PPOSGD 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 7 | 8 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/README.md: -------------------------------------------------------------------------------- 1 | # trpo_mpi 2 | 3 | - Original paper: https://arxiv.org/abs/1502.05477 4 | - Baselines blog post https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 16 python -m baselines.trpo_mpi.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.trpo_mpi.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment. 7 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.load("cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | act = deepq.load("mountaincar_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | class VecEnv(object): 2 | """ 3 | Vectorized environment base class 4 | """ 5 | def step(self, vac): 6 | """ 7 | Apply sequence of actions to sequence of environments 8 | actions -> (observations, rewards, news) 9 | 10 | where 'news' is a boolean vector indicating whether each element is new. 
11 | """ 12 | raise NotImplementedError 13 | def reset(self): 14 | """ 15 | Reset all environments 16 | """ 17 | raise NotImplementedError 18 | def close(self): 19 | pass -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | act = deepq.load("pong_model.pkl") 11 | 12 | while True: 13 | obs, done = env.reset(), False 14 | episode_rew = 0 15 | while not done: 16 | env.render() 17 | obs, rew, done, _ = env.step(act(obs[None])[0]) 18 | episode_rew += rew 19 | print("Episode reward", episode_rew) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | # Enabling layer_norm here is import for parameter space noise! 9 | model = deepq.models.mlp([64], layer_norm=True) 10 | act = deepq.learn( 11 | env, 12 | q_func=model, 13 | lr=1e-3, 14 | max_timesteps=100000, 15 | buffer_size=50000, 16 | exploration_fraction=0.1, 17 | exploration_final_eps=0.1, 18 | print_freq=10, 19 | param_noise=True 20 | ) 21 | print("Saving model to mountaincar_model.pkl") 22 | act.save("mountaincar_model.pkl") 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, glb): 7 | # stop training if reward exceeds 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | model = deepq.models.mlp([64]) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-3, 19 | max_timesteps=100000, 20 | buffer_size=50000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.02, 23 | print_freq=10, 24 | callback=callback 25 | ) 26 | print("Saving model to cartpole_model.pkl") 27 | act.save("cartpole_model.pkl") 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | 
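
The `callback` hook in `train_cartpole.py` above receives the training loop's locals and globals (`lcl`, `glb`) on each step, and returning `True` stops training — that is how the script halts once CartPole is considered solved. Below is a minimal sketch (not a file from the repo) of a variant callback; it assumes only the same `lcl['t']` and `lcl['episode_rewards']` entries the script already relies on, and the one-hour wall-clock cap is an illustrative choice:

```python
import time

TRAIN_START = time.time()

def callback(lcl, glb):
    # Solved: mean reward over the last 100 completed episodes is at least 199.
    rewards = lcl['episode_rewards']
    solved = lcl['t'] > 100 and sum(rewards[-101:-1]) / 100 >= 199
    # Safety net: also stop after one hour of wall-clock training time.
    out_of_time = time.time() - TRAIN_START > 3600
    return solved or out_of_time
```
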
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Baselines 4 | 5 | OpenAI Baselines is a set of high-quality implementations of reinforcement learning algorithms. 6 | 7 | These algorithms will make it easier for the research community to replicate, refine, and identify new ideas, and will create good baselines to build research on top of. Our DQN implementation and its variants are roughly on par with the scores in published papers. We expect they will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones. 8 | 9 | You can install it by typing: 10 | 11 | ```bash 12 | git clone https://github.com/openai/baselines.git 13 | cd baselines 14 | pip install -e . 15 | ``` 16 | 17 | - [A2C](baselines/a2c) 18 | - [ACKTR](baselines/acktr) 19 | - [DDPG](baselines/ddpg) 20 | - [DQN](baselines/deepq) 21 | - [PPO](baselines/ppo1) 22 | - [TRPO](baselines/trpo_mpi) 23 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | model = deepq.models.cnn_to_mlp( 11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 12 | hiddens=[256], 13 | dueling=True 14 | ) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-4, 19 | max_timesteps=2000000, 20 | buffer_size=10000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.01, 23 | train_freq=4, 24 | learning_starts=10000, 25 | target_network_update_freq=1000, 26 | gamma=0.99, 27 | prioritized_replay=True 28 | ) 29 | act.save("pong_model.pkl") 30 | env.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print("This Python is only compatible with Python 3, but you are running " 6 | "Python {}. 
The installation will likely fail.".format(sys.version_info.major)) 7 | 8 | 9 | setup(name='baselines', 10 | packages=[package for package in find_packages() 11 | if package.startswith('baselines')], 12 | install_requires=[ 13 | 'gym[mujoco,atari,classic_control]', 14 | 'scipy', 15 | 'tqdm', 16 | 'joblib', 17 | 'zmq', 18 | 'dill', 19 | 'tensorflow >= 1.0.0', 20 | 'azure==1.0.3', 21 | 'progressbar2', 22 | 'mpi4py', 23 | ], 24 | description="OpenAI baselines: high quality implementations of reinforcement learning algorithms", 25 | author="OpenAI", 26 | url='https://github.com/openai/baselines', 27 | author_email="gym@openai.com", 28 | version="0.1.4") 29 | -------------------------------------------------------------------------------- /baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /baselines/ddpg/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import gym 4 | import numpy as np 5 | import tensorflow as tf 6 | from mpi4py import MPI 7 | from baselines.common.mpi_moments import mpi_moments 8 | 9 | 10 | def reduce_var(x, axis=None, keepdims=False): 11 | m = tf.reduce_mean(x, axis=axis, keep_dims=True) 12 | devs_squared = tf.square(x - m) 13 | return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) 14 | 15 | 16 | def reduce_std(x, axis=None, keepdims=False): 17 | return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) 18 | 19 | 20 | def mpi_mean(value): 21 | if value == []: 22 | value = [0.] 23 | if not isinstance(value, list): 24 | value = [value] 25 | return mpi_moments(np.array(value))[0][0] 26 | 27 | 28 | def mpi_std(value): 29 | if value == []: 30 | value = [0.] 31 | if not isinstance(value, list): 32 | value = [value] 33 | return mpi_moments(np.array(value))[1][0] 34 | 35 | 36 | def mpi_max(value): 37 | global_max = np.zeros(1, dtype='float64') 38 | local_max = np.max(value).astype('float64') 39 | MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX) 40 | return global_max[0] 41 | 42 | 43 | def mpi_sum(value): 44 | global_sum = np.zeros(1, dtype='float64') 45 | local_sum = np.sum(np.array(value)).astype('float64') 46 | MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM) 47 | return global_sum[0] 48 | -------------------------------------------------------------------------------- /baselines/acktr/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] 
= self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = "%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from baselines.common import set_global_seeds, tf_util as U 3 | from baselines import bench 4 | import os.path as osp 5 | import gym, logging 6 | from baselines import logger 7 | import sys 8 | 9 | def train(env_id, num_timesteps, seed): 10 | from baselines.ppo1 import mlp_policy, pposgd_simple 11 | U.make_session(num_cpu=1).__enter__() 12 | set_global_seeds(seed) 13 | env = gym.make(env_id) 14 | def policy_fn(name, ob_space, ac_space): 15 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 16 | hid_size=64, num_hid_layers=2) 17 | env = bench.Monitor(env, logger.get_dir() and 18 | osp.join(logger.get_dir(), "monitor.json")) 19 | env.seed(seed) 20 | gym.logger.setLevel(logging.WARN) 21 | pposgd_simple.learn(env, policy_fn, 22 | max_timesteps=num_timesteps, 23 | timesteps_per_batch=2048, 24 | clip_param=0.2, entcoeff=0.0, 25 | optim_epochs=10, 
optim_stepsize=3e-4, optim_batchsize=64, 26 | gamma=0.99, lam=0.95, schedule='linear', 27 | ) 28 | env.close() 29 | 30 | def main(): 31 | import argparse 32 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 33 | parser.add_argument('--env', help='environment ID', default='Hopper-v1') 34 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 35 | args = parser.parse_args() 36 | train(args.env, num_timesteps=1e6, seed=args.seed) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /baselines/acktr/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import os 5 | import tensorflow as tf 6 | import gym 7 | from baselines import logger 8 | from baselines.common import set_global_seeds 9 | from baselines import bench 10 | from baselines.acktr.acktr_cont import learn 11 | from baselines.acktr.policies import GaussianMlpPolicy 12 | from baselines.acktr.value_functions import NeuralNetValueFunction 13 | 14 | def train(env_id, num_timesteps, seed): 15 | env=gym.make(env_id) 16 | if logger.get_dir(): 17 | env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json")) 18 | set_global_seeds(seed) 19 | env.seed(seed) 20 | gym.logger.setLevel(logging.WARN) 21 | 22 | with tf.Session(config=tf.ConfigProto()) as session: 23 | ob_dim = env.observation_space.shape[0] 24 | ac_dim = env.action_space.shape[0] 25 | with tf.variable_scope("vf"): 26 | vf = NeuralNetValueFunction(ob_dim, ac_dim) 27 | with tf.variable_scope("pi"): 28 | policy = GaussianMlpPolicy(ob_dim, ac_dim) 29 | 30 | learn(env, policy=policy, vf=vf, 31 | gamma=0.99, lam=0.97, timesteps_per_batch=2500, 32 | desired_kl=0.002, 33 | num_timesteps=num_timesteps, animate=False) 34 | 35 | env.close() 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser(description='Run Mujoco benchmark.') 39 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 40 | parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1") 41 | args = parser.parse_args() 42 | train(args.env, num_timesteps=1e6, seed=args.seed) 43 | -------------------------------------------------------------------------------- /baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), 
np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python