├── baselines
│   ├── __init__.py
│   ├── a2c
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── run_atari.py
│   │   ├── policies.py
│   │   ├── a2c.py
│   │   └── utils.py
│   ├── acktr
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── running_stat.py
│   │   ├── run_mujoco.py
│   │   ├── run_atari.py
│   │   ├── value_functions.py
│   │   ├── filters.py
│   │   ├── policies.py
│   │   ├── kfac_utils.py
│   │   ├── acktr_cont.py
│   │   └── utils.py
│   ├── ddpg
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── util.py
│   │   ├── noise.py
│   │   ├── models.py
│   │   ├── memory.py
│   │   ├── main.py
│   │   └── training.py
│   ├── ppo1
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── run_mujoco.py
│   │   ├── run_atari.py
│   │   ├── cnn_policy.py
│   │   └── mlp_policy.py
│   ├── trpo_mpi
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── run_mujoco.py
│   │   ├── run_atari.py
│   │   └── nosharing_cnn_policy.py
│   ├── deepq
│   │   ├── experiments
│   │   │   ├── __init__.py
│   │   │   ├── atari
│   │   │   │   ├── __init__.py
│   │   │   │   ├── download_model.py
│   │   │   │   ├── enjoy.py
│   │   │   │   ├── model.py
│   │   │   │   └── wang2015_eval.py
│   │   │   ├── enjoy_cartpole.py
│   │   │   ├── enjoy_mountaincar.py
│   │   │   ├── enjoy_pong.py
│   │   │   ├── train_mountaincar.py
│   │   │   ├── train_cartpole.py
│   │   │   ├── train_pong.py
│   │   │   └── custom_cartpole.py
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── models.py
│   │   └── replay_buffer.py
│   ├── bench
│   │   ├── __init__.py
│   │   ├── monitor.py
│   │   └── benchmarks.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── vec_env
│   │   │   ├── __init__.py
│   │   │   └── subproc_vec_env.py
│   │   ├── mpi_fork.py
│   │   ├── tests
│   │   │   ├── test_schedules.py
│   │   │   ├── test_tf_util.py
│   │   │   └── test_segment_tree.py
│   │   ├── cg.py
│   │   ├── console_util.py
│   │   ├── mpi_moments.py
│   │   ├── dataset.py
│   │   ├── math_util.py
│   │   ├── mpi_adam.py
│   │   ├── mpi_running_mean_std.py
│   │   ├── schedules.py
│   │   ├── segment_tree.py
│   │   ├── azure_utils.py
│   │   ├── atari_wrappers.py
│   │   └── atari_wrappers_deprecated.py
│   └── logger.py
├── data
│   ├── logo.jpg
│   └── cartpole.gif
├── .gitignore
├── README.md
├── setup.py
└── LICENSE
/baselines/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/a2c/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/acktr/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/ddpg/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/ppo1/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/trpo_mpi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/atari/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/baselines/master/data/logo.jpg
--------------------------------------------------------------------------------
/data/cartpole.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/baselines/master/data/cartpole.gif
--------------------------------------------------------------------------------
/baselines/bench/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.bench.benchmarks import *
2 | from baselines.bench.monitor import *
3 |
4 |
--------------------------------------------------------------------------------
/baselines/common/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.common.console_util import *
2 | from baselines.common.dataset import Dataset
3 | from baselines.common.math_util import *
4 | from baselines.common.misc_util import *
5 |
--------------------------------------------------------------------------------
/baselines/a2c/README.md:
--------------------------------------------------------------------------------
1 | # A2C
2 |
3 | - Original paper: https://arxiv.org/abs/1602.01783
4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5 | - `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
--------------------------------------------------------------------------------
/baselines/deepq/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.deepq import models # noqa
2 | from baselines.deepq.build_graph import build_act, build_train # noqa
3 |
4 | from baselines.deepq.simple import learn, load # noqa
5 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa
6 |
--------------------------------------------------------------------------------
/baselines/acktr/README.md:
--------------------------------------------------------------------------------
1 | # ACKTR
2 |
3 | - Original paper: https://arxiv.org/abs/1708.05144
4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5 | - `python -m baselines.acktr.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
--------------------------------------------------------------------------------
/baselines/ddpg/README.md:
--------------------------------------------------------------------------------
1 | # DDPG
2 |
3 | - Original paper: https://arxiv.org/abs/1509.02971
4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/
5 | - `python -m baselines.ddpg.main` runs the algorithm for 1M timesteps on a Mujoco environment. See help (`-h`) for more options.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.pyc
3 | *.py~
4 | .DS_Store
5 | .idea
6 |
7 | # Setuptools distribution and build folders.
8 | /dist/
9 | /build
10 | keys/
11 |
12 | # Virtualenv
13 | /env
14 |
15 |
16 | *.sublime-project
17 | *.sublime-workspace
18 |
19 | .idea
20 |
21 | logs/
22 |
23 | .ipynb_checkpoints
24 | ghostdriver.log
25 |
26 | htmlcov
27 |
28 | junk
29 | src
30 |
31 | *.egg-info
32 | .cache
33 |
--------------------------------------------------------------------------------
/baselines/ppo1/README.md:
--------------------------------------------------------------------------------
1 | # PPOSGD
2 |
3 | - Original paper: https://arxiv.org/abs/1707.06347
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment.
7 |
8 |
--------------------------------------------------------------------------------
/baselines/trpo_mpi/README.md:
--------------------------------------------------------------------------------
1 | # trpo_mpi
2 |
3 | - Original paper: https://arxiv.org/abs/1502.05477
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 16 python -m baselines.trpo_mpi.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
6 | - `python -m baselines.trpo_mpi.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment.
7 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/enjoy_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("CartPole-v0")
8 | act = deepq.load("cartpole_model.pkl")
9 |
10 | while True:
11 | obs, done = env.reset(), False
12 | episode_rew = 0
13 | while not done:
14 | env.render()
15 | obs, rew, done, _ = env.step(act(obs[None])[0])
16 | episode_rew += rew
17 | print("Episode reward", episode_rew)
18 |
19 |
20 | if __name__ == '__main__':
21 | main()
22 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/enjoy_mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("MountainCar-v0")
8 | act = deepq.load("mountaincar_model.pkl")
9 |
10 | while True:
11 | obs, done = env.reset(), False
12 | episode_rew = 0
13 | while not done:
14 | env.render()
15 | obs, rew, done, _ = env.step(act(obs[None])[0])
16 | episode_rew += rew
17 | print("Episode reward", episode_rew)
18 |
19 |
20 | if __name__ == '__main__':
21 | main()
22 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/__init__.py:
--------------------------------------------------------------------------------
1 | class VecEnv(object):
2 | """
3 | Vectorized environment base class
4 | """
5 | def step(self, vac):
6 | """
7 | Apply sequence of actions to sequence of environments
8 | actions -> (observations, rewards, news)
9 |
10 |         where 'news' is a boolean vector indicating whether each environment just finished an episode (so its next observation starts a new one).
11 | """
12 | raise NotImplementedError
13 | def reset(self):
14 | """
15 | Reset all environments
16 | """
17 | raise NotImplementedError
18 | def close(self):
19 | pass
--------------------------------------------------------------------------------
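Editorial note, not part of the repository: a minimal sketch of a serial implementation of the VecEnv interface above, stepping a list of gym environments in a plain Python loop. The class name `SimpleVecEnv` is hypothetical; the repo's own multiprocess implementation lives in subproc_vec_env.py.

```python
import numpy as np

from baselines.common.vec_env import VecEnv


class SimpleVecEnv(VecEnv):
    """Hypothetical serial VecEnv: steps each wrapped environment in-process."""

    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]

    def step(self, vac):
        obs, rews, news = [], [], []
        for env, action in zip(self.envs, vac):
            ob, rew, done, _ = env.step(action)
            if done:
                ob = env.reset()  # start a new episode so the batch stays in lockstep
            obs.append(ob)
            rews.append(rew)
            news.append(done)
        return np.array(obs), np.array(rews), np.array(news)

    def reset(self):
        return np.array([env.reset() for env in self.envs])

    def close(self):
        for env in self.envs:
            env.close()
```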
/baselines/deepq/experiments/enjoy_pong.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
5 |
6 |
7 | def main():
8 | env = gym.make("PongNoFrameskip-v4")
9 | env = ScaledFloatFrame(wrap_dqn(env))
10 | act = deepq.load("pong_model.pkl")
11 |
12 | while True:
13 | obs, done = env.reset(), False
14 | episode_rew = 0
15 | while not done:
16 | env.render()
17 | obs, rew, done, _ = env.step(act(obs[None])[0])
18 | episode_rew += rew
19 | print("Episode reward", episode_rew)
20 |
21 |
22 | if __name__ == '__main__':
23 | main()
24 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/train_mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("MountainCar-v0")
8 |     # Enabling layer_norm here is important for parameter space noise!
9 | model = deepq.models.mlp([64], layer_norm=True)
10 | act = deepq.learn(
11 | env,
12 | q_func=model,
13 | lr=1e-3,
14 | max_timesteps=100000,
15 | buffer_size=50000,
16 | exploration_fraction=0.1,
17 | exploration_final_eps=0.1,
18 | print_freq=10,
19 | param_noise=True
20 | )
21 | print("Saving model to mountaincar_model.pkl")
22 | act.save("mountaincar_model.pkl")
23 |
24 |
25 | if __name__ == '__main__':
26 | main()
27 |
--------------------------------------------------------------------------------
/baselines/common/mpi_fork.py:
--------------------------------------------------------------------------------
1 | import os, subprocess, sys
2 |
3 | def mpi_fork(n, bind_to_core=False):
4 | """Re-launches the current script with workers
5 | Returns "parent" for original parent, "child" for MPI children
6 | """
7 | if n<=1:
8 | return "child"
9 | if os.getenv("IN_MPI") is None:
10 | env = os.environ.copy()
11 | env.update(
12 | MKL_NUM_THREADS="1",
13 | OMP_NUM_THREADS="1",
14 | IN_MPI="1"
15 | )
16 | args = ["mpirun", "-np", str(n)]
17 | if bind_to_core:
18 | args += ["-bind-to", "core"]
19 | args += [sys.executable] + sys.argv
20 | subprocess.check_call(args, env=env)
21 | return "parent"
22 | else:
23 | return "child"
24 |
--------------------------------------------------------------------------------
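Editorial note: a usage sketch, assuming the calling convention the run scripts follow. The entry point calls mpi_fork once at startup; the original process re-launches the script under `mpirun` and exits, while the spawned workers fall through and continue with training.

```python
import sys

from baselines.common.mpi_fork import mpi_fork


def main(num_cpu=4):
    whoami = mpi_fork(num_cpu)  # re-launches this script as `mpirun -np 4 python ...`
    if whoami == "parent":      # the launching process only spawns workers, then exits
        sys.exit(0)
    # from here on we are one of the MPI workers; build the env and call learn(...)


if __name__ == '__main__':
    main()
```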
/baselines/deepq/experiments/train_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def callback(lcl, glb):
7 |     # stop training once the mean reward over the last 100 episodes reaches 199
8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
9 | return is_solved
10 |
11 |
12 | def main():
13 | env = gym.make("CartPole-v0")
14 | model = deepq.models.mlp([64])
15 | act = deepq.learn(
16 | env,
17 | q_func=model,
18 | lr=1e-3,
19 | max_timesteps=100000,
20 | buffer_size=50000,
21 | exploration_fraction=0.1,
22 | exploration_final_eps=0.02,
23 | print_freq=10,
24 | callback=callback
25 | )
26 | print("Saving model to cartpole_model.pkl")
27 | act.save("cartpole_model.pkl")
28 |
29 |
30 | if __name__ == '__main__':
31 | main()
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Baselines
4 |
5 | OpenAI Baselines is a set of high-quality implementations of reinforcement learning algorithms.
6 |
7 | These algorithms will make it easier for the research community to replicate, refine, and identify new ideas, and will create good baselines to build research on top of. Our DQN implementation and its variants are roughly on par with the scores in published papers. We expect they will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones.
8 |
9 | You can install it by typing:
10 |
11 | ```bash
12 | git clone https://github.com/openai/baselines.git
13 | cd baselines
14 | pip install -e .
15 | ```
16 |
17 | - [A2C](baselines/a2c)
18 | - [ACKTR](baselines/acktr)
19 | - [DDPG](baselines/ddpg)
20 | - [DQN](baselines/deepq)
21 | - [PPO](baselines/ppo1)
22 | - [TRPO](baselines/trpo_mpi)
23 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_schedules.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule
4 |
5 |
6 | def test_piecewise_schedule():
7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500)
8 |
9 | assert np.isclose(ps.value(-10), 500)
10 | assert np.isclose(ps.value(0), 150)
11 | assert np.isclose(ps.value(5), 200)
12 | assert np.isclose(ps.value(9), 80)
13 | assert np.isclose(ps.value(50), 50)
14 | assert np.isclose(ps.value(80), 50)
15 | assert np.isclose(ps.value(150), 0)
16 | assert np.isclose(ps.value(175), -25)
17 | assert np.isclose(ps.value(201), 500)
18 | assert np.isclose(ps.value(500), 500)
19 |
20 | assert np.isclose(ps.value(200 - 1e-10), -50)
21 |
22 |
23 | def test_constant_schedule():
24 | cs = ConstantSchedule(5)
25 | for i in range(-100, 100):
26 | assert np.isclose(cs.value(i), 5)
27 |
--------------------------------------------------------------------------------
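Editorial note: a usage sketch built only on the schedule classes exercised by the test above. A schedule maps a timestep to a value, which is how deepq-style training loops typically anneal exploration epsilon.

```python
from baselines.common.schedules import PiecewiseSchedule

# anneal exploration epsilon from 1.0 to 0.02 over the first 100k steps, then hold it
exploration = PiecewiseSchedule([(0, 1.0), (100000, 0.02)], outside_value=0.02)

for t in range(0, 200001, 50000):
    eps = exploration.value(t)  # linear interpolation between the surrounding endpoints
    print(t, round(eps, 3))     # prints 1.0, 0.51, 0.02, 0.02, 0.02
```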
/baselines/deepq/experiments/train_pong.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
5 |
6 |
7 | def main():
8 | env = gym.make("PongNoFrameskip-v4")
9 | env = ScaledFloatFrame(wrap_dqn(env))
10 | model = deepq.models.cnn_to_mlp(
11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
12 | hiddens=[256],
13 | dueling=True
14 | )
15 | act = deepq.learn(
16 | env,
17 | q_func=model,
18 | lr=1e-4,
19 | max_timesteps=2000000,
20 | buffer_size=10000,
21 | exploration_fraction=0.1,
22 | exploration_final_eps=0.01,
23 | train_freq=4,
24 | learning_starts=10000,
25 | target_network_update_freq=1000,
26 | gamma=0.99,
27 | prioritized_replay=True
28 | )
29 | act.save("pong_model.pkl")
30 | env.close()
31 |
32 |
33 | if __name__ == '__main__':
34 | main()
35 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import sys
3 |
4 | if sys.version_info.major != 3:
5 | print("This Python is only compatible with Python 3, but you are running "
6 | "Python {}. The installation will likely fail.".format(sys.version_info.major))
7 |
8 |
9 | setup(name='baselines',
10 | packages=[package for package in find_packages()
11 | if package.startswith('baselines')],
12 | install_requires=[
13 | 'gym[mujoco,atari,classic_control]',
14 | 'scipy',
15 | 'tqdm',
16 | 'joblib',
17 | 'zmq',
18 | 'dill',
19 | 'tensorflow >= 1.0.0',
20 | 'azure==1.0.3',
21 | 'progressbar2',
22 | 'mpi4py',
23 | ],
24 | description="OpenAI baselines: high quality implementations of reinforcement learning algorithms",
25 | author="OpenAI",
26 | url='https://github.com/openai/baselines',
27 | author_email="gym@openai.com",
28 | version="0.1.4")
29 |
--------------------------------------------------------------------------------
/baselines/common/cg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
3 | """
4 | Demmel p 312
5 | """
6 | p = b.copy()
7 | r = b.copy()
8 | x = np.zeros_like(b)
9 | rdotr = r.dot(r)
10 |
11 | fmtstr = "%10i %10.3g %10.3g"
12 | titlestr = "%10s %10s %10s"
13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm"))
14 |
15 | for i in range(cg_iters):
16 | if callback is not None:
17 | callback(x)
18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x)))
19 | z = f_Ax(p)
20 | v = rdotr / p.dot(z)
21 | x += v*p
22 | r -= v*z
23 | newrdotr = r.dot(r)
24 | mu = newrdotr/rdotr
25 | p = r + mu*p
26 |
27 | rdotr = newrdotr
28 | if rdotr < residual_tol:
29 | break
30 |
31 | if callback is not None:
32 | callback(x)
33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
34 | return x
--------------------------------------------------------------------------------
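Editorial note, not in the repo: a worked example of the conjugate gradient routine above. cg only needs a function that computes matrix-vector products, so a small symmetric positive-definite system is enough to check it.

```python
import numpy as np

from baselines.common.cg import cg

A = np.array([[4.0, 1.0],
              [1.0, 3.0]])
b = np.array([1.0, 2.0])

x = cg(lambda p: A.dot(p), b, cg_iters=10)
assert np.allclose(A.dot(x), b, atol=1e-6)  # exact solution is roughly [0.0909, 0.6364]
```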
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017 OpenAI (http://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/baselines/ddpg/util.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import gym
4 | import numpy as np
5 | import tensorflow as tf
6 | from mpi4py import MPI
7 | from baselines.common.mpi_moments import mpi_moments
8 |
9 |
10 | def reduce_var(x, axis=None, keepdims=False):
11 | m = tf.reduce_mean(x, axis=axis, keep_dims=True)
12 | devs_squared = tf.square(x - m)
13 | return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
14 |
15 |
16 | def reduce_std(x, axis=None, keepdims=False):
17 | return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
18 |
19 |
20 | def mpi_mean(value):
21 | if value == []:
22 | value = [0.]
23 | if not isinstance(value, list):
24 | value = [value]
25 | return mpi_moments(np.array(value))[0][0]
26 |
27 |
28 | def mpi_std(value):
29 | if value == []:
30 | value = [0.]
31 | if not isinstance(value, list):
32 | value = [value]
33 | return mpi_moments(np.array(value))[1][0]
34 |
35 |
36 | def mpi_max(value):
37 | global_max = np.zeros(1, dtype='float64')
38 | local_max = np.max(value).astype('float64')
39 | MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX)
40 | return global_max[0]
41 |
42 |
43 | def mpi_sum(value):
44 | global_sum = np.zeros(1, dtype='float64')
45 | local_sum = np.sum(np.array(value)).astype('float64')
46 | MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM)
47 | return global_sum[0]
48 |
--------------------------------------------------------------------------------
/baselines/acktr/running_stat.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # http://www.johndcook.com/blog/standard_deviation/
4 | class RunningStat(object):
5 | def __init__(self, shape):
6 | self._n = 0
7 | self._M = np.zeros(shape)
8 | self._S = np.zeros(shape)
9 | def push(self, x):
10 | x = np.asarray(x)
11 | assert x.shape == self._M.shape
12 | self._n += 1
13 | if self._n == 1:
14 | self._M[...] = x
15 | else:
16 | oldM = self._M.copy()
17 | self._M[...] = oldM + (x - oldM)/self._n
18 | self._S[...] = self._S + (x - oldM)*(x - self._M)
19 | @property
20 | def n(self):
21 | return self._n
22 | @property
23 | def mean(self):
24 | return self._M
25 | @property
26 | def var(self):
27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M)
28 | @property
29 | def std(self):
30 | return np.sqrt(self.var)
31 | @property
32 | def shape(self):
33 | return self._M.shape
34 |
35 | def test_running_stat():
36 | for shp in ((), (3,), (3,4)):
37 | li = []
38 | rs = RunningStat(shp)
39 | for _ in range(5):
40 | val = np.random.randn(*shp)
41 | rs.push(val)
42 | li.append(val)
43 | m = np.mean(li, axis=0)
44 | assert np.allclose(rs.mean, m)
45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0)
46 | assert np.allclose(rs.var, v)
47 |
--------------------------------------------------------------------------------
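Editorial note, not in the repo: a usage sketch of RunningStat. It keeps incremental mean and variance estimates (Welford's method), which is the usual building block for normalizing observations during training.

```python
import numpy as np

from baselines.acktr.running_stat import RunningStat

rs = RunningStat((3,))                    # track per-dimension statistics of a 3-d observation
for _ in range(1000):
    obs = np.random.randn(3) * 5.0 + 2.0  # stand-in for environment observations
    rs.push(obs)
    normalized = (obs - rs.mean) / (rs.std + 1e-8)
```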
/baselines/common/console_util.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from contextlib import contextmanager
3 | import numpy as np
4 | import time
5 |
6 | # ================================================================
7 | # Misc
8 | # ================================================================
9 |
10 | def fmt_row(width, row, header=False):
11 | out = " | ".join(fmt_item(x, width) for x in row)
12 | if header: out = out + "\n" + "-"*len(out)
13 | return out
14 |
15 | def fmt_item(x, l):
16 | if isinstance(x, np.ndarray):
17 | assert x.ndim==0
18 | x = x.item()
19 | if isinstance(x, float): rep = "%g"%x
20 | else: rep = str(x)
21 | return " "*(l - len(rep)) + rep
22 |
23 | color2num = dict(
24 | gray=30,
25 | red=31,
26 | green=32,
27 | yellow=33,
28 | blue=34,
29 | magenta=35,
30 | cyan=36,
31 | white=37,
32 | crimson=38
33 | )
34 |
35 | def colorize(string, color, bold=False, highlight=False):
36 | attr = []
37 | num = color2num[color]
38 | if highlight: num += 10
39 | attr.append(str(num))
40 | if bold: attr.append('1')
41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
42 |
43 |
44 | MESSAGE_DEPTH = 0
45 |
46 | @contextmanager
47 | def timed(msg):
48 | global MESSAGE_DEPTH #pylint: disable=W0603
49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
50 | tstart = time.time()
51 | MESSAGE_DEPTH += 1
52 | yield
53 | MESSAGE_DEPTH -= 1
54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta'))
55 |
--------------------------------------------------------------------------------
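Editorial note, not in the repo: a short sketch showing how the helpers above are typically used, one call to fmt_row for tabular logging and the timed context manager around a block of work.

```python
import numpy as np

from baselines.common.console_util import fmt_row, timed

# print a fixed-width table header followed by a row of values
print(fmt_row(10, ["iter", "loss", "kl"], header=True))
print(fmt_row(10, [1, 0.532, 0.0021]))

# time a block of work; prints start and end messages in magenta
with timed("matrix multiply"):
    a = np.random.randn(500, 500)
    _ = a @ a
```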
/baselines/ppo1/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from baselines.common import set_global_seeds, tf_util as U
3 | from baselines import bench
4 | import os.path as osp
5 | import gym, logging
6 | from baselines import logger
7 | import sys
8 |
9 | def train(env_id, num_timesteps, seed):
10 | from baselines.ppo1 import mlp_policy, pposgd_simple
11 | U.make_session(num_cpu=1).__enter__()
12 | set_global_seeds(seed)
13 | env = gym.make(env_id)
14 | def policy_fn(name, ob_space, ac_space):
15 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
16 | hid_size=64, num_hid_layers=2)
17 | env = bench.Monitor(env, logger.get_dir() and
18 | osp.join(logger.get_dir(), "monitor.json"))
19 | env.seed(seed)
20 | gym.logger.setLevel(logging.WARN)
21 | pposgd_simple.learn(env, policy_fn,
22 | max_timesteps=num_timesteps,
23 | timesteps_per_batch=2048,
24 | clip_param=0.2, entcoeff=0.0,
25 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
26 | gamma=0.99, lam=0.95, schedule='linear',
27 | )
28 | env.close()
29 |
30 | def main():
31 | import argparse
32 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
33 | parser.add_argument('--env', help='environment ID', default='Hopper-v1')
34 | parser.add_argument('--seed', help='RNG seed', type=int, default=0)
35 | args = parser.parse_args()
36 | train(args.env, num_timesteps=1e6, seed=args.seed)
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/baselines/acktr/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import logging
4 | import os
5 | import tensorflow as tf
6 | import gym
7 | from baselines import logger
8 | from baselines.common import set_global_seeds
9 | from baselines import bench
10 | from baselines.acktr.acktr_cont import learn
11 | from baselines.acktr.policies import GaussianMlpPolicy
12 | from baselines.acktr.value_functions import NeuralNetValueFunction
13 |
14 | def train(env_id, num_timesteps, seed):
15 | env=gym.make(env_id)
16 | if logger.get_dir():
17 | env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
18 | set_global_seeds(seed)
19 | env.seed(seed)
20 | gym.logger.setLevel(logging.WARN)
21 |
22 | with tf.Session(config=tf.ConfigProto()) as session:
23 | ob_dim = env.observation_space.shape[0]
24 | ac_dim = env.action_space.shape[0]
25 | with tf.variable_scope("vf"):
26 | vf = NeuralNetValueFunction(ob_dim, ac_dim)
27 | with tf.variable_scope("pi"):
28 | policy = GaussianMlpPolicy(ob_dim, ac_dim)
29 |
30 | learn(env, policy=policy, vf=vf,
31 | gamma=0.99, lam=0.97, timesteps_per_batch=2500,
32 | desired_kl=0.002,
33 | num_timesteps=num_timesteps, animate=False)
34 |
35 | env.close()
36 |
37 | if __name__ == "__main__":
38 | parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
39 | parser.add_argument('--seed', help='RNG seed', type=int, default=0)
40 | parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
41 | args = parser.parse_args()
42 | train(args.env, num_timesteps=1e6, seed=args.seed)
43 |
--------------------------------------------------------------------------------
/baselines/common/mpi_moments.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import numpy as np
3 | from baselines.common import zipsame
4 |
5 | def mpi_moments(x, axis=0):
6 | x = np.asarray(x, dtype='float64')
7 | newshape = list(x.shape)
8 | newshape.pop(axis)
9 | n = np.prod(newshape,dtype=int)
10 | totalvec = np.zeros(n*2+1, 'float64')
11 | addvec = np.concatenate([x.sum(axis=axis).ravel(),
12 | np.square(x).sum(axis=axis).ravel(),
13 | np.array([x.shape[axis]],dtype='float64')])
14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
15 | sum = totalvec[:n]
16 | sumsq = totalvec[n:2*n]
17 | count = totalvec[2*n]
18 | if count == 0:
19 | mean = np.empty(newshape); mean[:] = np.nan
20 | std = np.empty(newshape); std[:] = np.nan
21 | else:
22 | mean = sum/count
23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0))
24 | return mean, std, count
25 |
26 |
27 | def test_runningmeanstd():
28 | comm = MPI.COMM_WORLD
29 | np.random.seed(0)
30 | for (triple,axis) in [
31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0),
32 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0),
33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1),
34 | ]:
35 |
36 |
37 | x = np.concatenate(triple, axis=axis)
38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
39 |
40 |
41 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis)
42 |
43 | for (a1,a2) in zipsame(ms1, ms2):
44 | print(a1, a2)
45 | assert np.allclose(a1, a2)
46 | print("ok!")
47 |
48 | if __name__ == "__main__":
49 | #mpirun -np 3 python